Merge pull request #3491 from TiejunChina/master-dev

enable rt for 5.4.x
Rolf Neugebauer 2020-04-09 23:34:21 +01:00 committed by GitHub
commit f98fa5ca41
293 changed files with 36891 additions and 0 deletions

View File

@@ -253,6 +253,7 @@ endef
#
ifeq ($(ARCH),x86_64)
$(eval $(call kernel,5.4.28,5.4.x,$(EXTRA),$(DEBUG)))
$(eval $(call kernel,5.4.28,5.4.x,-rt,))
$(eval $(call kernel,4.19.113,4.19.x,$(EXTRA),$(DEBUG)))
$(eval $(call kernel,4.19.113,4.19.x,,-dbg))
$(eval $(call kernel,4.19.106,4.19.x,-rt,))
@@ -260,6 +261,7 @@ $(eval $(call kernel,4.14.174,4.14.x,$(EXTRA),$(DEBUG)))
else ifeq ($(ARCH),aarch64)
$(eval $(call kernel,5.4.28,5.4.x,$(EXTRA),$(DEBUG)))
$(eval $(call kernel,5.4.28,5.4.x,-rt,))
$(eval $(call kernel,4.19.106,4.19.x,-rt,))
else ifeq ($(ARCH),s390x)

View File

@@ -0,0 +1,20 @@
CONFIG_SLUB_DEBUG=y
# CONFIG_SLUB_MEMCG_SYSFS_ON is not set
CONFIG_SLUB=y
# CONFIG_SLAB_FREELIST_HARDENED is not set
CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y
CONFIG_PREEMPT=y
CONFIG_PREEMPT_RT_BASE=y
CONFIG_HAVE_PREEMPT_LAZY=y
CONFIG_PREEMPT_LAZY=y
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT__LL is not set
# CONFIG_PREEMPT_RTB is not set
CONFIG_PREEMPT_RT_FULL=y
CONFIG_PREEMPT_COUNT=y
# CONFIG_SLUB_DEBUG_ON is not set
# CONFIG_SLUB_STATS is not set
CONFIG_DEBUG_PREEMPT=y
# CONFIG_PREEMPT_TRACER is not set
CONFIG_HZ_1000=y
CONFIG_HZ=1000

View File

@@ -0,0 +1,22 @@
CONFIG_RWSEM_GENERIC_SPINLOCK=y
# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set
CONFIG_PREEMPT_RCU=y
CONFIG_TASKS_RCU=y
CONFIG_SLUB_DEBUG=y
# CONFIG_SLUB_MEMCG_SYSFS_ON is not set
CONFIG_SLUB=y
# CONFIG_SLAB_FREELIST_HARDENED is not set
CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y
CONFIG_PREEMPT=y
CONFIG_PREEMPT_RT_BASE=y
CONFIG_HAVE_PREEMPT_LAZY=y
CONFIG_PREEMPT_LAZY=y
# CONFIG_PREEMPT_VOLUNTARY is not set
# CONFIG_PREEMPT__LL is not set
# CONFIG_PREEMPT_RTB is not set
CONFIG_PREEMPT_RT_FULL=y
CONFIG_PREEMPT_COUNT=y
# CONFIG_SLUB_DEBUG_ON is not set
# CONFIG_SLUB_STATS is not set
CONFIG_DEBUG_PREEMPT=y
# CONFIG_PREEMPT_TRACER is not set

View File

@@ -0,0 +1,35 @@
From: Waiman Long <longman@redhat.com>
Date: Thu, 3 Oct 2019 16:36:08 -0400
Subject: [PATCH] lib/smp_processor_id: Don't use cpumask_equal()
The check_preemption_disabled() function uses cpumask_equal() to see
if the task is bound to the current CPU only. cpumask_equal() calls
memcmp() to do the comparison. As x86 doesn't have __HAVE_ARCH_MEMCMP,
the slow memcmp() function in lib/string.c is used.
On an RT kernel that calls check_preemption_disabled() very frequently,
below is the perf-record output of a certain microbenchmark:
42.75% 2.45% testpmd [kernel.kallsyms] [k] check_preemption_disabled
40.01% 39.97% testpmd [kernel.kallsyms] [k] memcmp
We should avoid calling memcmp() in the performance-critical path, so the
cpumask_equal() call is now replaced with an equivalent, simpler check.
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
lib/smp_processor_id.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(c
* Kernel threads bound to a single CPU can safely use
* smp_processor_id():
*/
- if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
+ if (current->nr_cpus_allowed == 1)
goto out;
/*
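For illustration only (this is not part of the patch), a userspace-style sketch of why the new check is cheaper: comparing a full CPU mask ends up in a memcmp() over the whole mask, while the cached nr_cpus_allowed counter is a single integer compare. The structure and mask size below are assumptions, not the real kernel definitions.

#include <stdbool.h>
#include <string.h>

#define MASK_WORDS 4                      /* assumed mask size (256 CPUs) */

struct task_sketch {
        unsigned long cpus_mask[MASK_WORDS];
        int nr_cpus_allowed;              /* kept in sync with the mask */
};

/* old check: cpumask_equal() boils down to a memcmp() over the mask */
static bool pinned_via_memcmp(const struct task_sketch *t,
                              const unsigned long *cpu_only_mask)
{
        return memcmp(t->cpus_mask, cpu_only_mask, sizeof(t->cpus_mask)) == 0;
}

/* new check used by the patch: one load and one integer compare */
static bool pinned_via_counter(const struct task_sketch *t)
{
        return t->nr_cpus_allowed == 1;
}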

View File

@@ -0,0 +1,57 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 9 Aug 2019 14:42:27 +0200
Subject: [PATCH 1/7] jbd2: Simplify journal_unmap_buffer()
journal_unmap_buffer() first checks whether the buffer head is journaled.
If so it takes locks and then invokes jbd2_journal_grab_journal_head()
followed by another check whether this is a journal head buffer.
The double checking is pointless.
Replace the initial check with jbd2_journal_grab_journal_head(), which
already checks whether the buffer head is actually journaled.
This also allows early access to the journal head pointer for the upcoming
conversion of the state lock to a regular spinlock.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: linux-ext4@vger.kernel.org
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/jbd2/transaction.c | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2203,7 +2203,8 @@ static int journal_unmap_buffer(journal_
* holding the page lock. --sct
*/
- if (!buffer_jbd(bh))
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh)
goto zap_buffer_unlocked;
/* OK, we have data buffer in journaled mode */
@@ -2211,10 +2212,6 @@ static int journal_unmap_buffer(journal_
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
- jh = jbd2_journal_grab_journal_head(bh);
- if (!jh)
- goto zap_buffer_no_jh;
-
/*
* We cannot remove the buffer from checkpoint lists until the
* transaction adding inode to orphan list (let's call it T)
@@ -2338,7 +2335,6 @@ static int journal_unmap_buffer(journal_
*/
jh->b_modified = 0;
jbd2_journal_put_journal_head(jh);
-zap_buffer_no_jh:
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
write_unlock(&journal->j_state_lock);

View File

@@ -0,0 +1,30 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 9 Aug 2019 14:42:28 +0200
Subject: [PATCH 2/7] jbd2: Remove jbd_trylock_bh_state()
No users.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: linux-ext4@vger.kernel.org
Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/jbd2.h | 5 -----
1 file changed, 5 deletions(-)
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -347,11 +347,6 @@ static inline void jbd_lock_bh_state(str
bit_spin_lock(BH_State, &bh->b_state);
}
-static inline int jbd_trylock_bh_state(struct buffer_head *bh)
-{
- return bit_spin_trylock(BH_State, &bh->b_state);
-}
-
static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
{
return bit_spin_is_locked(BH_State, &bh->b_state);

View File

@@ -0,0 +1,150 @@
From: Jan Kara <jack@suse.cz>
Date: Fri, 9 Aug 2019 14:42:29 +0200
Subject: [PATCH 3/7] jbd2: Move dropping of jh reference out of un/re-filing
functions
__jbd2_journal_unfile_buffer() and __jbd2_journal_refile_buffer() drop
the transaction's jh reference when they remove the jh from a transaction.
This will however be inconvenient once we move the state lock into the
journal_head itself, as we would still need to unlock it and would need to
grab a jh reference just for that. Move dropping of the jh reference out of
these functions into the few callers.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/jbd2/commit.c | 5 ++++-
fs/jbd2/transaction.c | 23 +++++++++++++++--------
include/linux/jbd2.h | 2 +-
3 files changed, 20 insertions(+), 10 deletions(-)
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -920,6 +920,7 @@ void jbd2_journal_commit_transaction(jou
transaction_t *cp_transaction;
struct buffer_head *bh;
int try_to_free = 0;
+ bool drop_ref;
jh = commit_transaction->t_forget;
spin_unlock(&journal->j_list_lock);
@@ -1028,8 +1029,10 @@ void jbd2_journal_commit_transaction(jou
try_to_free = 1;
}
JBUFFER_TRACE(jh, "refile or unfile buffer");
- __jbd2_journal_refile_buffer(jh);
+ drop_ref = __jbd2_journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
+ if (drop_ref)
+ jbd2_journal_put_journal_head(jh);
if (try_to_free)
release_buffer_page(bh); /* Drops bh reference */
else
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1602,6 +1602,7 @@ int jbd2_journal_forget (handle_t *handl
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__jbd2_journal_unfile_buffer(jh);
+ jbd2_journal_put_journal_head(jh);
if (!buffer_jbd(bh)) {
spin_unlock(&journal->j_list_lock);
goto not_jbd;
@@ -1975,17 +1976,15 @@ static void __jbd2_journal_temp_unlink_b
}
/*
- * Remove buffer from all transactions.
+ * Remove buffer from all transactions. The caller is responsible for dropping
+ * the jh reference that belonged to the transaction.
*
* Called with bh_state lock and j_list_lock
- *
- * jh and bh may be already freed when this function returns.
*/
static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
{
__jbd2_journal_temp_unlink_buffer(jh);
jh->b_transaction = NULL;
- jbd2_journal_put_journal_head(jh);
}
void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
@@ -1999,6 +1998,7 @@ void jbd2_journal_unfile_buffer(journal_
__jbd2_journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
+ jbd2_journal_put_journal_head(jh);
__brelse(bh);
}
@@ -2137,6 +2137,7 @@ static int __dispose_buffer(struct journ
} else {
JBUFFER_TRACE(jh, "on running transaction");
__jbd2_journal_unfile_buffer(jh);
+ jbd2_journal_put_journal_head(jh);
}
return may_free;
}
@@ -2502,9 +2503,11 @@ void jbd2_journal_file_buffer(struct jou
* Called under j_list_lock
* Called under jbd_lock_bh_state(jh2bh(jh))
*
- * jh and bh may be already free when this function returns
+ * When this function returns true, there's no next transaction to refile to
+ * and the caller has to drop jh reference through
+ * jbd2_journal_put_journal_head().
*/
-void __jbd2_journal_refile_buffer(struct journal_head *jh)
+bool __jbd2_journal_refile_buffer(struct journal_head *jh)
{
int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
@@ -2516,7 +2519,7 @@ void __jbd2_journal_refile_buffer(struct
/* If the buffer is now unused, just drop it. */
if (jh->b_next_transaction == NULL) {
__jbd2_journal_unfile_buffer(jh);
- return;
+ return true;
}
/*
@@ -2544,6 +2547,7 @@ void __jbd2_journal_refile_buffer(struct
if (was_dirty)
set_buffer_jbddirty(bh);
+ return false;
}
/*
@@ -2555,15 +2559,18 @@ void __jbd2_journal_refile_buffer(struct
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
struct buffer_head *bh = jh2bh(jh);
+ bool drop;
/* Get reference so that buffer cannot be freed before we unlock it */
get_bh(bh);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
- __jbd2_journal_refile_buffer(jh);
+ drop = __jbd2_journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
spin_unlock(&journal->j_list_lock);
__brelse(bh);
+ if (drop)
+ jbd2_journal_put_journal_head(jh);
}
/*
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1252,7 +1252,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM
/* Filing buffers */
extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *);
-extern void __jbd2_journal_refile_buffer(struct journal_head *);
+extern bool __jbd2_journal_refile_buffer(struct journal_head *);
extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *);
extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int);
extern void __journal_free_buffer(struct journal_head *bh);

View File

@@ -0,0 +1,27 @@
From: Jan Kara <jack@suse.cz>
Date: Fri, 9 Aug 2019 14:42:30 +0200
Subject: [PATCH 4/7] jbd2: Drop unnecessary branch from jbd2_journal_forget()
We have cleared both dirty & jbddirty bits from the bh. So there's no
difference between bforget() and brelse(). Thus there's no point jumping
to no_jbd branch.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/jbd2/transaction.c | 4 ----
1 file changed, 4 deletions(-)
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1603,10 +1603,6 @@ int jbd2_journal_forget (handle_t *handl
} else {
__jbd2_journal_unfile_buffer(jh);
jbd2_journal_put_journal_head(jh);
- if (!buffer_jbd(bh)) {
- spin_unlock(&journal->j_list_lock);
- goto not_jbd;
- }
}
spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction) {

View File

@@ -0,0 +1,58 @@
From: Jan Kara <jack@suse.cz>
Date: Fri, 9 Aug 2019 14:42:31 +0200
Subject: [PATCH 5/7] jbd2: Don't call __bforget() unnecessarily
jbd2_journal_forget() jumps to the 'not_jbd' branch, which calls __bforget()
even in cases where the buffer is clean, which is pointless. In case of a
failed assertion, it can even be argued that it is safer not to touch the
buffer's dirty bits. Logically it also makes more sense to just jump to
'drop', and that will make the logic simpler when we switch bh_state_lock to
a spinlock.
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/jbd2/transaction.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1554,7 +1554,7 @@ int jbd2_journal_forget (handle_t *handl
if (!J_EXPECT_JH(jh, !jh->b_committed_data,
"inconsistent data on disk")) {
err = -EIO;
- goto not_jbd;
+ goto drop;
}
/* keep track of whether or not this transaction modified us */
@@ -1644,7 +1644,7 @@ int jbd2_journal_forget (handle_t *handl
if (!jh->b_cp_transaction) {
JBUFFER_TRACE(jh, "belongs to none transaction");
spin_unlock(&journal->j_list_lock);
- goto not_jbd;
+ goto drop;
}
/*
@@ -1654,7 +1654,7 @@ int jbd2_journal_forget (handle_t *handl
if (!buffer_dirty(bh)) {
__jbd2_journal_remove_checkpoint(jh);
spin_unlock(&journal->j_list_lock);
- goto not_jbd;
+ goto drop;
}
/*
@@ -1667,10 +1667,9 @@ int jbd2_journal_forget (handle_t *handl
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
spin_unlock(&journal->j_list_lock);
}
-
+drop:
jbd_unlock_bh_state(bh);
__brelse(bh);
-drop:
if (drop_reserve) {
/* no need to reserve log space for this block -bzzz */
handle->h_buffer_credits++;

View File

@@ -0,0 +1,675 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 9 Aug 2019 14:42:32 +0200
Subject: [PATCH 6/7] jbd2: Make state lock a spinlock
Bit-spinlocks are problematic on PREEMPT_RT if functions which might sleep
on RT, e.g. spin_lock(), alloc/free(), are invoked inside the lock held
region because bit spinlocks disable preemption even on RT.
A first attempt was to replace state lock with a spinlock placed in struct
buffer_head and make the locking conditional on PREEMPT_RT and
DEBUG_BIT_SPINLOCKS.
Jan pointed out that there is a 4 byte hole in struct journal_head where a
regular spinlock fits in and he would not object to convert the state lock
to a spinlock unconditionally.
Aside from solving the RT problem, this also gains lockdep coverage for the
journal head state lock (bit-spinlocks are not covered by lockdep as it's
hard to fit a lockdep map into a single bit).
The trivial change would have been to convert the jbd_*lock_bh_state()
inlines, but that comes with the downside that these functions take a
buffer head pointer which needs to be converted to a journal head pointer
which adds another level of indirection.
As almost all functions which use this lock have a journal head pointer
readily available, it makes more sense to remove the lock helper inlines
and write out spin_*lock() at all call sites.
Fixup all locking comments as well.
Suggested-by: Jan Kara <jack@suse.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Jan Kara <jack@suse.com>
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/jbd2/commit.c | 8 +--
fs/jbd2/journal.c | 10 ++--
fs/jbd2/transaction.c | 100 ++++++++++++++++++++-----------------------
fs/ocfs2/suballoc.c | 19 ++++----
include/linux/jbd2.h | 20 --------
include/linux/journal-head.h | 21 ++++++---
6 files changed, 84 insertions(+), 94 deletions(-)
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(jou
if (jh->b_committed_data) {
struct buffer_head *bh = jh2bh(jh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
jbd2_free(jh->b_committed_data, bh->b_size);
jh->b_committed_data = NULL;
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
jbd2_journal_refile_buffer(journal, jh);
}
@@ -930,7 +930,7 @@ void jbd2_journal_commit_transaction(jou
* done with it.
*/
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
/*
@@ -1030,7 +1030,7 @@ void jbd2_journal_commit_transaction(jou
}
JBUFFER_TRACE(jh, "refile or unfile buffer");
drop_ref = __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
if (drop_ref)
jbd2_journal_put_journal_head(jh);
if (try_to_free)
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(t
/* keep subsequent assertions sane */
atomic_set(&new_bh->b_count, 1);
- jbd_lock_bh_state(bh_in);
+ spin_lock(&jh_in->b_state_lock);
repeat:
/*
* If a new transaction has already done a buffer copy-out, then
@@ -405,13 +405,13 @@ int jbd2_journal_write_metadata_buffer(t
if (need_copy_out && !done_copy_out) {
char *tmp;
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
brelse(new_bh);
return -ENOMEM;
}
- jbd_lock_bh_state(bh_in);
+ spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
goto repeat;
@@ -464,7 +464,7 @@ int jbd2_journal_write_metadata_buffer(t
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
- jbd_unlock_bh_state(bh_in);
+ spin_unlock(&jh_in->b_state_lock);
return do_escape | (done_copy_out << 1);
}
@@ -2407,6 +2407,8 @@ static struct journal_head *journal_allo
ret = kmem_cache_zalloc(jbd2_journal_head_cache,
GFP_NOFS | __GFP_NOFAIL);
}
+ if (ret)
+ spin_lock_init(&ret->b_state_lock);
return ret;
}
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -877,7 +877,7 @@ do_get_write_access(handle_t *handle, st
start_lock = jiffies;
lock_buffer(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
/* If it takes too long to lock the buffer, trace it */
time_lock = jbd2_time_diff(start_lock, jiffies);
@@ -927,7 +927,7 @@ do_get_write_access(handle_t *handle, st
error = -EROFS;
if (is_handle_aborted(handle)) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto out;
}
error = 0;
@@ -991,7 +991,7 @@ do_get_write_access(handle_t *handle, st
*/
if (buffer_shadow(bh)) {
JBUFFER_TRACE(jh, "on shadow: sleep");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE);
goto repeat;
}
@@ -1012,7 +1012,7 @@ do_get_write_access(handle_t *handle, st
JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer");
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS | __GFP_NOFAIL);
goto repeat;
@@ -1031,7 +1031,7 @@ do_get_write_access(handle_t *handle, st
jh->b_next_transaction = transaction;
done:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* If we are about to journal a buffer, then any revoke pending on it is
@@ -1173,7 +1173,7 @@ int jbd2_journal_get_create_access(handl
* that case: the transaction must have deleted the buffer for it to be
* reused here.
*/
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
jh->b_transaction == NULL ||
(jh->b_transaction == journal->j_committing_transaction &&
@@ -1208,7 +1208,7 @@ int jbd2_journal_get_create_access(handl
jh->b_next_transaction = transaction;
spin_unlock(&journal->j_list_lock);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
/*
* akpm: I added this. ext3_alloc_branch can pick up new indirect
@@ -1279,13 +1279,13 @@ int jbd2_journal_get_undo_access(handle_
committed_data = jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS|__GFP_NOFAIL);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (!jh->b_committed_data) {
/* Copy out the current buffer contents into the
* preserved, committed copy. */
JBUFFER_TRACE(jh, "generate b_committed data");
if (!committed_data) {
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
goto repeat;
}
@@ -1293,7 +1293,7 @@ int jbd2_journal_get_undo_access(handle_
committed_data = NULL;
memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
}
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
jbd2_journal_put_journal_head(jh);
if (unlikely(committed_data))
@@ -1394,16 +1394,16 @@ int jbd2_journal_dirty_metadata(handle_t
*/
if (jh->b_transaction != transaction &&
jh->b_next_transaction != transaction) {
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
J_ASSERT_JH(jh, jh->b_transaction == transaction ||
jh->b_next_transaction == transaction);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
if (jh->b_modified == 1) {
/* If it's in our transaction it must be in BJ_Metadata list. */
if (jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata) {
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata)
pr_err("JBD2: assertion failure: h_type=%u "
@@ -1413,13 +1413,13 @@ int jbd2_journal_dirty_metadata(handle_t
jh->b_jlist);
J_ASSERT_JH(jh, jh->b_transaction != transaction ||
jh->b_jlist == BJ_Metadata);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
}
goto out;
}
journal = transaction->t_journal;
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
if (jh->b_modified == 0) {
/*
@@ -1505,7 +1505,7 @@ int jbd2_journal_dirty_metadata(handle_t
__jbd2_journal_file_buffer(jh, transaction, BJ_Metadata);
spin_unlock(&journal->j_list_lock);
out_unlock_bh:
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
out:
JBUFFER_TRACE(jh, "exit");
return ret;
@@ -1543,11 +1543,13 @@ int jbd2_journal_forget (handle_t *handl
BUFFER_TRACE(bh, "entry");
- jbd_lock_bh_state(bh);
+ jh = jbd2_journal_grab_journal_head(bh);
+ if (!jh) {
+ __bforget(bh);
+ return 0;
+ }
- if (!buffer_jbd(bh))
- goto not_jbd;
- jh = bh2jh(bh);
+ spin_lock(&jh->b_state_lock);
/* Critical error: attempting to delete a bitmap buffer, maybe?
* Don't do any jbd operations, and return an error. */
@@ -1668,18 +1670,14 @@ int jbd2_journal_forget (handle_t *handl
spin_unlock(&journal->j_list_lock);
}
drop:
- jbd_unlock_bh_state(bh);
__brelse(bh);
+ spin_unlock(&jh->b_state_lock);
+ jbd2_journal_put_journal_head(jh);
if (drop_reserve) {
/* no need to reserve log space for this block -bzzz */
handle->h_buffer_credits++;
}
return err;
-
-not_jbd:
- jbd_unlock_bh_state(bh);
- __bforget(bh);
- goto drop;
}
/**
@@ -1878,7 +1876,7 @@ int jbd2_journal_stop(handle_t *handle)
*
* j_list_lock is held.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1902,7 +1900,7 @@ static inline void
*
* Called with j_list_lock held, and the journal may not be locked.
*
- * jbd_lock_bh_state(jh2bh(jh)) is held.
+ * jh->b_state_lock is held.
*/
static inline void
@@ -1934,7 +1932,7 @@ static void __jbd2_journal_temp_unlink_b
transaction_t *transaction;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
transaction = jh->b_transaction;
if (transaction)
assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1988,11 +1986,11 @@ void jbd2_journal_unfile_buffer(journal_
/* Get reference so that buffer cannot be freed before we unlock it */
get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
__jbd2_journal_unfile_buffer(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
__brelse(bh);
}
@@ -2000,7 +1998,7 @@ void jbd2_journal_unfile_buffer(journal_
/*
* Called from jbd2_journal_try_to_free_buffers().
*
- * Called under jbd_lock_bh_state(bh)
+ * Called under jh->b_state_lock
*/
static void
__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
@@ -2087,10 +2085,10 @@ int jbd2_journal_try_to_free_buffers(jou
if (!jh)
continue;
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
__journal_try_to_free_buffer(journal, bh);
+ spin_unlock(&jh->b_state_lock);
jbd2_journal_put_journal_head(jh);
- jbd_unlock_bh_state(bh);
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
@@ -2111,7 +2109,7 @@ int jbd2_journal_try_to_free_buffers(jou
*
* Called under j_list_lock.
*
- * Called under jbd_lock_bh_state(bh).
+ * Called under jh->b_state_lock.
*/
static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
{
@@ -2205,7 +2203,7 @@ static int journal_unmap_buffer(journal_
/* OK, we have data buffer in journaled mode */
write_lock(&journal->j_state_lock);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
/*
@@ -2286,10 +2284,10 @@ static int journal_unmap_buffer(journal_
* for commit and try again.
*/
if (partial_page) {
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return -EBUSY;
}
/*
@@ -2303,10 +2301,10 @@ static int journal_unmap_buffer(journal_
if (journal->j_running_transaction && buffer_jbddirty(bh))
jh->b_next_transaction = journal->j_running_transaction;
jh->b_modified = 0;
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
return 0;
} else {
/* Good, the buffer belongs to the running transaction.
@@ -2330,10 +2328,10 @@ static int journal_unmap_buffer(journal_
* here.
*/
jh->b_modified = 0;
- jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
write_unlock(&journal->j_state_lock);
+ jbd2_journal_put_journal_head(jh);
zap_buffer_unlocked:
clear_buffer_dirty(bh);
J_ASSERT_BH(bh, !buffer_jbddirty(bh));
@@ -2420,7 +2418,7 @@ void __jbd2_journal_file_buffer(struct j
int was_dirty = 0;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
assert_spin_locked(&transaction->t_journal->j_list_lock);
J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2482,11 +2480,11 @@ void __jbd2_journal_file_buffer(struct j
void jbd2_journal_file_buffer(struct journal_head *jh,
transaction_t *transaction, int jlist)
{
- jbd_lock_bh_state(jh2bh(jh));
+ spin_lock(&jh->b_state_lock);
spin_lock(&transaction->t_journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, jlist);
spin_unlock(&transaction->t_journal->j_list_lock);
- jbd_unlock_bh_state(jh2bh(jh));
+ spin_unlock(&jh->b_state_lock);
}
/*
@@ -2496,7 +2494,7 @@ void jbd2_journal_file_buffer(struct jou
* buffer on that transaction's metadata list.
*
* Called under j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh))
+ * Called under jh->b_state_lock
*
* When this function returns true, there's no next transaction to refile to
* and the caller has to drop jh reference through
@@ -2507,7 +2505,7 @@ bool __jbd2_journal_refile_buffer(struct
int was_dirty, jlist;
struct buffer_head *bh = jh2bh(jh);
- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+ lockdep_assert_held(&jh->b_state_lock);
if (jh->b_transaction)
assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
@@ -2553,17 +2551,13 @@ bool __jbd2_journal_refile_buffer(struct
*/
void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
{
- struct buffer_head *bh = jh2bh(jh);
bool drop;
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- jbd_lock_bh_state(bh);
+ spin_lock(&jh->b_state_lock);
spin_lock(&journal->j_list_lock);
drop = __jbd2_journal_refile_buffer(jh);
- jbd_unlock_bh_state(bh);
+ spin_unlock(&jh->b_state_lock);
spin_unlock(&journal->j_list_lock);
- __brelse(bh);
if (drop)
jbd2_journal_put_journal_head(jh);
}
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable
int nr)
{
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
+ struct journal_head *jh;
int ret;
if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
@@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable
if (!buffer_jbd(bg_bh))
return 1;
- jbd_lock_bh_state(bg_bh);
- bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
+ jh = bh2jh(bg_bh);
+ spin_lock(&jh->b_state_lock);
+ bg = (struct ocfs2_group_desc *) jh->b_committed_data;
if (bg)
ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
else
ret = 1;
- jbd_unlock_bh_state(bg_bh);
+ spin_unlock(&jh->b_state_lock);
return ret;
}
@@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits(
int status;
unsigned int tmp;
struct ocfs2_group_desc *undo_bg = NULL;
+ struct journal_head *jh;
/* The caller got this descriptor from
* ocfs2_read_group_descriptor(). Any corruption is a code bug. */
@@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits(
goto bail;
}
+ jh = bh2jh(group_bh);
if (undo_fn) {
- jbd_lock_bh_state(group_bh);
- undo_bg = (struct ocfs2_group_desc *)
- bh2jh(group_bh)->b_committed_data;
+ spin_lock(&jh->b_state_lock);
+ undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data;
BUG_ON(!undo_bg);
}
@@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits(
le16_add_cpu(&bg->bg_free_bits_count, num_bits);
if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
if (undo_fn)
- jbd_unlock_bh_state(group_bh);
+ spin_unlock(&jh->b_state_lock);
return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n",
(unsigned long long)le64_to_cpu(bg->bg_blkno),
le16_to_cpu(bg->bg_bits),
@@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits(
}
if (undo_fn)
- jbd_unlock_bh_state(group_bh);
+ spin_unlock(&jh->b_state_lock);
ocfs2_journal_dirty(handle, group_bh);
bail:
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -313,7 +313,6 @@ enum jbd_state_bits {
BH_Revoked, /* Has been revoked from the log */
BH_RevokeValid, /* Revoked flag is valid */
BH_JBDDirty, /* Is dirty but journaled */
- BH_State, /* Pins most journal_head state */
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
BH_Shadow, /* IO on shadow buffer is running */
BH_Verified, /* Metadata block has been verified ok */
@@ -342,21 +341,6 @@ static inline struct journal_head *bh2jh
return bh->b_private;
}
-static inline void jbd_lock_bh_state(struct buffer_head *bh)
-{
- bit_spin_lock(BH_State, &bh->b_state);
-}
-
-static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
-{
- return bit_spin_is_locked(BH_State, &bh->b_state);
-}
-
-static inline void jbd_unlock_bh_state(struct buffer_head *bh)
-{
- bit_spin_unlock(BH_State, &bh->b_state);
-}
-
static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
{
bit_spin_lock(BH_JournalHead, &bh->b_state);
@@ -551,9 +535,9 @@ struct transaction_chp_stats_s {
* ->jbd_lock_bh_journal_head() (This is "innermost")
*
* j_state_lock
- * ->jbd_lock_bh_state()
+ * ->b_state_lock
*
- * jbd_lock_bh_state()
+ * b_state_lock
* ->j_list_lock
*
* j_state_lock
--- a/include/linux/journal-head.h
+++ b/include/linux/journal-head.h
@@ -11,6 +11,8 @@
#ifndef JOURNAL_HEAD_H_INCLUDED
#define JOURNAL_HEAD_H_INCLUDED
+#include <linux/spinlock.h>
+
typedef unsigned int tid_t; /* Unique transaction ID */
typedef struct transaction_s transaction_t; /* Compound transaction type */
@@ -24,13 +26,18 @@ struct journal_head {
struct buffer_head *b_bh;
/*
+ * Protect the buffer head state
+ */
+ spinlock_t b_state_lock;
+
+ /*
* Reference count - see description in journal.c
* [jbd_lock_bh_journal_head()]
*/
int b_jcount;
/*
- * Journalling list for this buffer [jbd_lock_bh_state()]
+ * Journalling list for this buffer [b_state_lock]
* NOTE: We *cannot* combine this with b_modified into a bitfield
* as gcc would then (which the C standard allows but which is
* very unuseful) make 64-bit accesses to the bitfield and clobber
@@ -41,20 +48,20 @@ struct journal_head {
/*
* This flag signals the buffer has been modified by
* the currently running transaction
- * [jbd_lock_bh_state()]
+ * [b_state_lock]
*/
unsigned b_modified;
/*
* Copy of the buffer data frozen for writing to the log.
- * [jbd_lock_bh_state()]
+ * [b_state_lock]
*/
char *b_frozen_data;
/*
* Pointer to a saved copy of the buffer containing no uncommitted
* deallocation references, so that allocations can avoid overwriting
- * uncommitted deletes. [jbd_lock_bh_state()]
+ * uncommitted deletes. [b_state_lock]
*/
char *b_committed_data;
@@ -63,7 +70,7 @@ struct journal_head {
* metadata: either the running transaction or the committing
* transaction (if there is one). Only applies to buffers on a
* transaction's data or metadata journaling list.
- * [j_list_lock] [jbd_lock_bh_state()]
+ * [j_list_lock] [b_state_lock]
* Either of these locks is enough for reading, both are needed for
* changes.
*/
@@ -73,13 +80,13 @@ struct journal_head {
* Pointer to the running compound transaction which is currently
* modifying the buffer's metadata, if there was already a transaction
* committing it when the new transaction touched it.
- * [t_list_lock] [jbd_lock_bh_state()]
+ * [t_list_lock] [b_state_lock]
*/
transaction_t *b_next_transaction;
/*
* Doubly-linked list of buffers on a transaction's data, metadata or
- * forget queue. [t_list_lock] [jbd_lock_bh_state()]
+ * forget queue. [t_list_lock] [b_state_lock]
*/
struct journal_head *b_tnext, *b_tprev;
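As a minimal sketch of the two schemes this patch trades between (illustrative only, simplified; the two helpers below do not come from the same tree): before, the state lock is a single bit in bh->b_state taken with a bit-spinlock, which disables preemption and is invisible to lockdep; after, it is a regular spinlock_t embedded in struct journal_head, which lockdep can track and which PREEMPT_RT can substitute with a sleeping lock.

#include <linux/bit_spinlock.h>
#include <linux/jbd2.h>

/* before (pre-patch API): bit-spinlock on the buffer head */
static inline void state_lock_old(struct buffer_head *bh)
{
        bit_spin_lock(BH_State, &bh->b_state);
}

/* after (post-patch API): a normal spinlock_t in the journal head */
static inline void state_lock_new(struct journal_head *jh)
{
        spin_lock(&jh->b_state_lock);
}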

View File

@@ -0,0 +1,88 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 9 Aug 2019 14:42:33 +0200
Subject: [PATCH 7/7] jbd2: Free journal head outside of locked region
On PREEMPT_RT bit-spinlocks have the same semantics as on PREEMPT_RT=n,
i.e. they disable preemption. That means functions which are not safe to be
called in preempt disabled context on RT trigger a might_sleep() assert.
The journal head bit spinlock is mostly held for short code sequences with
trivial RT safe functionality, except for one place:
jbd2_journal_put_journal_head() invokes __journal_remove_journal_head()
with the journal head bit spinlock held. __journal_remove_journal_head()
invokes kmem_cache_free() which must not be called with preemption disabled
on RT.
Jan suggested to rework the removal function so the actual free happens
outside the bit-spinlocked region.
Split it into two parts:
- Do the sanity checks and the buffer head detach under the lock
- Do the actual free after dropping the lock
There is error case handling in the free part which needs to dereference
the b_size field of the now detached buffer head. Due to paranoia (caused
by ignorance) the size is retrieved in the detach function and handed into
the free function. Might be over-engineered, but better safe than sorry.
This makes the journal head bit-spinlock usage RT compliant and also avoids
nested locking which is not covered by lockdep.
Suggested-by: Jan Kara <jack@suse.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-ext4@vger.kernel.org
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Jan Kara <jack@suse.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/jbd2/journal.c | 20 ++++++++++++++------
1 file changed, 14 insertions(+), 6 deletions(-)
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2528,17 +2528,23 @@ static void __journal_remove_journal_hea
J_ASSERT_BH(bh, buffer_jbd(bh));
J_ASSERT_BH(bh, jh2bh(jh) == bh);
BUFFER_TRACE(bh, "remove journal_head");
+
+ /* Unlink before dropping the lock */
+ bh->b_private = NULL;
+ jh->b_bh = NULL; /* debug, really */
+ clear_buffer_jbd(bh);
+}
+
+static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
+{
if (jh->b_frozen_data) {
printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
- jbd2_free(jh->b_frozen_data, bh->b_size);
+ jbd2_free(jh->b_frozen_data, b_size);
}
if (jh->b_committed_data) {
printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
- jbd2_free(jh->b_committed_data, bh->b_size);
+ jbd2_free(jh->b_committed_data, b_size);
}
- bh->b_private = NULL;
- jh->b_bh = NULL; /* debug, really */
- clear_buffer_jbd(bh);
journal_free_journal_head(jh);
}
@@ -2556,9 +2562,11 @@ void jbd2_journal_put_journal_head(struc
if (!jh->b_jcount) {
__journal_remove_journal_head(bh);
jbd_unlock_bh_journal_head(bh);
+ journal_release_journal_head(jh, bh->b_size);
__brelse(bh);
- } else
+ } else {
jbd_unlock_bh_journal_head(bh);
+ }
}
/*
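The underlying pattern, reduced to a sketch with hypothetical names (not the real jbd2 helpers): unlink the object while the lock is held and defer the actual free, whose allocator paths take sleeping locks on PREEMPT_RT, until after the lock is dropped.

#include <linux/slab.h>
#include <linux/spinlock.h>

struct obj { int payload; };

struct holder {
        spinlock_t lock;
        struct obj *obj;
};

static void remove_obj(struct holder *h)
{
        struct obj *victim;

        spin_lock(&h->lock);
        victim = h->obj;        /* detach under the lock ... */
        h->obj = NULL;
        spin_unlock(&h->lock);

        kfree(victim);          /* ... free in preemptible context */
}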

View File

@@ -0,0 +1,86 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 17 Oct 2019 12:19:02 +0200
Subject: [PATCH] x86/ioapic: Rename misnamed functions
ioapic_irqd_[un]mask() are misnomers as both functions do way more than
masking and unmasking the interrupt line. Both deal with moving the
affinity of the interrupt within interrupt context. The mask/unmask is just
a tiny part of the functionality.
Rename them to ioapic_prepare/finish_move(), fixup the call sites and
rename the related variables in the code to reflect what this is about.
No functional change.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sebastian Siewior <bigeasy@linutronix.de>
Link: https://lkml.kernel.org/r/20191017101938.412489856@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/x86/kernel/apic/io_apic.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1725,7 +1725,7 @@ static bool io_apic_level_ack_pending(st
return false;
}
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
/* If we are moving the IRQ we need to mask it */
if (unlikely(irqd_is_setaffinity_pending(data))) {
@@ -1736,9 +1736,9 @@ static inline bool ioapic_irqd_mask(stru
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
- if (unlikely(masked)) {
+ if (unlikely(moveit)) {
/* Only migrate the irq if the ack has been received.
*
* On rare occasions the broadcast level triggered ack gets
@@ -1773,11 +1773,11 @@ static inline void ioapic_irqd_unmask(st
}
}
#else
-static inline bool ioapic_irqd_mask(struct irq_data *data)
+static inline bool ioapic_prepare_move(struct irq_data *data)
{
return false;
}
-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
+static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
{
}
#endif
@@ -1786,11 +1786,11 @@ static void ioapic_ack_level(struct irq_
{
struct irq_cfg *cfg = irqd_cfg(irq_data);
unsigned long v;
- bool masked;
+ bool moveit;
int i;
irq_complete_move(cfg);
- masked = ioapic_irqd_mask(irq_data);
+ moveit = ioapic_prepare_move(irq_data);
/*
* It appears there is an erratum which affects at least version 0x11
@@ -1845,7 +1845,7 @@ static void ioapic_ack_level(struct irq_
eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
}
- ioapic_irqd_unmask(irq_data, masked);
+ ioapic_finish_move(irq_data, moveit);
}
static void ioapic_ir_ack_level(struct irq_data *irq_data)

View File

@@ -0,0 +1,100 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 4 Sep 2019 17:59:36 +0200
Subject: [PATCH] percpu-refcount: use normal RCU instead of RCU-sched
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This is a revert of commit
a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU")
which claims the only reason for using RCU-sched is
"rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()"
and
"As the RCU critical sections are extremely short, using sched-RCU
shouldn't have any latency implications."
The problem with RCU-sched is that it disables preemption, so the
callback must not acquire any sleeping locks like spinlock_t on
PREEMPT_RT, which is exactly what happens here.
Convert back to normal RCU.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/percpu-refcount.h | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(s
{
unsigned long __percpu *percpu_count;
- rcu_read_lock_sched();
+ rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count))
this_cpu_add(*percpu_count, nr);
else
atomic_long_add(nr, &ref->count);
- rcu_read_unlock_sched();
+ rcu_read_unlock();
}
/**
@@ -223,7 +223,7 @@ static inline bool percpu_ref_tryget(str
unsigned long __percpu *percpu_count;
bool ret;
- rcu_read_lock_sched();
+ rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_inc(*percpu_count);
@@ -232,7 +232,7 @@ static inline bool percpu_ref_tryget(str
ret = atomic_long_inc_not_zero(&ref->count);
}
- rcu_read_unlock_sched();
+ rcu_read_unlock();
return ret;
}
@@ -257,7 +257,7 @@ static inline bool percpu_ref_tryget_liv
unsigned long __percpu *percpu_count;
bool ret = false;
- rcu_read_lock_sched();
+ rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_inc(*percpu_count);
@@ -266,7 +266,7 @@ static inline bool percpu_ref_tryget_liv
ret = atomic_long_inc_not_zero(&ref->count);
}
- rcu_read_unlock_sched();
+ rcu_read_unlock();
return ret;
}
@@ -285,14 +285,14 @@ static inline void percpu_ref_put_many(s
{
unsigned long __percpu *percpu_count;
- rcu_read_lock_sched();
+ rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count))
this_cpu_sub(*percpu_count, nr);
else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
ref->release(ref);
- rcu_read_unlock_sched();
+ rcu_read_unlock();
}
/**
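For context, a small sketch of the two reader flavours involved (illustrative, not taken from this patch): rcu_read_lock_sched() implies preempt_disable(), so nothing inside that section may take a PREEMPT_RT sleeping lock such as spinlock_t, while plain rcu_read_lock() keeps the section preemptible on RT.

#include <linux/rcupdate.h>

static void reader_sched_flavour(void)
{
        rcu_read_lock_sched();          /* preemption disabled until unlock */
        /* taking a spinlock_t here would be illegal on PREEMPT_RT */
        rcu_read_unlock_sched();
}

static void reader_normal_flavour(void)
{
        rcu_read_lock();                /* section stays preemptible on RT */
        rcu_read_unlock();
}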

View File

@@ -0,0 +1,70 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 10 Apr 2019 11:01:37 +0200
Subject: [PATCH] drm/i915: Don't disable interrupts independently of the
lock
The locks (active.lock and rq->lock) need to be taken with disabled
interrupts. This is done in i915_request_retire() by disabling the
interrupts independently of the locks themselves.
While local_irq_disable()+spin_lock() equals spin_lock_irq() on a vanilla
kernel, it does not on PREEMPT_RT.
Chris Wilson confirmed that local_irq_disable() was just introduced as
an optimisation to avoid enabling/disabling interrupts during the
lock/unlock combo.
Enable/disable interrupts as part of the locking instruction.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/gpu/drm/i915/i915_request.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -205,14 +205,14 @@ static void remove_from_engine(struct i9
* check that the rq still belongs to the newly locked engine.
*/
locked = READ_ONCE(rq->engine);
- spin_lock(&locked->active.lock);
+ spin_lock_irq(&locked->active.lock);
while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
spin_unlock(&locked->active.lock);
spin_lock(&engine->active.lock);
locked = engine;
}
list_del(&rq->sched.link);
- spin_unlock(&locked->active.lock);
+ spin_unlock_irq(&locked->active.lock);
}
static bool i915_request_retire(struct i915_request *rq)
@@ -272,8 +272,6 @@ static bool i915_request_retire(struct i
active->retire(active, rq);
}
- local_irq_disable();
-
/*
* We only loosely track inflight requests across preemption,
* and so we may find ourselves attempting to retire a _completed_
@@ -282,7 +280,7 @@ static bool i915_request_retire(struct i
*/
remove_from_engine(rq);
- spin_lock(&rq->lock);
+ spin_lock_irq(&rq->lock);
i915_request_mark_complete(rq);
if (!i915_request_signaled(rq))
dma_fence_signal_locked(&rq->fence);
@@ -297,9 +295,7 @@ static bool i915_request_retire(struct i
__notify_execute_cb(rq);
}
GEM_BUG_ON(!list_empty(&rq->execute_cb));
- spin_unlock(&rq->lock);
-
- local_irq_enable();
+ spin_unlock_irq(&rq->lock);
remove_from_client(rq);
list_del(&rq->link);
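A short sketch of the two idioms (the lock below is a placeholder spinlock_t, not the i915 one): open-coding the interrupt disable around spin_lock() is equivalent on mainline, but on PREEMPT_RT spin_lock() maps to a sleeping lock that must not be taken with interrupts hard-disabled, so the interrupt handling has to be part of the locking primitive itself.

#include <linux/spinlock.h>

static void broken_on_rt(spinlock_t *lock)
{
        local_irq_disable();
        spin_lock(lock);        /* would sleep with IRQs off on PREEMPT_RT */
        spin_unlock(lock);
        local_irq_enable();
}

static void works_everywhere(spinlock_t *lock)
{
        spin_lock_irq(lock);    /* RT substitutes the right primitive */
        spin_unlock_irq(lock);
}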

View File

@@ -0,0 +1,35 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 15 Nov 2019 21:37:22 +0100
Subject: [PATCH] block: Don't disable interrupts in trigger_softirq()
trigger_softirq() is always invoked as an SMP function call, which is
always invoked with interrupts disabled.
Don't disable interrupts in trigger_softirq() because interrupts are
already disabled.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
block/blk-softirq.c | 4 ----
1 file changed, 4 deletions(-)
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -42,17 +42,13 @@ static __latent_entropy void blk_done_so
static void trigger_softirq(void *data)
{
struct request *rq = data;
- unsigned long flags;
struct list_head *list;
- local_irq_save(flags);
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&rq->ipi_list, list);
if (list->next == &rq->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
- local_irq_restore(flags);
}
/*
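As a reminder of the calling context (a sketch with hypothetical names): callbacks handed to smp_call_function_single() run from the IPI, i.e. in hard interrupt context on the target CPU, so interrupts are already disabled when the callback executes.

#include <linux/bug.h>
#include <linux/irqflags.h>
#include <linux/smp.h>

static void my_ipi_callback(void *data)
{
        /* runs in hard interrupt context on the target CPU */
        WARN_ON_ONCE(!irqs_disabled());
}

static void kick_cpu(int cpu)
{
        smp_call_function_single(cpu, my_ipi_callback, NULL, 0);
}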

View File

@@ -0,0 +1,89 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 26 Jul 2018 09:13:42 +0200
Subject: [PATCH] arm64: KVM: Invoke compute_layout() before alternatives are
applied
compute_layout() is invoked as part of an alternative fixup under
stop_machine(). This function invokes get_random_long(), which acquires a
sleeping lock on -RT that cannot be acquired in this context.
Rename compute_layout() to kvm_compute_layout() and invoke it before
stop_machine() applies the alternatives. Add a __init prefix to
kvm_compute_layout() because the caller has it, too (and so the code can be
discarded after boot).
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm64/include/asm/kvm_mmu.h | 1 +
arch/arm64/kernel/smp.c | 4 ++++
arch/arm64/kvm/va_layout.c | 8 +-------
3 files changed, 6 insertions(+), 7 deletions(-)
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -91,6 +91,7 @@ alternative_cb_end
void kvm_update_va_mask(struct alt_instr *alt,
__le32 *origptr, __le32 *updptr, int nr_inst);
+void kvm_compute_layout(void);
static inline unsigned long __kern_hyp_va(unsigned long v)
{
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -31,6 +31,7 @@
#include <linux/of.h>
#include <linux/irq_work.h>
#include <linux/kexec.h>
+#include <linux/kvm_host.h>
#include <asm/alternative.h>
#include <asm/atomic.h>
@@ -39,6 +40,7 @@
#include <asm/cputype.h>
#include <asm/cpu_ops.h>
#include <asm/daifflags.h>
+#include <asm/kvm_mmu.h>
#include <asm/mmu_context.h>
#include <asm/numa.h>
#include <asm/pgtable.h>
@@ -408,6 +410,8 @@ static void __init hyp_mode_check(void)
"CPU: CPUs started in inconsistent modes");
else
pr_info("CPU: All CPU(s) started at EL1\n");
+ if (IS_ENABLED(CONFIG_KVM_ARM_HOST))
+ kvm_compute_layout();
}
void __init smp_cpus_done(unsigned int max_cpus)
--- a/arch/arm64/kvm/va_layout.c
+++ b/arch/arm64/kvm/va_layout.c
@@ -22,7 +22,7 @@ static u8 tag_lsb;
static u64 tag_val;
static u64 va_mask;
-static void compute_layout(void)
+__init void kvm_compute_layout(void)
{
phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start);
u64 hyp_va_msb;
@@ -110,9 +110,6 @@ void __init kvm_update_va_mask(struct al
BUG_ON(nr_inst != 5);
- if (!has_vhe() && !va_mask)
- compute_layout();
-
for (i = 0; i < nr_inst; i++) {
u32 rd, rn, insn, oinsn;
@@ -156,9 +153,6 @@ void kvm_patch_vector_branch(struct alt_
return;
}
- if (!va_mask)
- compute_layout();
-
/*
* Compute HYP VA by using the same computation as kern_hyp_va()
*/

View File

@@ -0,0 +1,57 @@
From: Marc Kleine-Budde <mkl@pengutronix.de>
Date: Wed, 5 Mar 2014 00:49:47 +0100
Subject: net: sched: Use msleep() instead of yield()
On PREEMPT_RT enabled systems the interrupt handlers run as threads at priority 50
(by default). If a high priority userspace process tries to shut down a busy
network interface it might spin in a yield loop waiting for the device to
become idle. With the interrupt thread having a lower priority than the
looping process it might never be scheduled and so result in a deadlock on UP
systems.
With Magic SysRq the following backtrace can be produced:
> test_app R running 0 174 168 0x00000000
> [<c02c7070>] (__schedule+0x220/0x3fc) from [<c02c7870>] (preempt_schedule_irq+0x48/0x80)
> [<c02c7870>] (preempt_schedule_irq+0x48/0x80) from [<c0008fa8>] (svc_preempt+0x8/0x20)
> [<c0008fa8>] (svc_preempt+0x8/0x20) from [<c001a984>] (local_bh_enable+0x18/0x88)
> [<c001a984>] (local_bh_enable+0x18/0x88) from [<c025316c>] (dev_deactivate_many+0x220/0x264)
> [<c025316c>] (dev_deactivate_many+0x220/0x264) from [<c023be04>] (__dev_close_many+0x64/0xd4)
> [<c023be04>] (__dev_close_many+0x64/0xd4) from [<c023be9c>] (__dev_close+0x28/0x3c)
> [<c023be9c>] (__dev_close+0x28/0x3c) from [<c023f7f0>] (__dev_change_flags+0x88/0x130)
> [<c023f7f0>] (__dev_change_flags+0x88/0x130) from [<c023f904>] (dev_change_flags+0x10/0x48)
> [<c023f904>] (dev_change_flags+0x10/0x48) from [<c024c140>] (do_setlink+0x370/0x7ec)
> [<c024c140>] (do_setlink+0x370/0x7ec) from [<c024d2f0>] (rtnl_newlink+0x2b4/0x450)
> [<c024d2f0>] (rtnl_newlink+0x2b4/0x450) from [<c024cfa0>] (rtnetlink_rcv_msg+0x158/0x1f4)
> [<c024cfa0>] (rtnetlink_rcv_msg+0x158/0x1f4) from [<c0256740>] (netlink_rcv_skb+0xac/0xc0)
> [<c0256740>] (netlink_rcv_skb+0xac/0xc0) from [<c024bbd8>] (rtnetlink_rcv+0x18/0x24)
> [<c024bbd8>] (rtnetlink_rcv+0x18/0x24) from [<c02561b8>] (netlink_unicast+0x13c/0x198)
> [<c02561b8>] (netlink_unicast+0x13c/0x198) from [<c025651c>] (netlink_sendmsg+0x264/0x2e0)
> [<c025651c>] (netlink_sendmsg+0x264/0x2e0) from [<c022af98>] (sock_sendmsg+0x78/0x98)
> [<c022af98>] (sock_sendmsg+0x78/0x98) from [<c022bb50>] (___sys_sendmsg.part.25+0x268/0x278)
> [<c022bb50>] (___sys_sendmsg.part.25+0x268/0x278) from [<c022cf08>] (__sys_sendmsg+0x48/0x78)
> [<c022cf08>] (__sys_sendmsg+0x48/0x78) from [<c0009320>] (ret_fast_syscall+0x0/0x2c)
This patch works around the problem by replacing yield() by msleep(1), giving
the interrupt thread time to finish, similar to other changes contained in the
rt patch set. Using wait_for_completion() instead would probably be a better
solution.
Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
net/sched/sch_generic.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -1215,7 +1215,7 @@ void dev_deactivate_many(struct list_hea
/* Wait for outstanding qdisc_run calls. */
list_for_each_entry(dev, head, close_list) {
while (some_qdisc_is_busy(dev))
- yield();
+ msleep(1);
/* The new qdisc is assigned at this point so we can safely
* unwind stale skb lists and qdisc statistics
*/

View File

@@ -0,0 +1,105 @@
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Sat, 30 Nov 2019 17:54:33 -0800
Subject: [PATCH] mm/vmalloc: remove preempt_disable/enable when doing
preloading
Some background: preemption was previously disabled to guarantee that a
preloaded object is available for the CPU it was stored for. That was
achieved by combining disabling preemption with taking the spin
lock while ne_fit_preload_node is checked.
The aim was to not allocate in atomic context when spinlock is taken
later, for regular vmap allocations. But that approach conflicts with
CONFIG_PREEMPT_RT philosophy. It means that calling spin_lock() with
disabled preemption is forbidden in the CONFIG_PREEMPT_RT kernel.
Therefore, get rid of preempt_disable() and preempt_enable() when the
preload is done for splitting purpose. As a result we do not guarantee
now that a CPU is preloaded, instead we minimize the case when it is
not, with this change, by populating the per cpu preload pointer under
the vmap_area_lock.
This implies that at least each caller that has done the preallocation
will not fallback to an atomic allocation later. It is possible that
the preallocation would be pointless or that no preallocation is done
because of the race but the data shows that this is really rare.
For example, I ran a special test case that follows the preload pattern
and path: 20 "unbind" threads run it and each does 1000000 allocations.
Only 3.5 times among 1000000 allocations was a CPU not preloaded. So it
can happen, but the number is negligible.
[mhocko@suse.com: changelog additions]
Link: http://lkml.kernel.org/r/20191016095438.12391-1-urezki@gmail.com
Fixes: 82dd23e84be3 ("mm/vmalloc.c: preload a CPU with one object for split purpose")
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Daniel Wagner <dwagner@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/vmalloc.c | 37 ++++++++++++++++++++-----------------
1 file changed, 20 insertions(+), 17 deletions(-)
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1077,31 +1077,34 @@ static struct vmap_area *alloc_vmap_area
retry:
/*
- * Preload this CPU with one extra vmap_area object to ensure
- * that we have it available when fit type of free area is
- * NE_FIT_TYPE.
+ * Preload this CPU with one extra vmap_area object. It is used
+ * when fit type of free area is NE_FIT_TYPE. Please note, it
+ * does not guarantee that an allocation occurs on a CPU that
+ * is preloaded, instead we minimize the case when it is not.
+ * It can happen because of cpu migration, because there is a
+ * race until the below spinlock is taken.
*
* The preload is done in non-atomic context, thus it allows us
* to use more permissive allocation masks to be more stable under
- * low memory condition and high memory pressure.
+ * low memory condition and high memory pressure. In rare case,
+ * if not preloaded, GFP_NOWAIT is used.
*
- * Even if it fails we do not really care about that. Just proceed
- * as it is. "overflow" path will refill the cache we allocate from.
+ * Set "pva" to NULL here, because of "retry" path.
*/
- preempt_disable();
- if (!__this_cpu_read(ne_fit_preload_node)) {
- preempt_enable();
- pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
- preempt_disable();
+ pva = NULL;
- if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
- if (pva)
- kmem_cache_free(vmap_area_cachep, pva);
- }
- }
+ if (!this_cpu_read(ne_fit_preload_node))
+ /*
+ * Even if it fails we do not really care about that.
+ * Just proceed as it is. If needed "overflow" path
+ * will refill the cache we allocate from.
+ */
+ pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
spin_lock(&vmap_area_lock);
- preempt_enable();
+
+ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva))
+ kmem_cache_free(vmap_area_cachep, pva);
/*
* If an allocation fails, the "vend" address is
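The resulting preload pattern, reduced to a sketch (the slot, cache and lock names are illustrative, not the real vmalloc.c symbols): allocate the spare object outside the lock without disabling preemption, then install it under the lock and free it again if another task preloaded this CPU first.

#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>

static DEFINE_SPINLOCK(area_lock);
static DEFINE_PER_CPU(struct vmap_area *, preload_slot);
static struct kmem_cache *area_cache;

static void preload_then_lock(int node)
{
        struct vmap_area *spare = NULL;

        /* preemptible: the task may migrate before the lock is taken */
        if (!this_cpu_read(preload_slot))
                spare = kmem_cache_alloc_node(area_cache, GFP_KERNEL, node);

        spin_lock(&area_lock);
        if (spare && __this_cpu_cmpxchg(preload_slot, NULL, spare))
                kmem_cache_free(area_cache, spare);     /* lost the race */
        /* ... the actual allocation proceeds here under area_lock ... */
        spin_unlock(&area_lock);
}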

View File

@@ -0,0 +1,46 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 13 Aug 2019 14:29:41 +0200
Subject: [PATCH] KVM: arm/arm64: Let the timer expire in hardirq context
on RT
The timers are canceled from a preempt-notifier which is invoked with
preemption disabled, which is not allowed on PREEMPT_RT.
The timer callback is short, so it can be invoked in hard-IRQ context
on -RT.
Let the timer expire in hard-IRQ context even on -RT.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <maz@kernel.org>
Tested-by: Julien Grall <julien.grall@arm.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
virt/kvm/arm/arch_timer.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -80,7 +80,7 @@ static inline bool userspace_irqchip(str
static void soft_timer_start(struct hrtimer *hrt, u64 ns)
{
hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_HARD);
}
static void soft_timer_cancel(struct hrtimer *hrt)
@@ -697,11 +697,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu
update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
ptimer->cntvoff = 0;
- hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
timer->bg_timer.function = kvm_bg_timer_expire;
- hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
vtimer->hrtimer.function = kvm_hrtimer_expire;
ptimer->hrtimer.function = kvm_hrtimer_expire;

View File

@ -0,0 +1,55 @@
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 9 Mar 2020 18:15:29 +0000
Subject: [PATCH] time/sched_clock: Expire timer in hardirq context
To minimize latency, PREEMPT_RT kernels expire hrtimers in preemptible
softirq context by default. This can be overridden by marking the timer's
expiry with HRTIMER_MODE_HARD.
sched_clock_timer is missing this annotation: if its callback is preempted
and the duration of the preemption exceeds the wrap around time of the
underlying clocksource, sched clock will get out of sync.
Mark the sched_clock_timer for expiry in hard interrupt context.
Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200309181529.26558-1-a.darwish@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/time/sched_clock.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -207,7 +207,8 @@ sched_clock_register(u64 (*read)(void),
if (sched_clock_timer.function != NULL) {
/* update timeout for clock wrap */
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt,
+ HRTIMER_MODE_REL_HARD);
}
r = rate;
@@ -251,9 +252,9 @@ void __init generic_sched_clock_init(voi
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
sched_clock_timer.function = sched_clock_poll;
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
/*
@@ -290,7 +291,7 @@ void sched_clock_resume(void)
struct clock_read_data *rd = &cd.read_data[0];
rd->epoch_cyc = cd.actual_read_sched_clock();
- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
+ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
rd->read_sched_clock = cd.actual_read_sched_clock;
}

View File

@ -0,0 +1,393 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:39 +0100
Subject: [PATCH 01/25] printk-rb: add printk ring buffer documentation
The full documentation file for the printk ring buffer.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
Documentation/printk-ringbuffer.txt | 377 ++++++++++++++++++++++++++++++++++++
1 file changed, 377 insertions(+)
create mode 100644 Documentation/printk-ringbuffer.txt
--- /dev/null
+++ b/Documentation/printk-ringbuffer.txt
@@ -0,0 +1,377 @@
+struct printk_ringbuffer
+------------------------
+John Ogness <john.ogness@linutronix.de>
+
+Overview
+~~~~~~~~
+As the name suggests, this ring buffer was implemented specifically to serve
+the needs of the printk() infrastructure. The ring buffer itself is not
+specific to printk and could be used for other purposes. _However_, the
+requirements and semantics of printk are rather unique. If you intend to use
+this ring buffer for anything other than printk, you need to be very clear on
+its features, behavior, and pitfalls.
+
+Features
+^^^^^^^^
+The printk ring buffer has the following features:
+
+- single global buffer
+- resides in initialized data section (available at early boot)
+- lockless readers
+- supports multiple writers
+- supports multiple non-consuming readers
+- safe from any context (including NMI)
+- groups bytes into variable length blocks (referenced by entries)
+- entries tagged with sequence numbers
+
+Behavior
+^^^^^^^^
+Since the printk ring buffer readers are lockless, there exists no
+synchronization between readers and writers. Basically writers are the tasks
+in control and may overwrite any and all committed data at any time and from
+any context. For this reason readers can miss entries if they are overwritten
+before the reader was able to access the data. The reader API implementation
+is such that reader access to entries is atomic, so there is no risk of
+readers having to deal with partial or corrupt data. Also, entries are
+tagged with sequence numbers so readers can recognize if entries were missed.
+
+Writing to the ring buffer consists of 2 steps. First a writer must reserve
+an entry of desired size. After this step the writer has exclusive access
+to the memory region. Once the data has been written to memory, it needs to
+be committed to the ring buffer. After this step the entry has been inserted
+into the ring buffer and assigned an appropriate sequence number.
+
+Once committed, a writer must no longer access the data directly. This is
+because the data may have been overwritten and no longer exists. If a
+writer must access the data, it should either keep a private copy before
+committing the entry or use the reader API to gain access to the data.
+
+Because of how the data backend is implemented, entries that have been
+reserved but not yet committed act as barriers, preventing future writers
+from filling the ring buffer beyond the location of the reserved but not
+yet committed entry region. For this reason it is *important* that writers
+perform both reserve and commit as quickly as possible. Also, be aware that
+preemption and local interrupts are disabled and writing to the ring buffer
+is processor-reentrant locked during the reserve/commit window. Writers in
+NMI contexts can still preempt any other writers, but as long as these
+writers do not write a large amount of data with respect to the ring buffer
+size, this should not become an issue.
+
+API
+~~~
+
+Declaration
+^^^^^^^^^^^
+The printk ring buffer can be instantiated as a static structure:
+
+ /* declare a static struct printk_ringbuffer */
+ #define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr)
+
+The value of szbits specifies the size of the ring buffer as a power of 2
+(the buffer holds 1 << szbits bytes). The cpulockptr field is a pointer to a
+prb_cpulock struct that is used to perform processor-reentrant spin locking
+for the writers. It is specified externally because it may be used for
+multiple ring buffers (or other code) to synchronize writers without risk of
+deadlock.
+
+Here is an example of a declaration of a printk ring buffer specifying a
+32KB (2^15) ring buffer:
+
+....
+DECLARE_STATIC_PRINTKRB_CPULOCK(rb_cpulock);
+DECLARE_STATIC_PRINTKRB(rb, 15, &rb_cpulock);
+....
+
+If writers will be using multiple ring buffers and the ordering of that usage
+is not clear, the same prb_cpulock should be used for all of those ring buffers.
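+
+For example, two ring buffers sharing the same lock could be declared as
+follows (a sketch; the names and sizes are illustrative only):
+
+....
+DECLARE_STATIC_PRINTKRB_CPULOCK(shared_cpulock);
+DECLARE_STATIC_PRINTKRB(rb_a, 15, &shared_cpulock);
+DECLARE_STATIC_PRINTKRB(rb_b, 12, &shared_cpulock);
+....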
+
+Writer API
+^^^^^^^^^^
+The writer API consists of 2 functions. The first is to reserve an entry in
+the ring buffer, the second is to commit that data to the ring buffer. The
+reserved entry information is stored within a provided `struct prb_handle`.
+
+ /* reserve an entry */
+ char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb,
+ unsigned int size);
+
+ /* commit a reserved entry to the ring buffer */
+ void prb_commit(struct prb_handle *h);
+
+Here is an example of a function to write data to a ring buffer:
+
+....
+int write_data(struct printk_ringbuffer *rb, char *data, int size)
+{
+ struct prb_handle h;
+ char *buf;
+
+ buf = prb_reserve(&h, rb, size);
+ if (!buf)
+ return -1;
+ memcpy(buf, data, size);
+ prb_commit(&h);
+
+ return 0;
+}
+....
+
+Pitfalls
+++++++++
+Be aware that prb_reserve() can fail. A retry might be successful, but it
+depends entirely on whether or not the next part of the ring buffer to
+overwrite belongs to reserved but not yet committed entries of other writers.
+Writers can use the prb_inc_lost() function to allow readers to notice that a
+message was lost.
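+
+For example, a writer might retry the reservation a bounded number of times
+and account for the lost record on failure (a minimal sketch; the retry
+policy is illustrative only):
+
+....
+int write_or_drop(struct printk_ringbuffer *rb, char *data, int size)
+{
+	struct prb_handle h;
+	char *buf;
+	int try;
+
+	for (try = 0; try < 3; try++) {
+		buf = prb_reserve(&h, rb, size);
+		if (buf) {
+			memcpy(buf, data, size);
+			prb_commit(&h);
+			return 0;
+		}
+	}
+
+	/* could not store the record; make the gap visible to readers */
+	prb_inc_lost(rb);
+	return -1;
+}
+....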
+
+Reader API
+^^^^^^^^^^
+The reader API utilizes a `struct prb_iterator` to track the reader's
+position in the ring buffer.
+
+ /* declare a pre-initialized static iterator for a ring buffer */
+ #define DECLARE_STATIC_PRINTKRB_ITER(name, rbaddr)
+
+ /* initialize iterator for a ring buffer (if static macro NOT used) */
+ void prb_iter_init(struct prb_iterator *iter,
+ struct printk_ringbuffer *rb, u64 *seq);
+
+ /* make a deep copy of an iterator */
+ void prb_iter_copy(struct prb_iterator *dest,
+ struct prb_iterator *src);
+
+ /* non-blocking, advance to next entry (and read the data) */
+ int prb_iter_next(struct prb_iterator *iter, char *buf,
+ int size, u64 *seq);
+
+ /* blocking, advance to next entry (and read the data) */
+ int prb_iter_wait_next(struct prb_iterator *iter, char *buf,
+ int size, u64 *seq);
+
+ /* position iterator at the entry seq */
+ int prb_iter_seek(struct prb_iterator *iter, u64 seq);
+
+ /* read data at current position */
+ int prb_iter_data(struct prb_iterator *iter, char *buf,
+ int size, u64 *seq);
+
+Typically prb_iter_data() is not needed because the data can be retrieved
+directly with prb_iter_next().
+
+Here is an example of a non-blocking function that will read all the data in
+a ring buffer:
+
+....
+void read_all_data(struct printk_ringbuffer *rb, char *buf, int size)
+{
+ struct prb_iterator iter;
+ u64 prev_seq = 0;
+ u64 seq;
+ int ret;
+
+ prb_iter_init(&iter, rb, NULL);
+
+ for (;;) {
+ ret = prb_iter_next(&iter, buf, size, &seq);
+ if (ret > 0) {
+ if (seq != ++prev_seq) {
+ /* "seq - prev_seq" entries missed */
+ prev_seq = seq;
+ }
+ /* process buf here */
+ } else if (ret == 0) {
+ /* hit the end, done */
+ break;
+ } else if (ret < 0) {
+ /*
+ * iterator is invalid, a writer overtook us, reset the
+ * iterator and keep going, entries were missed
+ */
+ prb_iter_init(&iter, rb, NULL);
+ }
+ }
+}
+....
+
+Pitfalls
+++++++++
+The reader's iterator can become invalid at any time because the reader was
+overtaken by a writer. Typically the reader should reset the iterator back
+to the current oldest entry (which will be newer than the entry the reader
+was at) and continue, noting the number of entries that were missed.
+
+Utility API
+^^^^^^^^^^^
+Several functions are available as a convenience for external code.
+
+ /* query the size of the data buffer */
+ int prb_buffer_size(struct printk_ringbuffer *rb);
+
+ /* skip a seq number to signify a lost record */
+ void prb_inc_lost(struct printk_ringbuffer *rb);
+
+ /* processor-reentrant spin lock */
+ void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
+
+ /* processor-reentrant spin unlock */
+ void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store);
+
+Pitfalls
+++++++++
+Although the value returned by prb_buffer_size() does represent an absolute
+upper bound, the amount of data that can be stored within the ring buffer
+is actually less because of the additional storage space of a header for each
+entry.
+
+The prb_lock() and prb_unlock() functions can be used to synchronize between
+ring buffer writers and other external activities. The function of a
+processor-reentrant spin lock is to disable preemption and local interrupts
+and synchronize against other processors. It does *not* protect against
+multiple contexts of a single processor, i.e. NMI.
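+
+For example, external code that must not interleave with the ring buffer
+writers can wrap its own critical section in the same lock (a minimal sketch;
+the write_emergency() call is a placeholder, not an existing function):
+
+....
+void emergency_output(struct prb_cpulock *cpu_lock, const char *text, int len)
+{
+	unsigned int cpu_store;
+
+	prb_lock(cpu_lock, &cpu_store);
+	/* synchronized against all writers using the same prb_cpulock */
+	write_emergency(text, len);
+	prb_unlock(cpu_lock, cpu_store);
+}
+....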
+
+Implementation
+~~~~~~~~~~~~~~
+This section describes several of the implementation concepts and details to
+help developers better understand the code.
+
+Entries
+^^^^^^^
+All ring buffer data is stored within a single static byte array. The reason
+for this is to ensure that any pointers to the data (past and present) will
+always point to valid memory. This is important because the lockless readers
+may be preempted for long periods of time and when they resume may be working
+with expired pointers.
+
+Entries are identified by start index and size. (The start index plus size
+is the start index of the next entry.) The start index is not simply an
+offset into the byte array, but rather a logical position (lpos) that maps
+directly to byte array offsets.
+
+For example, for a byte array of 1000, an entry may have a start index
+of 100. Another entry may have a start index of 1100. And yet another, 2100.
+All of these entries point to the same memory region, but only the most
+recent entry is valid. The other entries point to valid memory, but
+represent entries that have been overwritten.
+
+Note that due to overflowing, the most recent entry is not necessarily the one
+with the highest lpos value. Indeed, the printk ring buffer initializes its
+data such that an overflow happens relatively quickly in order to validate the
+handling of this situation. The implementation assumes that an lpos (unsigned
+long) will never completely wrap while a reader is preempted. If this were to
+become an issue, the seq number (which never wraps) could be used to increase
+the robustness of handling this situation.
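+
+Because the declared buffer size is a power of 2 (1 << szbits bytes), the
+mapping from an lpos to a byte array offset is a simple mask (a sketch of the
+idea; the in-kernel macros may differ in detail):
+
+....
+/* offset within a byte array of size (1 << size_bits) */
+#define LPOS_TO_OFFSET(lpos, size_bits) ((lpos) & ((1UL << (size_bits)) - 1))
+....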
+
+Buffer Wrapping
+^^^^^^^^^^^^^^^
+If an entry starts near the end of the byte array but would extend beyond it,
+a special terminating entry (size = -1) is inserted into the byte array and
+the real entry is placed at the beginning of the byte array. This can waste
+space at the end of the byte array, but simplifies the implementation by
+allowing writers to always work with contiguous buffers.
+
+Note that the size field is the first 4 bytes of the entry header. Also note
+that calc_next() always ensures that there are at least 4 bytes left at the
+end of the byte array to allow room for a terminating entry.
+
+Ring Buffer Pointers
+^^^^^^^^^^^^^^^^^^^^
+Three pointers (lpos values) are used to manage the ring buffer:
+
+ - _tail_: points to the oldest entry
+ - _head_: points to where the next new committed entry will be
+ - _reserve_: points to where the next new reserved entry will be
+
+These pointers always maintain a logical ordering:
+
+ tail <= head <= reserve
+
+The reserve pointer moves forward when a writer reserves a new entry. The
+head pointer moves forward when a writer commits a new entry.
+
+The reserve pointer cannot overwrite the tail pointer in a wrap situation. In
+such a situation, the tail pointer must be "pushed forward", thus
+invalidating that oldest entry. Readers identify if they are accessing a
+valid entry by ensuring their entry pointer is `>= tail && < head`.
+
+If the tail pointer is equal to the head pointer, it cannot be pushed and any
+reserve operation will fail. The only resolution is for writers to commit
+their reserved entries.
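+
+The validity check can be expressed with wrap-safe unsigned arithmetic
+relative to the tail (a sketch of the idea, not the exact in-kernel helper):
+
+....
+/* valid if: tail <= lpos < head, tolerating lpos wrap-around */
+bool lpos_is_valid(unsigned long tail, unsigned long head, unsigned long lpos)
+{
+	return (lpos - tail) < (head - tail);
+}
+....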
+
+Processor-Reentrant Locking
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+The purpose of the processor-reentrant locking is to limit the interruption
+scenarios of writers to 2 contexts. This allows for a simplified
+implementation where:
+
+- The reserve/commit window only exists on 1 processor at a time. A reserve
+ can never fail due to uncommitted entries of other processors.
+
+- When committing entries, it is trivial to handle the situation when
+ subsequent entries have already been committed, i.e. managing the head
+ pointer.
+
+Performance
+~~~~~~~~~~~
+Some basic tests were performed on a quad Intel(R) Xeon(R) CPU E5-2697 v4 at
+2.30GHz (36 cores / 72 threads). All tests involved writing a total of
+32,000,000 records at an average of 33 bytes each. Each writer was pinned to
+its own CPU and would write as fast as it could until a total of 32,000,000
+records were written. All tests involved 2 readers that were both pinned
+together to another CPU. Each reader would read as fast as it could and track
+how many of the 32,000,000 records it could read. All tests used a ring buffer
+of 16KB in size, which holds around 350 records (header + data for each
+entry).
+
+The only difference between the tests is the number of writers (and thus also
+the number of records per writer). As more writers are added, the time to
+write a record increases. This is because data pointers, modified via cmpxchg,
+and global data access in general become more contended.
+
+1 writer
+^^^^^^^^
+ runtime: 0m 18s
+ reader1: 16219900/32000000 (50%) records
+ reader2: 16141582/32000000 (50%) records
+
+2 writers
+^^^^^^^^^
+ runtime: 0m 32s
+ reader1: 16327957/32000000 (51%) records
+ reader2: 16313988/32000000 (50%) records
+
+4 writers
+^^^^^^^^^
+ runtime: 0m 42s
+ reader1: 16421642/32000000 (51%) records
+ reader2: 16417224/32000000 (51%) records
+
+8 writers
+^^^^^^^^^
+ runtime: 0m 43s
+ reader1: 16418300/32000000 (51%) records
+ reader2: 16432222/32000000 (51%) records
+
+16 writers
+^^^^^^^^^^
+ runtime: 0m 54s
+ reader1: 16539189/32000000 (51%) records
+ reader2: 16542711/32000000 (51%) records
+
+32 writers
+^^^^^^^^^^
+ runtime: 1m 13s
+ reader1: 16731808/32000000 (52%) records
+ reader2: 16735119/32000000 (52%) records
+
+Comments
+^^^^^^^^
+It is particularly interesting to compare/contrast the 1-writer and 32-writer
+tests. Despite the writing of the 32,000,000 records taking over 4 times
+longer, the readers (which perform no cmpxchg) were still unable to keep up.
+This shows that the memory contention between the increasing number of CPUs
+also has a dramatic effect on readers.
+
+It should also be noted that in all cases each reader was able to read >=50%
+of the records. This means that a single reader would have been able to keep
+up with the writer(s) in all cases, becoming slightly easier as more writers
+are added. This was the purpose of pinning 2 readers to 1 CPU: to observe how
+maximum reader performance changes.

View File

@ -0,0 +1,158 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:40 +0100
Subject: [PATCH 02/25] printk-rb: add prb locking functions
Add processor-reentrant spin locking functions. These allow
restricting the number of possible contexts to 2, which can simplify
implementing code that also supports NMI interruptions.
prb_lock();
/*
* This code is synchronized with all contexts
* except an NMI on the same processor.
*/
prb_unlock();
In order to support printk's emergency messages, a
processor-reentrant spin lock will be used to control raw access to
the emergency console. However, it must be the same
processor-reentrant spin lock as the one used by the ring buffer,
otherwise a deadlock can occur:
CPU1: printk lock -> emergency -> serial lock
CPU2: serial lock -> printk lock
By making the processor-reentrant implementation available externally,
printk can use the same atomic_t for the ring buffer as for the
emergency console and thus avoid the above deadlock.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/printk_ringbuffer.h | 24 +++++++++++
lib/Makefile | 2
lib/printk_ringbuffer.c | 77 ++++++++++++++++++++++++++++++++++++++
3 files changed, 102 insertions(+), 1 deletion(-)
create mode 100644 include/linux/printk_ringbuffer.h
create mode 100644 lib/printk_ringbuffer.c
--- /dev/null
+++ b/include/linux/printk_ringbuffer.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PRINTK_RINGBUFFER_H
+#define _LINUX_PRINTK_RINGBUFFER_H
+
+#include <linux/atomic.h>
+#include <linux/percpu.h>
+
+struct prb_cpulock {
+ atomic_t owner;
+ unsigned long __percpu *irqflags;
+};
+
+#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \
+static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \
+static struct prb_cpulock name = { \
+ .owner = ATOMIC_INIT(-1), \
+ .irqflags = &_##name##_percpu_irqflags, \
+}
+
+/* utility functions */
+void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
+void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store);
+
+#endif /*_LINUX_PRINTK_RINGBUFFER_H */
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -26,7 +26,7 @@ endif
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o timerqueue.o xarray.o \
- idr.o extable.o \
+ idr.o extable.o printk_ringbuffer.o \
sha1.o chacha.o irq_regs.o argv_split.o \
flex_proportions.o ratelimit.o show_mem.o \
is_single_threaded.o plist.o decompress.o kobject_uevent.o \
--- /dev/null
+++ b/lib/printk_ringbuffer.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/smp.h>
+#include <linux/printk_ringbuffer.h>
+
+static bool __prb_trylock(struct prb_cpulock *cpu_lock,
+ unsigned int *cpu_store)
+{
+ unsigned long *flags;
+ unsigned int cpu;
+
+ cpu = get_cpu();
+
+ *cpu_store = atomic_read(&cpu_lock->owner);
+ /* memory barrier to ensure the current lock owner is visible */
+ smp_rmb();
+ if (*cpu_store == -1) {
+ flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
+ local_irq_save(*flags);
+ if (atomic_try_cmpxchg_acquire(&cpu_lock->owner,
+ cpu_store, cpu)) {
+ return true;
+ }
+ local_irq_restore(*flags);
+ } else if (*cpu_store == cpu) {
+ return true;
+ }
+
+ put_cpu();
+ return false;
+}
+
+/*
+ * prb_lock: Perform a processor-reentrant spin lock.
+ * @cpu_lock: A pointer to the lock object.
+ * @cpu_store: A "flags" pointer to store lock status information.
+ *
+ * If no processor has the lock, the calling processor takes the lock and
+ * becomes the owner. If the calling processor is already the owner of the
+ * lock, this function succeeds immediately. If lock is locked by another
+ * processor, this function spins until the calling processor becomes the
+ * owner.
+ *
+ * It is safe to call this function from any context and state.
+ */
+void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store)
+{
+ for (;;) {
+ if (__prb_trylock(cpu_lock, cpu_store))
+ break;
+ cpu_relax();
+ }
+}
+
+/*
+ * prb_unlock: Perform a processor-reentrant spin unlock.
+ * @cpu_lock: A pointer to the lock object.
+ * @cpu_store: A "flags" object storing lock status information.
+ *
+ * Release the lock. The calling processor must be the owner of the lock.
+ *
+ * It is safe to call this function from any context and state.
+ */
+void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store)
+{
+ unsigned long *flags;
+ unsigned int cpu;
+
+ cpu = atomic_read(&cpu_lock->owner);
+ atomic_set_release(&cpu_lock->owner, cpu_store);
+
+ if (cpu_store == -1) {
+ flags = per_cpu_ptr(cpu_lock->irqflags, cpu);
+ local_irq_restore(*flags);
+ }
+
+ put_cpu();
+}

View File

@ -0,0 +1,57 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:41 +0100
Subject: [PATCH 03/25] printk-rb: define ring buffer struct and initializer
See Documentation/printk-ringbuffer.txt for details about the
initializer arguments.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/printk_ringbuffer.h | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
--- a/include/linux/printk_ringbuffer.h
+++ b/include/linux/printk_ringbuffer.h
@@ -10,6 +10,20 @@ struct prb_cpulock {
unsigned long __percpu *irqflags;
};
+struct printk_ringbuffer {
+ void *buffer;
+ unsigned int size_bits;
+
+ u64 seq;
+
+ atomic_long_t tail;
+ atomic_long_t head;
+ atomic_long_t reserve;
+
+ struct prb_cpulock *cpulock;
+ atomic_t ctx;
+};
+
#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \
static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \
static struct prb_cpulock name = { \
@@ -17,6 +31,20 @@ static struct prb_cpulock name = { \
.irqflags = &_##name##_percpu_irqflags, \
}
+#define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) \
+static char _##name##_buffer[1 << (szbits)] \
+ __aligned(__alignof__(long)); \
+static struct printk_ringbuffer name = { \
+ .buffer = &_##name##_buffer[0], \
+ .size_bits = szbits, \
+ .seq = 0, \
+ .tail = ATOMIC_LONG_INIT(-111 * sizeof(long)), \
+ .head = ATOMIC_LONG_INIT(-111 * sizeof(long)), \
+ .reserve = ATOMIC_LONG_INIT(-111 * sizeof(long)), \
+ .cpulock = cpulockptr, \
+ .ctx = ATOMIC_INIT(0), \
+}
+
/* utility functions */
void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store);

View File

@ -0,0 +1,233 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:42 +0100
Subject: [PATCH 04/25] printk-rb: add writer interface
Add the writer functions prb_reserve() and prb_commit(). These make
use of processor-reentrant spin locks to limit the number of possible
interruption scenarios for the writers.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/printk_ringbuffer.h | 17 +++
lib/printk_ringbuffer.c | 172 ++++++++++++++++++++++++++++++++++++++
2 files changed, 189 insertions(+)
--- a/include/linux/printk_ringbuffer.h
+++ b/include/linux/printk_ringbuffer.h
@@ -24,6 +24,18 @@ struct printk_ringbuffer {
atomic_t ctx;
};
+struct prb_entry {
+ unsigned int size;
+ u64 seq;
+ char data[0];
+};
+
+struct prb_handle {
+ struct printk_ringbuffer *rb;
+ unsigned int cpu;
+ struct prb_entry *entry;
+};
+
#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \
static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \
static struct prb_cpulock name = { \
@@ -45,6 +57,11 @@ static struct printk_ringbuffer name = {
.ctx = ATOMIC_INIT(0), \
}
+/* writer interface */
+char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb,
+ unsigned int size);
+void prb_commit(struct prb_handle *h);
+
/* utility functions */
void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store);
--- a/lib/printk_ringbuffer.c
+++ b/lib/printk_ringbuffer.c
@@ -2,6 +2,14 @@
#include <linux/smp.h>
#include <linux/printk_ringbuffer.h>
+#define PRB_SIZE(rb) (1 << rb->size_bits)
+#define PRB_SIZE_BITMASK(rb) (PRB_SIZE(rb) - 1)
+#define PRB_INDEX(rb, lpos) (lpos & PRB_SIZE_BITMASK(rb))
+#define PRB_WRAPS(rb, lpos) (lpos >> rb->size_bits)
+#define PRB_WRAP_LPOS(rb, lpos, xtra) \
+ ((PRB_WRAPS(rb, lpos) + xtra) << rb->size_bits)
+#define PRB_DATA_ALIGN sizeof(long)
+
static bool __prb_trylock(struct prb_cpulock *cpu_lock,
unsigned int *cpu_store)
{
@@ -75,3 +83,167 @@ void prb_unlock(struct prb_cpulock *cpu_
put_cpu();
}
+
+static struct prb_entry *to_entry(struct printk_ringbuffer *rb,
+ unsigned long lpos)
+{
+ char *buffer = rb->buffer;
+ buffer += PRB_INDEX(rb, lpos);
+ return (struct prb_entry *)buffer;
+}
+
+static int calc_next(struct printk_ringbuffer *rb, unsigned long tail,
+ unsigned long lpos, int size, unsigned long *calced_next)
+{
+ unsigned long next_lpos;
+ int ret = 0;
+again:
+ next_lpos = lpos + size;
+ if (next_lpos - tail > PRB_SIZE(rb))
+ return -1;
+
+ if (PRB_WRAPS(rb, lpos) != PRB_WRAPS(rb, next_lpos)) {
+ lpos = PRB_WRAP_LPOS(rb, next_lpos, 0);
+ ret |= 1;
+ goto again;
+ }
+
+ *calced_next = next_lpos;
+ return ret;
+}
+
+static bool push_tail(struct printk_ringbuffer *rb, unsigned long tail)
+{
+ unsigned long new_tail;
+ struct prb_entry *e;
+ unsigned long head;
+
+ if (tail != atomic_long_read(&rb->tail))
+ return true;
+
+ e = to_entry(rb, tail);
+ if (e->size != -1)
+ new_tail = tail + e->size;
+ else
+ new_tail = PRB_WRAP_LPOS(rb, tail, 1);
+
+ /* make sure the new tail does not overtake the head */
+ head = atomic_long_read(&rb->head);
+ if (head - new_tail > PRB_SIZE(rb))
+ return false;
+
+ atomic_long_cmpxchg(&rb->tail, tail, new_tail);
+ return true;
+}
+
+/*
+ * prb_commit: Commit a reserved entry to the ring buffer.
+ * @h: An entry handle referencing the data entry to commit.
+ *
+ * Commit data that has been reserved using prb_reserve(). Once the data
+ * block has been committed, it can be invalidated at any time. If a writer
+ * is interested in using the data after committing, the writer should make
+ * its own copy first or use the prb_iter_ reader functions to access the
+ * data in the ring buffer.
+ *
+ * It is safe to call this function from any context and state.
+ */
+void prb_commit(struct prb_handle *h)
+{
+ struct printk_ringbuffer *rb = h->rb;
+ struct prb_entry *e;
+ unsigned long head;
+ unsigned long res;
+
+ for (;;) {
+ if (atomic_read(&rb->ctx) != 1) {
+ /* the interrupted context will fixup head */
+ atomic_dec(&rb->ctx);
+ break;
+ }
+ /* assign sequence numbers before moving head */
+ head = atomic_long_read(&rb->head);
+ res = atomic_long_read(&rb->reserve);
+ while (head != res) {
+ e = to_entry(rb, head);
+ if (e->size == -1) {
+ head = PRB_WRAP_LPOS(rb, head, 1);
+ continue;
+ }
+ e->seq = ++rb->seq;
+ head += e->size;
+ }
+ atomic_long_set_release(&rb->head, res);
+ atomic_dec(&rb->ctx);
+
+ if (atomic_long_read(&rb->reserve) == res)
+ break;
+ atomic_inc(&rb->ctx);
+ }
+
+ prb_unlock(rb->cpulock, h->cpu);
+}
+
+/*
+ * prb_reserve: Reserve an entry within a ring buffer.
+ * @h: An entry handle to be setup and reference an entry.
+ * @rb: A ring buffer to reserve data within.
+ * @size: The number of bytes to reserve.
+ *
+ * Reserve an entry of at least @size bytes to be used by the caller. If
+ * successful, the data region of the entry belongs to the caller and cannot
+ * be invalidated by any other task/context. For this reason, the caller
+ * should call prb_commit() as quickly as possible in order to avoid preventing
+ * other tasks/contexts from reserving data in the case that the ring buffer
+ * has wrapped.
+ *
+ * It is safe to call this function from any context and state.
+ *
+ * Returns a pointer to the reserved entry (and @h is setup to reference that
+ * entry) or NULL if it was not possible to reserve data.
+ */
+char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb,
+ unsigned int size)
+{
+ unsigned long tail, res1, res2;
+ int ret;
+
+ if (size == 0)
+ return NULL;
+ size += sizeof(struct prb_entry);
+ size += PRB_DATA_ALIGN - 1;
+ size &= ~(PRB_DATA_ALIGN - 1);
+ if (size >= PRB_SIZE(rb))
+ return NULL;
+
+ h->rb = rb;
+ prb_lock(rb->cpulock, &h->cpu);
+
+ atomic_inc(&rb->ctx);
+
+ do {
+ for (;;) {
+ tail = atomic_long_read(&rb->tail);
+ res1 = atomic_long_read(&rb->reserve);
+ ret = calc_next(rb, tail, res1, size, &res2);
+ if (ret >= 0)
+ break;
+ if (!push_tail(rb, tail)) {
+ prb_commit(h);
+ return NULL;
+ }
+ }
+ } while (!atomic_long_try_cmpxchg_acquire(&rb->reserve, &res1, res2));
+
+ h->entry = to_entry(rb, res1);
+
+ if (ret) {
+ /* handle wrap */
+ h->entry->size = -1;
+ h->entry = to_entry(rb, PRB_WRAP_LPOS(rb, res2, 0));
+ }
+
+ h->entry->size = size;
+
+ return &h->entry->data[0];
+}

View File

@ -0,0 +1,259 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:43 +0100
Subject: [PATCH 05/25] printk-rb: add basic non-blocking reading interface
Add reader iterator static declaration/initializer, dynamic
initializer, and functions to iterate and retrieve ring buffer data.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/printk_ringbuffer.h | 20 ++++
lib/printk_ringbuffer.c | 190 ++++++++++++++++++++++++++++++++++++++
2 files changed, 210 insertions(+)
--- a/include/linux/printk_ringbuffer.h
+++ b/include/linux/printk_ringbuffer.h
@@ -43,6 +43,19 @@ static struct prb_cpulock name = { \
.irqflags = &_##name##_percpu_irqflags, \
}
+#define PRB_INIT ((unsigned long)-1)
+
+#define DECLARE_STATIC_PRINTKRB_ITER(name, rbaddr) \
+static struct prb_iterator name = { \
+ .rb = rbaddr, \
+ .lpos = PRB_INIT, \
+}
+
+struct prb_iterator {
+ struct printk_ringbuffer *rb;
+ unsigned long lpos;
+};
+
#define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) \
static char _##name##_buffer[1 << (szbits)] \
__aligned(__alignof__(long)); \
@@ -62,6 +75,13 @@ char *prb_reserve(struct prb_handle *h,
unsigned int size);
void prb_commit(struct prb_handle *h);
+/* reader interface */
+void prb_iter_init(struct prb_iterator *iter, struct printk_ringbuffer *rb,
+ u64 *seq);
+void prb_iter_copy(struct prb_iterator *dest, struct prb_iterator *src);
+int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq);
+int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq);
+
/* utility functions */
void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store);
--- a/lib/printk_ringbuffer.c
+++ b/lib/printk_ringbuffer.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/smp.h>
+#include <linux/string.h>
+#include <linux/errno.h>
#include <linux/printk_ringbuffer.h>
#define PRB_SIZE(rb) (1 << rb->size_bits)
@@ -8,6 +10,7 @@
#define PRB_WRAPS(rb, lpos) (lpos >> rb->size_bits)
#define PRB_WRAP_LPOS(rb, lpos, xtra) \
((PRB_WRAPS(rb, lpos) + xtra) << rb->size_bits)
+#define PRB_DATA_SIZE(e) (e->size - sizeof(struct prb_entry))
#define PRB_DATA_ALIGN sizeof(long)
static bool __prb_trylock(struct prb_cpulock *cpu_lock,
@@ -247,3 +250,190 @@ char *prb_reserve(struct prb_handle *h,
return &h->entry->data[0];
}
+
+/*
+ * prb_iter_copy: Copy an iterator.
+ * @dest: The iterator to copy to.
+ * @src: The iterator to copy from.
+ *
+ * Make a deep copy of an iterator. This is particularly useful for making
+ * backup copies of an iterator in case a form of rewinding is needed.
+ *
+ * It is safe to call this function from any context and state. But
+ * note that this function is not atomic. Callers should not make copies
+ * to/from iterators that can be accessed by other tasks/contexts.
+ */
+void prb_iter_copy(struct prb_iterator *dest, struct prb_iterator *src)
+{
+ memcpy(dest, src, sizeof(*dest));
+}
+
+/*
+ * prb_iter_init: Initialize an iterator for a ring buffer.
+ * @iter: The iterator to initialize.
+ * @rb: The ring buffer that @iter should iterate.
+ * @seq: The sequence number of the position preceding the first record.
+ * May be NULL.
+ *
+ * Initialize an iterator to be used with a specified ring buffer. If @seq
+ * is non-NULL, it will be set such that prb_iter_next() will provide a
+ * sequence value of "@seq + 1" if no records were missed.
+ *
+ * It is safe to call this function from any context and state.
+ */
+void prb_iter_init(struct prb_iterator *iter, struct printk_ringbuffer *rb,
+ u64 *seq)
+{
+ memset(iter, 0, sizeof(*iter));
+ iter->rb = rb;
+ iter->lpos = PRB_INIT;
+
+ if (!seq)
+ return;
+
+ for (;;) {
+ struct prb_iterator tmp_iter;
+ int ret;
+
+ prb_iter_copy(&tmp_iter, iter);
+
+ ret = prb_iter_next(&tmp_iter, NULL, 0, seq);
+ if (ret < 0)
+ continue;
+
+ if (ret == 0)
+ *seq = 0;
+ else
+ (*seq)--;
+ break;
+ }
+}
+
+static bool is_valid(struct printk_ringbuffer *rb, unsigned long lpos)
+{
+ unsigned long head, tail;
+
+ tail = atomic_long_read(&rb->tail);
+ head = atomic_long_read(&rb->head);
+ head -= tail;
+ lpos -= tail;
+
+ if (lpos >= head)
+ return false;
+ return true;
+}
+
+/*
+ * prb_iter_data: Retrieve the record data at the current position.
+ * @iter: Iterator tracking the current position.
+ * @buf: A buffer to store the data of the record. May be NULL.
+ * @size: The size of @buf. (Ignored if @buf is NULL.)
+ * @seq: The sequence number of the record. May be NULL.
+ *
+ * If @iter is at a record, provide the data and/or sequence number of that
+ * record (if specified by the caller).
+ *
+ * It is safe to call this function from any context and state.
+ *
+ * Returns >=0 if the current record contains valid data (returns 0 if @buf
+ * is NULL or returns the size of the data block if @buf is non-NULL) or
+ * -EINVAL if @iter is now invalid.
+ */
+int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq)
+{
+ struct printk_ringbuffer *rb = iter->rb;
+ unsigned long lpos = iter->lpos;
+ unsigned int datsize = 0;
+ struct prb_entry *e;
+
+ if (buf || seq) {
+ e = to_entry(rb, lpos);
+ if (!is_valid(rb, lpos))
+ return -EINVAL;
+ /* memory barrier to ensure valid lpos */
+ smp_rmb();
+ if (buf) {
+ datsize = PRB_DATA_SIZE(e);
+ /* memory barrier to ensure load of datsize */
+ smp_rmb();
+ if (!is_valid(rb, lpos))
+ return -EINVAL;
+ if (PRB_INDEX(rb, lpos) + datsize >
+ PRB_SIZE(rb) - PRB_DATA_ALIGN) {
+ return -EINVAL;
+ }
+ if (size > datsize)
+ size = datsize;
+ memcpy(buf, &e->data[0], size);
+ }
+ if (seq)
+ *seq = e->seq;
+ /* memory barrier to ensure loads of entry data */
+ smp_rmb();
+ }
+
+ if (!is_valid(rb, lpos))
+ return -EINVAL;
+
+ return datsize;
+}
+
+/*
+ * prb_iter_next: Advance to the next record.
+ * @iter: Iterator tracking the current position.
+ * @buf: A buffer to store the data of the next record. May be NULL.
+ * @size: The size of @buf. (Ignored if @buf is NULL.)
+ * @seq: The sequence number of the next record. May be NULL.
+ *
+ * If a next record is available, @iter is advanced and (if specified)
+ * the data and/or sequence number of that record are provided.
+ *
+ * It is safe to call this function from any context and state.
+ *
+ * Returns 1 if @iter was advanced, 0 if @iter is at the end of the list, or
+ * -EINVAL if @iter is now invalid.
+ */
+int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq)
+{
+ struct printk_ringbuffer *rb = iter->rb;
+ unsigned long next_lpos;
+ struct prb_entry *e;
+ unsigned int esize;
+
+ if (iter->lpos == PRB_INIT) {
+ next_lpos = atomic_long_read(&rb->tail);
+ } else {
+ if (!is_valid(rb, iter->lpos))
+ return -EINVAL;
+ /* memory barrier to ensure valid lpos */
+ smp_rmb();
+ e = to_entry(rb, iter->lpos);
+ esize = e->size;
+ /* memory barrier to ensure load of size */
+ smp_rmb();
+ if (!is_valid(rb, iter->lpos))
+ return -EINVAL;
+ next_lpos = iter->lpos + esize;
+ }
+ if (next_lpos == atomic_long_read(&rb->head))
+ return 0;
+ if (!is_valid(rb, next_lpos))
+ return -EINVAL;
+ /* memory barrier to ensure valid lpos */
+ smp_rmb();
+
+ iter->lpos = next_lpos;
+ e = to_entry(rb, iter->lpos);
+ esize = e->size;
+ /* memory barrier to ensure load of size */
+ smp_rmb();
+ if (!is_valid(rb, iter->lpos))
+ return -EINVAL;
+ if (esize == -1)
+ iter->lpos = PRB_WRAP_LPOS(rb, iter->lpos, 1);
+
+ if (prb_iter_data(iter, buf, size, seq) < 0)
+ return -EINVAL;
+
+ return 1;
+}

View File

@ -0,0 +1,161 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:44 +0100
Subject: [PATCH 06/25] printk-rb: add blocking reader support
Add a blocking read function for readers. An irq_work function is
used to signal the wait queue so that write notification can
be triggered from any context.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/printk_ringbuffer.h | 20 +++++++++++++
lib/printk_ringbuffer.c | 55 ++++++++++++++++++++++++++++++++++++++
2 files changed, 75 insertions(+)
--- a/include/linux/printk_ringbuffer.h
+++ b/include/linux/printk_ringbuffer.h
@@ -2,8 +2,10 @@
#ifndef _LINUX_PRINTK_RINGBUFFER_H
#define _LINUX_PRINTK_RINGBUFFER_H
+#include <linux/irq_work.h>
#include <linux/atomic.h>
#include <linux/percpu.h>
+#include <linux/wait.h>
struct prb_cpulock {
atomic_t owner;
@@ -22,6 +24,10 @@ struct printk_ringbuffer {
struct prb_cpulock *cpulock;
atomic_t ctx;
+
+ struct wait_queue_head *wq;
+ atomic_long_t wq_counter;
+ struct irq_work *wq_work;
};
struct prb_entry {
@@ -59,6 +65,15 @@ struct prb_iterator {
#define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) \
static char _##name##_buffer[1 << (szbits)] \
__aligned(__alignof__(long)); \
+static DECLARE_WAIT_QUEUE_HEAD(_##name##_wait); \
+static void _##name##_wake_work_func(struct irq_work *irq_work) \
+{ \
+ wake_up_interruptible_all(&_##name##_wait); \
+} \
+static struct irq_work _##name##_wake_work = { \
+ .func = _##name##_wake_work_func, \
+ .flags = IRQ_WORK_LAZY, \
+}; \
static struct printk_ringbuffer name = { \
.buffer = &_##name##_buffer[0], \
.size_bits = szbits, \
@@ -68,6 +83,9 @@ static struct printk_ringbuffer name = {
.reserve = ATOMIC_LONG_INIT(-111 * sizeof(long)), \
.cpulock = cpulockptr, \
.ctx = ATOMIC_INIT(0), \
+ .wq = &_##name##_wait, \
+ .wq_counter = ATOMIC_LONG_INIT(0), \
+ .wq_work = &_##name##_wake_work, \
}
/* writer interface */
@@ -80,6 +98,8 @@ void prb_iter_init(struct prb_iterator *
u64 *seq);
void prb_iter_copy(struct prb_iterator *dest, struct prb_iterator *src);
int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq);
+int prb_iter_wait_next(struct prb_iterator *iter, char *buf, int size,
+ u64 *seq);
int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq);
/* utility functions */
--- a/lib/printk_ringbuffer.c
+++ b/lib/printk_ringbuffer.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/string.h>
#include <linux/errno.h>
@@ -154,6 +155,7 @@ static bool push_tail(struct printk_ring
void prb_commit(struct prb_handle *h)
{
struct printk_ringbuffer *rb = h->rb;
+ bool changed = false;
struct prb_entry *e;
unsigned long head;
unsigned long res;
@@ -175,6 +177,7 @@ void prb_commit(struct prb_handle *h)
}
e->seq = ++rb->seq;
head += e->size;
+ changed = true;
}
atomic_long_set_release(&rb->head, res);
atomic_dec(&rb->ctx);
@@ -185,6 +188,18 @@ void prb_commit(struct prb_handle *h)
}
prb_unlock(rb->cpulock, h->cpu);
+
+ if (changed) {
+ atomic_long_inc(&rb->wq_counter);
+ if (wq_has_sleeper(rb->wq)) {
+#ifdef CONFIG_IRQ_WORK
+ irq_work_queue(rb->wq_work);
+#else
+ if (!in_nmi())
+ wake_up_interruptible_all(rb->wq);
+#endif
+ }
+ }
}
/*
@@ -437,3 +452,43 @@ int prb_iter_next(struct prb_iterator *i
return 1;
}
+
+/*
+ * prb_iter_wait_next: Advance to the next record, blocking if none available.
+ * @iter: Iterator tracking the current position.
+ * @buf: A buffer to store the data of the next record. May be NULL.
+ * @size: The size of @buf. (Ignored if @buf is NULL.)
+ * @seq: The sequence number of the next record. May be NULL.
+ *
+ * If a next record is already available, this function works like
+ * prb_iter_next(). Otherwise block interruptible until a next record is
+ * available.
+ *
+ * When a next record is available, @iter is advanced and (if specified)
+ * the data and/or sequence number of that record are provided.
+ *
+ * This function might sleep.
+ *
+ * Returns 1 if @iter was advanced, -EINVAL if @iter is now invalid, or
+ * -ERESTARTSYS if interrupted by a signal.
+ */
+int prb_iter_wait_next(struct prb_iterator *iter, char *buf, int size, u64 *seq)
+{
+ unsigned long last_seen;
+ int ret;
+
+ for (;;) {
+ last_seen = atomic_long_read(&iter->rb->wq_counter);
+
+ ret = prb_iter_next(iter, buf, size, seq);
+ if (ret != 0)
+ break;
+
+ ret = wait_event_interruptible(*iter->rb->wq,
+ last_seen != atomic_long_read(&iter->rb->wq_counter));
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}

View File

@ -0,0 +1,159 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:45 +0100
Subject: [PATCH 07/25] printk-rb: add functionality required by printk
The printk subsystem needs to be able to query the size of the ring
buffer, seek to specific entries within the ring buffer, and track
if records could not be stored in the ring buffer.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/printk_ringbuffer.h | 5 ++
lib/printk_ringbuffer.c | 95 ++++++++++++++++++++++++++++++++++++++
2 files changed, 100 insertions(+)
--- a/include/linux/printk_ringbuffer.h
+++ b/include/linux/printk_ringbuffer.h
@@ -17,6 +17,7 @@ struct printk_ringbuffer {
unsigned int size_bits;
u64 seq;
+ atomic_long_t lost;
atomic_long_t tail;
atomic_long_t head;
@@ -78,6 +79,7 @@ static struct printk_ringbuffer name = {
.buffer = &_##name##_buffer[0], \
.size_bits = szbits, \
.seq = 0, \
+ .lost = ATOMIC_LONG_INIT(0), \
.tail = ATOMIC_LONG_INIT(-111 * sizeof(long)), \
.head = ATOMIC_LONG_INIT(-111 * sizeof(long)), \
.reserve = ATOMIC_LONG_INIT(-111 * sizeof(long)), \
@@ -100,9 +102,12 @@ void prb_iter_copy(struct prb_iterator *
int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq);
int prb_iter_wait_next(struct prb_iterator *iter, char *buf, int size,
u64 *seq);
+int prb_iter_seek(struct prb_iterator *iter, u64 seq);
int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq);
/* utility functions */
+int prb_buffer_size(struct printk_ringbuffer *rb);
+void prb_inc_lost(struct printk_ringbuffer *rb);
void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store);
void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store);
--- a/lib/printk_ringbuffer.c
+++ b/lib/printk_ringbuffer.c
@@ -175,11 +175,16 @@ void prb_commit(struct prb_handle *h)
head = PRB_WRAP_LPOS(rb, head, 1);
continue;
}
+ while (atomic_long_read(&rb->lost)) {
+ atomic_long_dec(&rb->lost);
+ rb->seq++;
+ }
e->seq = ++rb->seq;
head += e->size;
changed = true;
}
atomic_long_set_release(&rb->head, res);
+
atomic_dec(&rb->ctx);
if (atomic_long_read(&rb->reserve) == res)
@@ -492,3 +497,93 @@ int prb_iter_wait_next(struct prb_iterat
return ret;
}
+
+/*
+ * prb_iter_seek: Seek forward to a specific record.
+ * @iter: Iterator to advance.
+ * @seq: Record number to advance to.
+ *
+ * Advance @iter such that a following call to prb_iter_data() will provide
+ * the contents of the specified record. If a record is specified that does
+ * not yet exist, advance @iter to the end of the record list.
+ *
+ * Note that iterators cannot be rewound. So if a record is requested that
+ * exists but is previous to @iter in position, @iter is considered invalid.
+ *
+ * It is safe to call this function from any context and state.
+ *
+ * Returns 1 on success, 0 if the specified record does not yet exist (@iter is
+ * now at the end of the list), or -EINVAL if @iter is now invalid.
+ */
+int prb_iter_seek(struct prb_iterator *iter, u64 seq)
+{
+ u64 cur_seq;
+ int ret;
+
+ /* first check if the iterator is already at the wanted seq */
+ if (seq == 0) {
+ if (iter->lpos == PRB_INIT)
+ return 1;
+ else
+ return -EINVAL;
+ }
+ if (iter->lpos != PRB_INIT) {
+ if (prb_iter_data(iter, NULL, 0, &cur_seq) >= 0) {
+ if (cur_seq == seq)
+ return 1;
+ if (cur_seq > seq)
+ return -EINVAL;
+ }
+ }
+
+ /* iterate to find the wanted seq */
+ for (;;) {
+ ret = prb_iter_next(iter, NULL, 0, &cur_seq);
+ if (ret <= 0)
+ break;
+
+ if (cur_seq == seq)
+ break;
+
+ if (cur_seq > seq) {
+ ret = -EINVAL;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * prb_buffer_size: Get the size of the ring buffer.
+ * @rb: The ring buffer to get the size of.
+ *
+ * Return the number of bytes used for the ring buffer entry storage area.
+ * Note that this area stores both entry header and entry data. Therefore
+ * this represents an upper bound to the amount of data that can be stored
+ * in the ring buffer.
+ *
+ * It is safe to call this function from any context and state.
+ *
+ * Returns the size in bytes of the entry storage area.
+ */
+int prb_buffer_size(struct printk_ringbuffer *rb)
+{
+ return PRB_SIZE(rb);
+}
+
+/*
+ * prb_inc_lost: Increment the seq counter to signal a lost record.
+ * @rb: The ring buffer to increment the seq of.
+ *
+ * Increment the seq counter so that a seq number is intentionally missing
+ * for the readers. This allows readers to identify that a record is
+ * missing. A writer will typically use this function if prb_reserve()
+ * fails.
+ *
+ * It is safe to call this function from any context and state.
+ */
+void prb_inc_lost(struct printk_ringbuffer *rb)
+{
+ atomic_long_inc(&rb->lost);
+}

View File

@ -0,0 +1,168 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:46 +0100
Subject: [PATCH 08/25] printk: add ring buffer and kthread
The printk ring buffer provides an NMI-safe interface for writing
messages to a ring buffer. Using such a buffer alleviates printk
callers from the current burdens of disabled preemption while calling
the console drivers (and possibly printing out many messages that
another task put into the log buffer).
Create a ring buffer to be used for storing messages to be
printed to the consoles.
Create a dedicated printk kthread to block on the ring buffer
and call the console drivers for the read messages.
NOTE: The printk_delay is relocated to _after_ the message is
printed, where it makes more sense.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 105 insertions(+)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,8 @@
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#include <linux/kthread.h>
+#include <linux/printk_ringbuffer.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
@@ -417,7 +419,12 @@ DEFINE_RAW_SPINLOCK(logbuf_lock);
printk_safe_exit_irqrestore(flags); \
} while (0)
+DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock);
+
#ifdef CONFIG_PRINTK
+/* record buffer */
+DECLARE_STATIC_PRINTKRB(printk_rb, CONFIG_LOG_BUF_SHIFT, &printk_cpulock);
+
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
@@ -780,6 +787,10 @@ static ssize_t msg_print_ext_body(char *
return p - buf;
}
+#define PRINTK_SPRINT_MAX (LOG_LINE_MAX + PREFIX_MAX)
+#define PRINTK_RECORD_MAX (sizeof(struct printk_log) + \
+ CONSOLE_EXT_LOG_MAX + PRINTK_SPRINT_MAX)
+
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
u64 seq;
@@ -1620,6 +1631,34 @@ SYSCALL_DEFINE3(syslog, int, type, char
return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}
+static void format_text(struct printk_log *msg, u64 seq,
+ char *ext_text, size_t *ext_len,
+ char *text, size_t *len, bool time)
+{
+ if (suppress_message_printing(msg->level)) {
+ /*
+ * Skip record that has level above the console
+ * loglevel and update each console's local seq.
+ */
+ *len = 0;
+ *ext_len = 0;
+ return;
+ }
+
+ *len = msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG,
+ time, text, PRINTK_SPRINT_MAX);
+ if (nr_ext_console_drivers) {
+ *ext_len = msg_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX,
+ msg, seq);
+ *ext_len += msg_print_ext_body(ext_text + *ext_len,
+ CONSOLE_EXT_LOG_MAX - *ext_len,
+ log_dict(msg), msg->dict_len,
+ log_text(msg), msg->text_len);
+ } else {
+ *ext_len = 0;
+ }
+}
+
/*
* Special console_lock variants that help to reduce the risk of soft-lockups.
* They allow to pass console_lock to another printk() call using a busy wait.
@@ -2974,6 +3013,72 @@ void wake_up_klogd(void)
preempt_enable();
}
+static int printk_kthread_func(void *data)
+{
+ struct prb_iterator iter;
+ struct printk_log *msg;
+ size_t ext_len;
+ char *ext_text;
+ u64 master_seq;
+ size_t len;
+ char *text;
+ char *buf;
+ int ret;
+
+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
+ buf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!ext_text || !text || !buf)
+ return -1;
+
+ prb_iter_init(&iter, &printk_rb, NULL);
+
+ /* the printk kthread never exits */
+ for (;;) {
+ ret = prb_iter_wait_next(&iter, buf,
+ PRINTK_RECORD_MAX, &master_seq);
+ if (ret == -ERESTARTSYS) {
+ continue;
+ } else if (ret < 0) {
+ /* iterator invalid, start over */
+ prb_iter_init(&iter, &printk_rb, NULL);
+ continue;
+ }
+
+ msg = (struct printk_log *)buf;
+ format_text(msg, master_seq, ext_text, &ext_len, text,
+ &len, printk_time);
+
+ console_lock();
+ if (len > 0 || ext_len > 0) {
+ call_console_drivers(ext_text, ext_len, text, len);
+ boot_delay_msec(msg->level);
+ printk_delay();
+ }
+ console_unlock();
+ }
+
+ kfree(ext_text);
+ kfree(text);
+ kfree(buf);
+
+ return 0;
+}
+
+static int __init init_printk_kthread(void)
+{
+ struct task_struct *thread;
+
+ thread = kthread_run(printk_kthread_func, NULL, "printk");
+ if (IS_ERR(thread)) {
+ pr_err("printk: unable to create printing thread\n");
+ return PTR_ERR(thread);
+ }
+
+ return 0;
+}
+late_initcall(init_printk_kthread);
+
void defer_console_output(void)
{
preempt_disable();

View File

@ -0,0 +1,101 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:47 +0100
Subject: [PATCH 09/25] printk: remove exclusive console hack
In order to support printing the printk log history when new
consoles are registered, a global exclusive_console variable is
temporarily set. This only works because printk runs with
preemption disabled.
When console printing is moved to a fully preemptible dedicated
kthread, this hack no longer works.
Remove exclusive_console usage.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 30 ++++--------------------------
1 file changed, 4 insertions(+), 26 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -269,11 +269,6 @@ static void __up_console_sem(unsigned lo
static int console_locked, console_suspended;
/*
- * If exclusive_console is non-NULL then only this console is to be printed to.
- */
-static struct console *exclusive_console;
-
-/*
* Array of consoles built from command line options (console=)
*/
@@ -443,7 +438,6 @@ static u32 log_next_idx;
/* the next printk record to write to the console */
static u64 console_seq;
static u32 console_idx;
-static u64 exclusive_console_stop_seq;
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
@@ -1815,8 +1809,6 @@ static void call_console_drivers(const c
return;
for_each_console(con) {
- if (exclusive_console && con != exclusive_console)
- continue;
if (!(con->flags & CON_ENABLED))
continue;
if (!con->write)
@@ -2109,7 +2101,6 @@ static u64 syslog_seq;
static u32 syslog_idx;
static u64 console_seq;
static u32 console_idx;
-static u64 exclusive_console_stop_seq;
static u64 log_first_seq;
static u32 log_first_idx;
static u64 log_next_seq;
@@ -2478,12 +2469,6 @@ void console_unlock(void)
goto skip;
}
- /* Output to all consoles once old messages replayed. */
- if (unlikely(exclusive_console &&
- console_seq >= exclusive_console_stop_seq)) {
- exclusive_console = NULL;
- }
-
len += msg_print_text(msg,
console_msg_format & MSG_FORMAT_SYSLOG,
printk_time, text + len, sizeof(text) - len);
@@ -2809,17 +2794,6 @@ void register_console(struct console *ne
* for us.
*/
logbuf_lock_irqsave(flags);
- /*
- * We're about to replay the log buffer. Only do this to the
- * just-registered console to avoid excessive message spam to
- * the already-registered consoles.
- *
- * Set exclusive_console with disabled interrupts to reduce
- * race window with eventual console_flush_on_panic() that
- * ignores console_lock.
- */
- exclusive_console = newcon;
- exclusive_console_stop_seq = console_seq;
console_seq = syslog_seq;
console_idx = syslog_idx;
logbuf_unlock_irqrestore(flags);
@@ -2833,6 +2807,10 @@ void register_console(struct console *ne
* boot consoles, real consoles, etc - this is to ensure that end
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
+ *
+ * This message is also important because it will trigger the
+ * printk kthread to begin dumping the log buffer to the newly
+ * registered console.
*/
pr_info("%sconsole [%s%d] enabled\n",
(newcon->flags & CON_BOOT) ? "boot" : "" ,

View File

@ -0,0 +1,437 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:48 +0100
Subject: [PATCH 10/25] printk: redirect emit/store to new ringbuffer
vprintk_emit and vprintk_store are the main functions that all printk
variants eventually go through. Change these to store the message in
the new printk ring buffer that the printk kthread is reading.
Remove functions no longer in use because of the changes to
vprintk_emit and vprintk_store.
In order to handle interrupts and NMIs, a second per-cpu ring buffer
(sprint_rb) is added. This ring buffer is used for NMI-safe memory
allocation in order to format the printk messages.
NOTE: LOG_CONT is ignored for now and handled as individual messages.
LOG_CONT functions are masked behind "#if 0" blocks until their
functionality can be restored.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 325 +++++++------------------------------------------
1 file changed, 51 insertions(+), 274 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -517,90 +517,6 @@ static u32 log_next(u32 idx)
return idx + msg->len;
}
-/*
- * Check whether there is enough free space for the given message.
- *
- * The same values of first_idx and next_idx mean that the buffer
- * is either empty or full.
- *
- * If the buffer is empty, we must respect the position of the indexes.
- * They cannot be reset to the beginning of the buffer.
- */
-static int logbuf_has_space(u32 msg_size, bool empty)
-{
- u32 free;
-
- if (log_next_idx > log_first_idx || empty)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
-
- /*
- * We need space also for an empty header that signalizes wrapping
- * of the buffer.
- */
- return free >= msg_size + sizeof(struct printk_log);
-}
-
-static int log_make_free_space(u32 msg_size)
-{
- while (log_first_seq < log_next_seq &&
- !logbuf_has_space(msg_size, false)) {
- /* drop old messages until we have enough contiguous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
- }
-
- if (clear_seq < log_first_seq) {
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
- /* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
- return 0;
-
- return -ENOMEM;
-}
-
-/* compute the message size including the padding bytes */
-static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
-{
- u32 size;
-
- size = sizeof(struct printk_log) + text_len + dict_len;
- *pad_len = (-size) & (LOG_ALIGN - 1);
- size += *pad_len;
-
- return size;
-}
-
-/*
- * Define how much of the log buffer we could take at maximum. The value
- * must be greater than two. Note that only half of the buffer is available
- * when the index points to the middle.
- */
-#define MAX_LOG_TAKE_PART 4
-static const char trunc_msg[] = "<truncated>";
-
-static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
- u16 *dict_len, u32 *pad_len)
-{
- /*
- * The message should not take the whole buffer. Otherwise, it might
- * get removed too soon.
- */
- u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
- if (*text_len > max_text_len)
- *text_len = max_text_len;
- /* enable the warning message */
- *trunc_msg_len = strlen(trunc_msg);
- /* disable the "dict" completely */
- *dict_len = 0;
- /* compute the size again, count also the warning message */
- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
-}
-
/* insert record into the buffer, discard old ones, update heads */
static int log_store(u32 caller_id, int facility, int level,
enum log_flags flags, u64 ts_nsec,
@@ -608,57 +524,39 @@ static int log_store(u32 caller_id, int
const char *text, u16 text_len)
{
struct printk_log *msg;
- u32 size, pad_len;
- u16 trunc_msg_len = 0;
-
- /* number of '\0' padding bytes to next message */
- size = msg_used_size(text_len, dict_len, &pad_len);
+ struct prb_handle h;
+ char *rbuf;
+ u32 size;
- if (log_make_free_space(size)) {
- /* truncate the message if it is too long for empty buffer */
- size = truncate_msg(&text_len, &trunc_msg_len,
- &dict_len, &pad_len);
- /* survive when the log buffer is too small for trunc_msg */
- if (log_make_free_space(size))
- return 0;
- }
+ size = sizeof(*msg) + text_len + dict_len;
- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
+ rbuf = prb_reserve(&h, &printk_rb, size);
+ if (!rbuf) {
/*
- * This message + an additional empty header does not fit
- * at the end of the buffer. Add an empty header with len == 0
- * to signify a wrap around.
+ * An emergency message would have been printed, but
+ * it cannot be stored in the log.
*/
- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
- log_next_idx = 0;
+ prb_inc_lost(&printk_rb);
+ return 0;
}
/* fill message */
- msg = (struct printk_log *)(log_buf + log_next_idx);
+ msg = (struct printk_log *)rbuf;
memcpy(log_text(msg), text, text_len);
msg->text_len = text_len;
- if (trunc_msg_len) {
- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
- msg->text_len += trunc_msg_len;
- }
memcpy(log_dict(msg), dict, dict_len);
msg->dict_len = dict_len;
msg->facility = facility;
msg->level = level & 7;
msg->flags = flags & 0x1f;
- if (ts_nsec > 0)
- msg->ts_nsec = ts_nsec;
- else
- msg->ts_nsec = local_clock();
+ msg->ts_nsec = ts_nsec;
#ifdef CONFIG_PRINTK_CALLER
msg->caller_id = caller_id;
#endif
- memset(log_dict(msg) + dict_len, 0, pad_len);
msg->len = size;
/* insert message */
- log_next_idx += msg->len;
- log_next_seq++;
+ prb_commit(&h);
return msg->text_len;
}
@@ -1729,70 +1627,6 @@ static int console_lock_spinning_disable
return 1;
}
-/**
- * console_trylock_spinning - try to get console_lock by busy waiting
- *
- * This allows to busy wait for the console_lock when the current
- * owner is running in specially marked sections. It means that
- * the current owner is running and cannot reschedule until it
- * is ready to lose the lock.
- *
- * Return: 1 if we got the lock, 0 othrewise
- */
-static int console_trylock_spinning(void)
-{
- struct task_struct *owner = NULL;
- bool waiter;
- bool spin = false;
- unsigned long flags;
-
- if (console_trylock())
- return 1;
-
- printk_safe_enter_irqsave(flags);
-
- raw_spin_lock(&console_owner_lock);
- owner = READ_ONCE(console_owner);
- waiter = READ_ONCE(console_waiter);
- if (!waiter && owner && owner != current) {
- WRITE_ONCE(console_waiter, true);
- spin = true;
- }
- raw_spin_unlock(&console_owner_lock);
-
- /*
- * If there is an active printk() writing to the
- * consoles, instead of having it write our data too,
- * see if we can offload that load from the active
- * printer, and do some printing ourselves.
- * Go into a spin only if there isn't already a waiter
- * spinning, and there is an active printer, and
- * that active printer isn't us (recursive printk?).
- */
- if (!spin) {
- printk_safe_exit_irqrestore(flags);
- return 0;
- }
-
- /* We spin waiting for the owner to release us */
- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
- /* Owner will clear console_waiter on hand off */
- while (READ_ONCE(console_waiter))
- cpu_relax();
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
-
- printk_safe_exit_irqrestore(flags);
- /*
- * The owner passed the console lock to us.
- * Since we did not spin on console lock, annotate
- * this as a trylock. Otherwise lockdep will
- * complain.
- */
- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);
-
- return 1;
-}
-
/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
@@ -1813,7 +1647,7 @@ static void call_console_drivers(const c
continue;
if (!con->write)
continue;
- if (!cpu_online(smp_processor_id()) &&
+ if (!cpu_online(raw_smp_processor_id()) &&
!(con->flags & CON_ANYTIME))
continue;
if (con->flags & CON_EXTENDED)
@@ -1843,6 +1677,8 @@ static inline u32 printk_caller_id(void)
0x80000000 + raw_smp_processor_id();
}
+/* FIXME: no support for LOG_CONT */
+#if 0
/*
* Continuation lines are buffered, and not committed to the record buffer
* until the line is complete, or a race forces it. The line fragments
@@ -1898,56 +1734,45 @@ static bool cont_add(u32 caller_id, int
return true;
}
+#endif /* 0 */
-static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
-{
- const u32 caller_id = printk_caller_id();
-
- /*
- * If an earlier line was buffered, and we're a continuation
- * write from the same context, try to add it to the buffer.
- */
- if (cont.len) {
- if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
- }
- /* Otherwise, make sure it's flushed */
- cont_flush();
- }
-
- /* Skip empty continuation lines that couldn't be added - they just flush */
- if (!text_len && (lflags & LOG_CONT))
- return 0;
-
- /* If it doesn't end in a newline, try to buffer the current line */
- if (!(lflags & LOG_NEWLINE)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
- }
-
- /* Store it in the record log */
- return log_store(caller_id, facility, level, lflags, 0,
- dict, dictlen, text, text_len);
-}
-
-/* Must be called under logbuf_lock. */
int vprintk_store(int facility, int level,
const char *dict, size_t dictlen,
const char *fmt, va_list args)
{
- static char textbuf[LOG_LINE_MAX];
- char *text = textbuf;
- size_t text_len;
+ return vprintk_emit(facility, level, dict, dictlen, fmt, args);
+}
+
+/* ring buffer used as memory allocator for temporary sprint buffers */
+DECLARE_STATIC_PRINTKRB(sprint_rb,
+ ilog2(PRINTK_RECORD_MAX + sizeof(struct prb_entry) +
+ sizeof(long)) + 2, &printk_cpulock);
+
+asmlinkage int vprintk_emit(int facility, int level,
+ const char *dict, size_t dictlen,
+ const char *fmt, va_list args)
+{
+ const u32 caller_id = printk_caller_id();
enum log_flags lflags = 0;
+ int printed_len = 0;
+ struct prb_handle h;
+ size_t text_len;
+ u64 ts_nsec;
+ char *text;
+ char *rbuf;
- /*
- * The printf needs to come first; we need the syslog
- * prefix which might be passed-in as a parameter.
- */
- text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
+ ts_nsec = local_clock();
+
+ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_SPRINT_MAX);
+ if (!rbuf) {
+ prb_inc_lost(&printk_rb);
+ return printed_len;
+ }
+
+ text = rbuf;
+ text_len = vscnprintf(text, PRINTK_SPRINT_MAX, fmt, args);
- /* mark and strip a trailing newline */
+ /* strip and flag a trailing newline */
if (text_len && text[text_len-1] == '\n') {
text_len--;
lflags |= LOG_NEWLINE;
@@ -1978,58 +1803,10 @@ int vprintk_store(int facility, int leve
if (dict)
lflags |= LOG_NEWLINE;
- return log_output(facility, level, lflags,
- dict, dictlen, text, text_len);
-}
-
-asmlinkage int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, va_list args)
-{
- int printed_len;
- bool in_sched = false, pending_output;
- unsigned long flags;
- u64 curr_log_seq;
-
- /* Suppress unimportant messages after panic happens */
- if (unlikely(suppress_printk))
- return 0;
-
- if (level == LOGLEVEL_SCHED) {
- level = LOGLEVEL_DEFAULT;
- in_sched = true;
- }
-
- boot_delay_msec(level);
- printk_delay();
-
- /* This stops the holder of console_sem just where we want him */
- logbuf_lock_irqsave(flags);
- curr_log_seq = log_next_seq;
- printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args);
- pending_output = (curr_log_seq != log_next_seq);
- logbuf_unlock_irqrestore(flags);
-
- /* If called from the scheduler, we can not call up(). */
- if (!in_sched && pending_output) {
- /*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to
- * console
- */
- preempt_disable();
- /*
- * Try to acquire and then immediately release the console
- * semaphore. The release will print out buffers and wake up
- * /dev/kmsg and syslog() users.
- */
- if (console_trylock_spinning())
- console_unlock();
- preempt_enable();
- }
+ printed_len = log_store(caller_id, facility, level, lflags, ts_nsec,
+ dict, dictlen, text, text_len);
- if (pending_output)
- wake_up_klogd();
+ prb_commit(&h);
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2494,7 +2271,7 @@ void console_unlock(void)
console_lock_spinning_enable();
stop_critical_timings(); /* don't trace print latency */
- call_console_drivers(ext_text, ext_len, text, len);
+ //call_console_drivers(ext_text, ext_len, text, len);
start_critical_timings();
if (console_lock_spinning_disable_and_check()) {
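The core of the change above is a reserve/fill/commit pattern that replaces the old logbuf index bookkeeping. The new log_store() condenses to roughly the following sketch (using only the prb_* calls introduced by the patch; this is a restatement, not additional kernel code):

        struct prb_handle h;
        struct printk_log *msg;
        char *rbuf;

        /* reserve contiguous space; old records are discarded as needed */
        rbuf = prb_reserve(&h, &printk_rb, sizeof(*msg) + text_len + dict_len);
        if (!rbuf) {
                /* nothing could be reserved; the record is only counted as lost */
                prb_inc_lost(&printk_rb);
                return 0;
        }

        /* fill the reserved region: header, text, dictionary */
        msg = (struct printk_log *)rbuf;
        memcpy(log_text(msg), text, text_len);
        msg->text_len = text_len;

        /* make the record visible to the reading printk kthread */
        prb_commit(&h);
        return msg->text_len;

The same reserve/commit pairing appears in vprintk_emit() against the per-CPU sprint_rb ring buffer, where it acts as an NMI-safe allocator for the temporary format buffer rather than as the message store.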

View File

@ -0,0 +1,699 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:49 +0100
Subject: [PATCH 11/25] printk_safe: remove printk safe code
vprintk variants are now NMI-safe so there is no longer a need for
the "safe" calls.
NOTE: This also removes printk flushing functionality.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/powerpc/kernel/traps.c | 1
arch/powerpc/kernel/watchdog.c | 5
include/linux/hardirq.h | 2
include/linux/printk.h | 27 --
init/main.c | 1
kernel/kexec_core.c | 1
kernel/panic.c | 3
kernel/printk/Makefile | 1
kernel/printk/internal.h | 30 --
kernel/printk/printk.c | 13 -
kernel/printk/printk_safe.c | 415 -----------------------------------------
kernel/trace/trace.c | 2
lib/nmi_backtrace.c | 6
13 files changed, 7 insertions(+), 500 deletions(-)
delete mode 100644 kernel/printk/printk_safe.c
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -171,7 +171,6 @@ extern void panic_flush_kmsg_start(void)
extern void panic_flush_kmsg_end(void)
{
- printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
bust_spinlocks(0);
debug_locks_off();
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -181,11 +181,6 @@ static void watchdog_smp_panic(int cpu,
wd_smp_unlock(&flags);
- printk_safe_flush();
- /*
- * printk_safe_flush() seems to require another print
- * before anything actually goes out to console.
- */
if (sysctl_hardlockup_all_cpu_backtrace)
trigger_allbutself_cpu_backtrace();
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -68,7 +68,6 @@ extern void irq_exit(void);
#define nmi_enter() \
do { \
arch_nmi_enter(); \
- printk_nmi_enter(); \
lockdep_off(); \
ftrace_nmi_enter(); \
BUG_ON(in_nmi()); \
@@ -85,7 +84,6 @@ extern void irq_exit(void);
preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \
ftrace_nmi_exit(); \
lockdep_on(); \
- printk_nmi_exit(); \
arch_nmi_exit(); \
} while (0)
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -146,18 +146,6 @@ static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif
-#ifdef CONFIG_PRINTK_NMI
-extern void printk_nmi_enter(void);
-extern void printk_nmi_exit(void);
-extern void printk_nmi_direct_enter(void);
-extern void printk_nmi_direct_exit(void);
-#else
-static inline void printk_nmi_enter(void) { }
-static inline void printk_nmi_exit(void) { }
-static inline void printk_nmi_direct_enter(void) { }
-static inline void printk_nmi_direct_exit(void) { }
-#endif /* PRINTK_NMI */
-
#ifdef CONFIG_PRINTK
asmlinkage __printf(5, 0)
int vprintk_emit(int facility, int level,
@@ -202,9 +190,6 @@ void __init setup_log_buf(int early);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack(void) __cold;
-extern void printk_safe_init(void);
-extern void printk_safe_flush(void);
-extern void printk_safe_flush_on_panic(void);
#else
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
@@ -268,18 +253,6 @@ static inline void show_regs_print_info(
static inline void dump_stack(void)
{
}
-
-static inline void printk_safe_init(void)
-{
-}
-
-static inline void printk_safe_flush(void)
-{
-}
-
-static inline void printk_safe_flush_on_panic(void)
-{
-}
#endif
extern int kptr_restrict;
--- a/init/main.c
+++ b/init/main.c
@@ -694,7 +694,6 @@ asmlinkage __visible void __init start_k
boot_init_stack_canary();
time_init();
- printk_safe_init();
perf_event_init();
profile_init();
call_function_init();
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -972,7 +972,6 @@ void crash_kexec(struct pt_regs *regs)
old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
if (old_cpu == PANIC_CPU_INVALID) {
/* This is the 1st CPU which comes here, so go ahead. */
- printk_safe_flush_on_panic();
__crash_kexec(regs);
/*
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -237,7 +237,6 @@ void panic(const char *fmt, ...)
* Bypass the panic_cpu check and call __crash_kexec directly.
*/
if (!_crash_kexec_post_notifiers) {
- printk_safe_flush_on_panic();
__crash_kexec(NULL);
/*
@@ -261,8 +260,6 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
- /* Call flush even twice. It tries harder with a single online CPU */
- printk_safe_flush_on_panic();
kmsg_dump(KMSG_DUMP_PANIC);
/*
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,4 +1,3 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
-obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -20,32 +20,6 @@ int vprintk_store(int facility, int leve
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
-void __printk_safe_enter(void);
-void __printk_safe_exit(void);
-
-#define printk_safe_enter_irqsave(flags) \
- do { \
- local_irq_save(flags); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irqrestore(flags) \
- do { \
- __printk_safe_exit(); \
- local_irq_restore(flags); \
- } while (0)
-
-#define printk_safe_enter_irq() \
- do { \
- local_irq_disable(); \
- __printk_safe_enter(); \
- } while (0)
-
-#define printk_safe_exit_irq() \
- do { \
- __printk_safe_exit(); \
- local_irq_enable(); \
- } while (0)
void defer_console_output(void);
@@ -58,10 +32,10 @@ void defer_console_output(void);
* semaphore and some of console functions (console_unlock()/etc.), so
* printk-safe must preserve the existing local IRQ guarantees.
*/
+#endif /* CONFIG_PRINTK */
+
#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
#define printk_safe_enter_irq() local_irq_disable()
#define printk_safe_exit_irq() local_irq_enable()
-
-#endif /* CONFIG_PRINTK */
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1736,13 +1736,6 @@ static bool cont_add(u32 caller_id, int
}
#endif /* 0 */
-int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, va_list args)
-{
- return vprintk_emit(facility, level, dict, dictlen, fmt, args);
-}
-
/* ring buffer used as memory allocator for temporary sprint buffers */
DECLARE_STATIC_PRINTKRB(sprint_rb,
ilog2(PRINTK_RECORD_MAX + sizeof(struct prb_entry) +
@@ -1811,6 +1804,11 @@ asmlinkage int vprintk_emit(int facility
}
EXPORT_SYMBOL(vprintk_emit);
+__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+}
+
asmlinkage int vprintk(const char *fmt, va_list args)
{
return vprintk_func(fmt, args);
@@ -3211,5 +3209,4 @@ void kmsg_dump_rewind(struct kmsg_dumper
logbuf_unlock_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
-
#endif
--- a/kernel/printk/printk_safe.c
+++ /dev/null
@@ -1,415 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * printk_safe.c - Safe printk for printk-deadlock-prone contexts
- */
-
-#include <linux/preempt.h>
-#include <linux/spinlock.h>
-#include <linux/debug_locks.h>
-#include <linux/smp.h>
-#include <linux/cpumask.h>
-#include <linux/irq_work.h>
-#include <linux/printk.h>
-
-#include "internal.h"
-
-/*
- * printk() could not take logbuf_lock in NMI context. Instead,
- * it uses an alternative implementation that temporary stores
- * the strings into a per-CPU buffer. The content of the buffer
- * is later flushed into the main ring buffer via IRQ work.
- *
- * The alternative implementation is chosen transparently
- * by examinig current printk() context mask stored in @printk_context
- * per-CPU variable.
- *
- * The implementation allows to flush the strings also from another CPU.
- * There are situations when we want to make sure that all buffers
- * were handled or when IRQs are blocked.
- */
-static int printk_safe_irq_ready __read_mostly;
-
-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \
- sizeof(atomic_t) - \
- sizeof(atomic_t) - \
- sizeof(struct irq_work))
-
-struct printk_safe_seq_buf {
- atomic_t len; /* length of written data */
- atomic_t message_lost;
- struct irq_work work; /* IRQ work that flushes the buffer */
- unsigned char buffer[SAFE_LOG_BUF_LEN];
-};
-
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq);
-static DEFINE_PER_CPU(int, printk_context);
-
-#ifdef CONFIG_PRINTK_NMI
-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
-#endif
-
-/* Get flushed in a more safe context. */
-static void queue_flush_work(struct printk_safe_seq_buf *s)
-{
- if (printk_safe_irq_ready)
- irq_work_queue(&s->work);
-}
-
-/*
- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe
- * have dedicated buffers, because otherwise printk-safe preempted by
- * NMI-printk would have overwritten the NMI messages.
- *
- * The messages are flushed from irq work (or from panic()), possibly,
- * from other CPU, concurrently with printk_safe_log_store(). Should this
- * happen, printk_safe_log_store() will notice the buffer->len mismatch
- * and repeat the write.
- */
-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
-{
- int add;
- size_t len;
- va_list ap;
-
-again:
- len = atomic_read(&s->len);
-
- /* The trailing '\0' is not counted into len. */
- if (len >= sizeof(s->buffer) - 1) {
- atomic_inc(&s->message_lost);
- queue_flush_work(s);
- return 0;
- }
-
- /*
- * Make sure that all old data have been read before the buffer
- * was reset. This is not needed when we just append data.
- */
- if (!len)
- smp_rmb();
-
- va_copy(ap, args);
- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap);
- va_end(ap);
- if (!add)
- return 0;
-
- /*
- * Do it once again if the buffer has been flushed in the meantime.
- * Note that atomic_cmpxchg() is an implicit memory barrier that
- * makes sure that the data were written before updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, len + add) != len)
- goto again;
-
- queue_flush_work(s);
- return add;
-}
-
-static inline void printk_safe_flush_line(const char *text, int len)
-{
- /*
- * Avoid any console drivers calls from here, because we may be
- * in NMI or printk_safe context (when in panic). The messages
- * must go only into the ring buffer at this stage. Consoles will
- * get explicitly called later when a crashdump is not generated.
- */
- printk_deferred("%.*s", len, text);
-}
-
-/* printk part of the temporary buffer line by line */
-static int printk_safe_flush_buffer(const char *start, size_t len)
-{
- const char *c, *end;
- bool header;
-
- c = start;
- end = start + len;
- header = true;
-
- /* Print line by line. */
- while (c < end) {
- if (*c == '\n') {
- printk_safe_flush_line(start, c - start + 1);
- start = ++c;
- header = true;
- continue;
- }
-
- /* Handle continuous lines or missing new line. */
- if ((c + 1 < end) && printk_get_level(c)) {
- if (header) {
- c = printk_skip_level(c);
- continue;
- }
-
- printk_safe_flush_line(start, c - start);
- start = c++;
- header = true;
- continue;
- }
-
- header = false;
- c++;
- }
-
- /* Check if there was a partial line. Ignore pure header. */
- if (start < end && !header) {
- static const char newline[] = KERN_CONT "\n";
-
- printk_safe_flush_line(start, end - start);
- printk_safe_flush_line(newline, strlen(newline));
- }
-
- return len;
-}
-
-static void report_message_lost(struct printk_safe_seq_buf *s)
-{
- int lost = atomic_xchg(&s->message_lost, 0);
-
- if (lost)
- printk_deferred("Lost %d message(s)!\n", lost);
-}
-
-/*
- * Flush data from the associated per-CPU buffer. The function
- * can be called either via IRQ work or independently.
- */
-static void __printk_safe_flush(struct irq_work *work)
-{
- static raw_spinlock_t read_lock =
- __RAW_SPIN_LOCK_INITIALIZER(read_lock);
- struct printk_safe_seq_buf *s =
- container_of(work, struct printk_safe_seq_buf, work);
- unsigned long flags;
- size_t len;
- int i;
-
- /*
- * The lock has two functions. First, one reader has to flush all
- * available message to make the lockless synchronization with
- * writers easier. Second, we do not want to mix messages from
- * different CPUs. This is especially important when printing
- * a backtrace.
- */
- raw_spin_lock_irqsave(&read_lock, flags);
-
- i = 0;
-more:
- len = atomic_read(&s->len);
-
- /*
- * This is just a paranoid check that nobody has manipulated
- * the buffer an unexpected way. If we printed something then
- * @len must only increase. Also it should never overflow the
- * buffer size.
- */
- if ((i && i >= len) || len > sizeof(s->buffer)) {
- const char *msg = "printk_safe_flush: internal error\n";
-
- printk_safe_flush_line(msg, strlen(msg));
- len = 0;
- }
-
- if (!len)
- goto out; /* Someone else has already flushed the buffer. */
-
- /* Make sure that data has been written up to the @len */
- smp_rmb();
- i += printk_safe_flush_buffer(s->buffer + i, len - i);
-
- /*
- * Check that nothing has got added in the meantime and truncate
- * the buffer. Note that atomic_cmpxchg() is an implicit memory
- * barrier that makes sure that the data were copied before
- * updating s->len.
- */
- if (atomic_cmpxchg(&s->len, len, 0) != len)
- goto more;
-
-out:
- report_message_lost(s);
- raw_spin_unlock_irqrestore(&read_lock, flags);
-}
-
-/**
- * printk_safe_flush - flush all per-cpu nmi buffers.
- *
- * The buffers are flushed automatically via IRQ work. This function
- * is useful only when someone wants to be sure that all buffers have
- * been flushed at some point.
- */
-void printk_safe_flush(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_PRINTK_NMI
- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work);
-#endif
- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work);
- }
-}
-
-/**
- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system
- * goes down.
- *
- * Similar to printk_safe_flush() but it can be called even in NMI context when
- * the system goes down. It does the best effort to get NMI messages into
- * the main ring buffer.
- *
- * Note that it could try harder when there is only one CPU online.
- */
-void printk_safe_flush_on_panic(void)
-{
- /*
- * Make sure that we could access the main ring buffer.
- * Do not risk a double release when more CPUs are up.
- */
- if (raw_spin_is_locked(&logbuf_lock)) {
- if (num_online_cpus() > 1)
- return;
-
- debug_locks_off();
- raw_spin_lock_init(&logbuf_lock);
- }
-
- printk_safe_flush();
-}
-
-#ifdef CONFIG_PRINTK_NMI
-/*
- * Safe printk() for NMI context. It uses a per-CPU buffer to
- * store the message. NMIs are not nested, so there is always only
- * one writer running. But the buffer might get flushed from another
- * CPU, so we need to be careful.
- */
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-void notrace printk_nmi_enter(void)
-{
- this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
-}
-
-void notrace printk_nmi_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
-}
-
-/*
- * Marks a code that might produce many messages in NMI context
- * and the risk of losing them is more critical than eventual
- * reordering.
- *
- * It has effect only when called in NMI context. Then printk()
- * will try to store the messages into the main logbuf directly
- * and use the per-CPU buffers only as a fallback when the lock
- * is not available.
- */
-void printk_nmi_direct_enter(void)
-{
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-void printk_nmi_direct_exit(void)
-{
- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK);
-}
-
-#else
-
-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
-{
- return 0;
-}
-
-#endif /* CONFIG_PRINTK_NMI */
-
-/*
- * Lock-less printk(), to avoid deadlocks should the printk() recurse
- * into itself. It uses a per-CPU buffer to store the message, just like
- * NMI.
- */
-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
-{
- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
-
- return printk_safe_log_store(s, fmt, args);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_enter(void)
-{
- this_cpu_inc(printk_context);
-}
-
-/* Can be preempted by NMI. */
-void __printk_safe_exit(void)
-{
- this_cpu_dec(printk_context);
-}
-
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
-{
- /*
- * Try to use the main logbuf even in NMI. But avoid calling console
- * drivers that might have their own locks.
- */
- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) &&
- raw_spin_trylock(&logbuf_lock)) {
- int len;
-
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
- raw_spin_unlock(&logbuf_lock);
- defer_console_output();
- return len;
- }
-
- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
- return vprintk_nmi(fmt, args);
-
- /* Use extra buffer to prevent a recursion deadlock in safe mode. */
- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
- return vprintk_safe(fmt, args);
-
- /* No obstacles. */
- return vprintk_default(fmt, args);
-}
-
-void __init printk_safe_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct printk_safe_seq_buf *s;
-
- s = &per_cpu(safe_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-
-#ifdef CONFIG_PRINTK_NMI
- s = &per_cpu(nmi_print_seq, cpu);
- init_irq_work(&s->work, __printk_safe_flush);
-#endif
- }
-
- /*
- * In the highly unlikely event that a NMI were to trigger at
- * this moment. Make sure IRQ work is set up before this
- * variable is set.
- */
- barrier();
- printk_safe_irq_ready = 1;
-
- /* Flush pending messages that did not have scheduled IRQ works. */
- printk_safe_flush();
-}
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8929,7 +8929,6 @@ void ftrace_dump(enum ftrace_dump_mode o
tracing_off();
local_irq_save(flags);
- printk_nmi_direct_enter();
/* Simulate the iterator */
trace_init_global_iter(&iter);
@@ -9006,7 +9005,6 @@ void ftrace_dump(enum ftrace_dump_mode o
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
atomic_dec(&dump_running);
- printk_nmi_direct_exit();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(ftrace_dump);
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const
touch_softlockup_watchdog();
}
- /*
- * Force flush any remote buffers that might be stuck in IRQ context
- * and therefore could not run their irq_work.
- */
- printk_safe_flush();
-
clear_bit_unlock(0, &backtrace_flag);
put_cpu();
}

View File

@ -0,0 +1,329 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:50 +0100
Subject: [PATCH 12/25] printk: minimize console locking implementation
Since printing of the printk buffer is now handled by the printk
kthread, minimize the console locking functions to just handle
locking of the console.
NOTE: With this, console_flush_on_panic() will no longer flush.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 255 -------------------------------------------------
1 file changed, 1 insertion(+), 254 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -227,19 +227,7 @@ static int nr_ext_console_drivers;
static int __down_trylock_console_sem(unsigned long ip)
{
- int lock_failed;
- unsigned long flags;
-
- /*
- * Here and in __up_console_sem() we need to be in safe mode,
- * because spindump/WARN/etc from under console ->lock will
- * deadlock in printk()->down_trylock_console_sem() otherwise.
- */
- printk_safe_enter_irqsave(flags);
- lock_failed = down_trylock(&console_sem);
- printk_safe_exit_irqrestore(flags);
-
- if (lock_failed)
+ if (down_trylock(&console_sem))
return 1;
mutex_acquire(&console_lock_dep_map, 0, 1, ip);
return 0;
@@ -248,13 +236,9 @@ static int __down_trylock_console_sem(un
static void __up_console_sem(unsigned long ip)
{
- unsigned long flags;
-
mutex_release(&console_lock_dep_map, 1, ip);
- printk_safe_enter_irqsave(flags);
up(&console_sem);
- printk_safe_exit_irqrestore(flags);
}
#define up_console_sem() __up_console_sem(_RET_IP_)
@@ -1552,82 +1536,6 @@ static void format_text(struct printk_lo
}
/*
- * Special console_lock variants that help to reduce the risk of soft-lockups.
- * They allow to pass console_lock to another printk() call using a busy wait.
- */
-
-#ifdef CONFIG_LOCKDEP
-static struct lockdep_map console_owner_dep_map = {
- .name = "console_owner"
-};
-#endif
-
-static DEFINE_RAW_SPINLOCK(console_owner_lock);
-static struct task_struct *console_owner;
-static bool console_waiter;
-
-/**
- * console_lock_spinning_enable - mark beginning of code where another
- * thread might safely busy wait
- *
- * This basically converts console_lock into a spinlock. This marks
- * the section where the console_lock owner can not sleep, because
- * there may be a waiter spinning (like a spinlock). Also it must be
- * ready to hand over the lock at the end of the section.
- */
-static void console_lock_spinning_enable(void)
-{
- raw_spin_lock(&console_owner_lock);
- console_owner = current;
- raw_spin_unlock(&console_owner_lock);
-
- /* The waiter may spin on us after setting console_owner */
- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
-}
-
-/**
- * console_lock_spinning_disable_and_check - mark end of code where another
- * thread was able to busy wait and check if there is a waiter
- *
- * This is called at the end of the section where spinning is allowed.
- * It has two functions. First, it is a signal that it is no longer
- * safe to start busy waiting for the lock. Second, it checks if
- * there is a busy waiter and passes the lock rights to her.
- *
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
- *
- * Return: 1 if the lock rights were passed, 0 otherwise.
- */
-static int console_lock_spinning_disable_and_check(void)
-{
- int waiter;
-
- raw_spin_lock(&console_owner_lock);
- waiter = READ_ONCE(console_waiter);
- console_owner = NULL;
- raw_spin_unlock(&console_owner_lock);
-
- if (!waiter) {
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
- return 0;
- }
-
- /* The waiter is now free to continue */
- WRITE_ONCE(console_waiter, false);
-
- spin_release(&console_owner_dep_map, 1, _THIS_IP_);
-
- /*
- * Hand off console_lock to waiter. The waiter will perform
- * the up(). After this, the waiter is the console_lock owner.
- */
- mutex_release(&console_lock_dep_map, 1, _THIS_IP_);
- return 1;
-}
-
-/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
@@ -1889,8 +1797,6 @@ static ssize_t msg_print_ext_header(char
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *dict, size_t dict_len,
char *text, size_t text_len) { return 0; }
-static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg, bool syslog,
@@ -2125,35 +2031,6 @@ int is_console_locked(void)
{
return console_locked;
}
-EXPORT_SYMBOL(is_console_locked);
-
-/*
- * Check if we have any console that is capable of printing while cpu is
- * booting or shutting down. Requires console_sem.
- */
-static int have_callable_console(void)
-{
- struct console *con;
-
- for_each_console(con)
- if ((con->flags & CON_ENABLED) &&
- (con->flags & CON_ANYTIME))
- return 1;
-
- return 0;
-}
-
-/*
- * Can we actually use the console at this time on this cpu?
- *
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
- */
-static inline int can_use_console(void)
-{
- return cpu_online(raw_smp_processor_id()) || have_callable_console();
-}
/**
* console_unlock - unlock the console system
@@ -2161,147 +2038,17 @@ static inline int can_use_console(void)
* Releases the console_lock which the caller holds on the console system
* and the console driver list.
*
- * While the console_lock was held, console output may have been buffered
- * by printk(). If this is the case, console_unlock(); emits
- * the output prior to releasing the lock.
- *
- * If there is output waiting, we wake /dev/kmsg and syslog() users.
- *
* console_unlock(); may be called from any context.
*/
void console_unlock(void)
{
- static char ext_text[CONSOLE_EXT_LOG_MAX];
- static char text[LOG_LINE_MAX + PREFIX_MAX];
- unsigned long flags;
- bool do_cond_resched, retry;
-
if (console_suspended) {
up_console_sem();
return;
}
- /*
- * Console drivers are called with interrupts disabled, so
- * @console_may_schedule should be cleared before; however, we may
- * end up dumping a lot of lines, for example, if called from
- * console registration path, and should invoke cond_resched()
- * between lines if allowable. Not doing so can cause a very long
- * scheduling stall on a slow console leading to RCU stall and
- * softlockup warnings which exacerbate the issue with more
- * messages practically incapacitating the system.
- *
- * console_trylock() is not able to detect the preemptive
- * context reliably. Therefore the value must be stored before
- * and cleared after the the "again" goto label.
- */
- do_cond_resched = console_may_schedule;
-again:
- console_may_schedule = 0;
-
- /*
- * We released the console_sem lock, so we need to recheck if
- * cpu is online and (if not) is there at least one CON_ANYTIME
- * console.
- */
- if (!can_use_console()) {
- console_locked = 0;
- up_console_sem();
- return;
- }
-
- for (;;) {
- struct printk_log *msg;
- size_t ext_len = 0;
- size_t len;
-
- printk_safe_enter_irqsave(flags);
- raw_spin_lock(&logbuf_lock);
- if (console_seq < log_first_seq) {
- len = sprintf(text,
- "** %llu printk messages dropped **\n",
- log_first_seq - console_seq);
-
- /* messages are gone, move to first one */
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- } else {
- len = 0;
- }
-skip:
- if (console_seq == log_next_seq)
- break;
-
- msg = log_from_idx(console_idx);
- if (suppress_message_printing(msg->level)) {
- /*
- * Skip record we have buffered and already printed
- * directly to the console when we received it, and
- * record that has level above the console loglevel.
- */
- console_idx = log_next(console_idx);
- console_seq++;
- goto skip;
- }
-
- len += msg_print_text(msg,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time, text + len, sizeof(text) - len);
- if (nr_ext_console_drivers) {
- ext_len = msg_print_ext_header(ext_text,
- sizeof(ext_text),
- msg, console_seq);
- ext_len += msg_print_ext_body(ext_text + ext_len,
- sizeof(ext_text) - ext_len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
- }
- console_idx = log_next(console_idx);
- console_seq++;
- raw_spin_unlock(&logbuf_lock);
-
- /*
- * While actively printing out messages, if another printk()
- * were to occur on another CPU, it may wait for this one to
- * finish. This task can not be preempted if there is a
- * waiter waiting to take over.
- */
- console_lock_spinning_enable();
-
- stop_critical_timings(); /* don't trace print latency */
- //call_console_drivers(ext_text, ext_len, text, len);
- start_critical_timings();
-
- if (console_lock_spinning_disable_and_check()) {
- printk_safe_exit_irqrestore(flags);
- return;
- }
-
- printk_safe_exit_irqrestore(flags);
-
- if (do_cond_resched)
- cond_resched();
- }
-
console_locked = 0;
-
- raw_spin_unlock(&logbuf_lock);
-
up_console_sem();
-
- /*
- * Someone could have filled up the buffer again, so re-check if there's
- * something to flush. In case we cannot trylock the console_sem again,
- * there's a new owner and the console_unlock() from them will do the
- * flush, no worries.
- */
- raw_spin_lock(&logbuf_lock);
- retry = console_seq != log_next_seq;
- raw_spin_unlock(&logbuf_lock);
- printk_safe_exit_irqrestore(flags);
-
- if (retry && console_trylock())
- goto again;
}
EXPORT_SYMBOL(console_unlock);

View File

@ -0,0 +1,92 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:51 +0100
Subject: [PATCH 13/25] printk: track seq per console
Allow each console to track which seq record was last printed. This
simplifies identifying dropped records.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/console.h | 1 +
kernel/printk/printk.c | 30 +++++++++++++++++++++++++++---
2 files changed, 28 insertions(+), 3 deletions(-)
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -153,6 +153,7 @@ struct console {
short flags;
short index;
int cflag;
+ unsigned long printk_seq;
void *data;
struct console *next;
};
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1507,6 +1507,16 @@ SYSCALL_DEFINE3(syslog, int, type, char
return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}
+static void print_console_dropped(struct console *con, u64 count)
+{
+ char text[64];
+ int len;
+
+ len = sprintf(text, "** %llu printk message%s dropped **\n",
+ count, count > 1 ? "s" : "");
+ con->write(con, text, len);
+}
+
static void format_text(struct printk_log *msg, u64 seq,
char *ext_text, size_t *ext_len,
char *text, size_t *len, bool time)
@@ -1540,7 +1550,7 @@ static void format_text(struct printk_lo
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
*/
-static void call_console_drivers(const char *ext_text, size_t ext_len,
+static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len,
const char *text, size_t len)
{
struct console *con;
@@ -1558,6 +1568,19 @@ static void call_console_drivers(const c
if (!cpu_online(raw_smp_processor_id()) &&
!(con->flags & CON_ANYTIME))
continue;
+ if (con->printk_seq >= seq)
+ continue;
+
+ con->printk_seq++;
+ if (con->printk_seq < seq) {
+ print_console_dropped(con, seq - con->printk_seq);
+ con->printk_seq = seq;
+ }
+
+ /* for suppressed messages, only seq is updated */
+ if (len == 0 && ext_len == 0)
+ continue;
+
if (con->flags & CON_EXTENDED)
con->write(con, ext_text, ext_len);
else
@@ -1797,7 +1820,7 @@ static ssize_t msg_print_ext_header(char
static ssize_t msg_print_ext_body(char *buf, size_t size,
char *dict, size_t dict_len,
char *text, size_t text_len) { return 0; }
-static void call_console_drivers(const char *ext_text, size_t ext_len,
+static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
static size_t msg_print_text(const struct printk_log *msg, bool syslog,
bool time, char *buf, size_t size) { return 0; }
@@ -2550,8 +2573,9 @@ static int printk_kthread_func(void *dat
&len, printk_time);
console_lock();
+ call_console_drivers(master_seq, ext_text,
+ ext_len, text, len);
if (len > 0 || ext_len > 0) {
- call_console_drivers(ext_text, ext_len, text, len);
boot_delay_msec(msg->level);
printk_delay();
}
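The per-console bookkeeping added to call_console_drivers() above reduces to a short comparison against the sequence number being printed; restated as a sketch for clarity (not additional code):

        if (con->printk_seq >= seq)
                continue;       /* this console already printed the record */

        con->printk_seq++;
        if (con->printk_seq < seq) {
                /* the console fell behind: report how many records it skipped */
                print_console_dropped(con, seq - con->printk_seq);
                con->printk_seq = seq;
        }

Because every console carries its own printk_seq, a slow console can lag and later report exactly how many records it dropped without disturbing the bookkeeping of the other consoles.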

View File

@ -0,0 +1,71 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:52 +0100
Subject: [PATCH 14/25] printk: do boot_delay_msec inside printk_delay
Both functions needed to be called one after the other, so just
integrate boot_delay_msec into printk_delay for simplification.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 35 +++++++++++++++++------------------
1 file changed, 17 insertions(+), 18 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1507,6 +1507,21 @@ SYSCALL_DEFINE3(syslog, int, type, char
return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}
+int printk_delay_msec __read_mostly;
+
+static inline void printk_delay(int level)
+{
+ boot_delay_msec(level);
+ if (unlikely(printk_delay_msec)) {
+ int m = printk_delay_msec;
+
+ while (m--) {
+ mdelay(1);
+ touch_nmi_watchdog();
+ }
+ }
+}
+
static void print_console_dropped(struct console *con, u64 count)
{
char text[64];
@@ -1588,20 +1603,6 @@ static void call_console_drivers(u64 seq
}
}
-int printk_delay_msec __read_mostly;
-
-static inline void printk_delay(void)
-{
- if (unlikely(printk_delay_msec)) {
- int m = printk_delay_msec;
-
- while (m--) {
- mdelay(1);
- touch_nmi_watchdog();
- }
- }
-}
-
static inline u32 printk_caller_id(void)
{
return in_task() ? task_pid_nr(current) :
@@ -2575,10 +2576,8 @@ static int printk_kthread_func(void *dat
console_lock();
call_console_drivers(master_seq, ext_text,
ext_len, text, len);
- if (len > 0 || ext_len > 0) {
- boot_delay_msec(msg->level);
- printk_delay();
- }
+ if (len > 0 || ext_len > 0)
+ printk_delay(msg->level);
console_unlock();
}

View File

@ -0,0 +1,118 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:53 +0100
Subject: [PATCH 15/25] printk: print history for new consoles
When new consoles register, they currently print how many messages
they have missed. However, many (or all) of those messages may still
be in the ring buffer. Add functionality to print as much of the
history as available. This is a clean replacement of the old
exclusive console hack.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/console.h | 1
kernel/printk/printk.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 76 insertions(+)
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -154,6 +154,7 @@ struct console {
short index;
int cflag;
unsigned long printk_seq;
+ int wrote_history;
void *data;
struct console *next;
};
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1560,6 +1560,77 @@ static void format_text(struct printk_lo
}
}
+static void printk_write_history(struct console *con, u64 master_seq)
+{
+ struct prb_iterator iter;
+ bool time = printk_time;
+ static char *ext_text;
+ static char *text;
+ static char *buf;
+ u64 seq;
+
+ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
+ buf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!ext_text || !text || !buf)
+ return;
+
+ if (!(con->flags & CON_ENABLED))
+ goto out;
+
+ if (!con->write)
+ goto out;
+
+ if (!cpu_online(raw_smp_processor_id()) &&
+ !(con->flags & CON_ANYTIME))
+ goto out;
+
+ prb_iter_init(&iter, &printk_rb, NULL);
+
+ for (;;) {
+ struct printk_log *msg;
+ size_t ext_len;
+ size_t len;
+ int ret;
+
+ ret = prb_iter_next(&iter, buf, PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ prb_iter_init(&iter, &printk_rb, NULL);
+ continue;
+ }
+
+ if (seq > master_seq)
+ break;
+
+ con->printk_seq++;
+ if (con->printk_seq < seq) {
+ print_console_dropped(con, seq - con->printk_seq);
+ con->printk_seq = seq;
+ }
+
+ msg = (struct printk_log *)buf;
+ format_text(msg, master_seq, ext_text, &ext_len, text,
+ &len, time);
+
+ if (len == 0 && ext_len == 0)
+ continue;
+
+ if (con->flags & CON_EXTENDED)
+ con->write(con, ext_text, ext_len);
+ else
+ con->write(con, text, len);
+
+ printk_delay(msg->level);
+ }
+out:
+ con->wrote_history = 1;
+ kfree(ext_text);
+ kfree(text);
+ kfree(buf);
+}
+
/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
@@ -1578,6 +1649,10 @@ static void call_console_drivers(u64 seq
for_each_console(con) {
if (!(con->flags & CON_ENABLED))
continue;
+ if (!con->wrote_history) {
+ printk_write_history(con, seq);
+ continue;
+ }
if (!con->write)
continue;
if (!cpu_online(raw_smp_processor_id()) &&

View File

@ -0,0 +1,91 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:54 +0100
Subject: [PATCH 16/25] printk: implement CON_PRINTBUFFER
If the CON_PRINTBUFFER flag is not set, do not replay the history
for that console.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 34 ++++++----------------------------
1 file changed, 6 insertions(+), 28 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -419,10 +419,6 @@ static u32 log_first_idx;
static u64 log_next_seq;
static u32 log_next_idx;
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
-
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
static u32 clear_idx;
@@ -1650,8 +1646,12 @@ static void call_console_drivers(u64 seq
if (!(con->flags & CON_ENABLED))
continue;
if (!con->wrote_history) {
- printk_write_history(con, seq);
- continue;
+ if (con->flags & CON_PRINTBUFFER) {
+ printk_write_history(con, seq);
+ continue;
+ }
+ con->wrote_history = 1;
+ con->printk_seq = seq - 1;
}
if (!con->write)
continue;
@@ -1881,8 +1881,6 @@ EXPORT_SYMBOL(printk);
static u64 syslog_seq;
static u32 syslog_idx;
-static u64 console_seq;
-static u32 console_idx;
static u64 log_first_seq;
static u32 log_first_idx;
static u64 log_next_seq;
@@ -2206,15 +2204,6 @@ void console_flush_on_panic(enum con_flu
*/
console_trylock();
console_may_schedule = 0;
-
- if (mode == CONSOLE_REPLAY_ALL) {
- unsigned long flags;
-
- logbuf_lock_irqsave(flags);
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- logbuf_unlock_irqrestore(flags);
- }
console_unlock();
}
@@ -2293,7 +2282,6 @@ early_param("keep_bootcon", keep_bootcon
void register_console(struct console *newcon)
{
int i;
- unsigned long flags;
struct console *bcon = NULL;
struct console_cmdline *c;
static bool has_preferred;
@@ -2409,16 +2397,6 @@ void register_console(struct console *ne
if (newcon->flags & CON_EXTENDED)
nr_ext_console_drivers++;
- if (newcon->flags & CON_PRINTBUFFER) {
- /*
- * console_unlock(); will print out the buffered messages
- * for us.
- */
- logbuf_lock_irqsave(flags);
- console_seq = syslog_seq;
- console_idx = syslog_idx;
- logbuf_unlock_irqrestore(flags);
- }
console_unlock();
console_sysfs_notify();

View File

@ -0,0 +1,99 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:55 +0100
Subject: [PATCH 17/25] printk: add processor number to output
It can be difficult to sort printk output if multiple processors are
printing simultaneously. Add the processor number to the printk
output to allow the messages to be sorted.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 19 +++++++++++++++----
1 file changed, 15 insertions(+), 4 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -348,6 +348,7 @@ enum log_flags {
struct printk_log {
u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 cpu; /* cpu that generated record */
u16 len; /* length of entire record */
u16 text_len; /* length of text buffer */
u16 dict_len; /* length of dictionary buffer */
@@ -499,7 +500,7 @@ static u32 log_next(u32 idx)
/* insert record into the buffer, discard old ones, update heads */
static int log_store(u32 caller_id, int facility, int level,
- enum log_flags flags, u64 ts_nsec,
+ enum log_flags flags, u64 ts_nsec, u16 cpu,
const char *dict, u16 dict_len,
const char *text, u16 text_len)
{
@@ -533,6 +534,7 @@ static int log_store(u32 caller_id, int
#ifdef CONFIG_PRINTK_CALLER
msg->caller_id = caller_id;
#endif
+ msg->cpu = cpu;
msg->len = size;
/* insert message */
@@ -606,9 +608,9 @@ static ssize_t msg_print_ext_header(char
do_div(ts_usec, 1000);
- return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
+ return scnprintf(buf, size, "%u,%llu,%llu,%c%s,%hu;",
(msg->facility << 3) | msg->level, seq, ts_usec,
- msg->flags & LOG_CONT ? 'c' : '-', caller);
+ msg->flags & LOG_CONT ? 'c' : '-', caller, msg->cpu);
}
static ssize_t msg_print_ext_body(char *buf, size_t size,
@@ -1142,6 +1144,11 @@ static inline void boot_delay_msec(int l
static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
+static size_t print_cpu(u16 cpu, char *buf)
+{
+ return sprintf(buf, "%03hu: ", cpu);
+}
+
static size_t print_syslog(unsigned int level, char *buf)
{
return sprintf(buf, "<%u>", level);
@@ -1185,6 +1192,7 @@ static size_t print_prefix(const struct
buf[len++] = ' ';
buf[len] = '\0';
}
+ len += print_cpu(msg->cpu, buf + len);
return len;
}
@@ -1760,6 +1768,7 @@ asmlinkage int vprintk_emit(int facility
u64 ts_nsec;
char *text;
char *rbuf;
+ int cpu;
ts_nsec = local_clock();
@@ -1769,6 +1778,8 @@ asmlinkage int vprintk_emit(int facility
return printed_len;
}
+ cpu = raw_smp_processor_id();
+
text = rbuf;
text_len = vscnprintf(text, PRINTK_SPRINT_MAX, fmt, args);
@@ -1803,7 +1814,7 @@ asmlinkage int vprintk_emit(int facility
if (dict)
lflags |= LOG_NEWLINE;
- printed_len = log_store(caller_id, facility, level, lflags, ts_nsec,
+ printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu,
dict, dictlen, text, text_len);
prb_commit(&h);

View File

@ -0,0 +1,64 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:56 +0100
Subject: [PATCH 18/25] console: add write_atomic interface
Add a write_atomic callback to the console. This is an optional
function for console drivers. The function must be atomic (including
NMI safe) for writing to the console.
Console drivers must still implement the write callback. The
write_atomic callback will only be used for emergency messages.
Creating an NMI safe write_atomic that must synchronize with write
requires a careful implementation of the console driver. To aid with
the implementation, a set of console_atomic_* functions are provided:
void console_atomic_lock(unsigned int *flags);
void console_atomic_unlock(unsigned int flags);
These functions synchronize using the processor-reentrant cpu lock of
the printk buffer.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/console.h | 4 ++++
kernel/printk/printk.c | 12 ++++++++++++
2 files changed, 16 insertions(+)
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -145,6 +145,7 @@ static inline int con_debug_leave(void)
struct console {
char name[16];
void (*write)(struct console *, const char *, unsigned);
+ void (*write_atomic)(struct console *, const char *, unsigned);
int (*read)(struct console *, char *, unsigned);
struct tty_driver *(*device)(struct console *, int *);
void (*unblank)(void);
@@ -236,4 +237,7 @@ extern void console_init(void);
void dummycon_register_output_notifier(struct notifier_block *nb);
void dummycon_unregister_output_notifier(struct notifier_block *nb);
+extern void console_atomic_lock(unsigned int *flags);
+extern void console_atomic_unlock(unsigned int flags);
+
#endif /* _LINUX_CONSOLE_H */
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3044,3 +3044,15 @@ void kmsg_dump_rewind(struct kmsg_dumper
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
#endif
+
+void console_atomic_lock(unsigned int *flags)
+{
+ prb_lock(&printk_cpulock, flags);
+}
+EXPORT_SYMBOL(console_atomic_lock);
+
+void console_atomic_unlock(unsigned int flags)
+{
+ prb_unlock(&printk_cpulock, flags);
+}
+EXPORT_SYMBOL(console_atomic_unlock);

View File

@ -0,0 +1,272 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:57 +0100
Subject: [PATCH 19/25] printk: introduce emergency messages
Console messages are generally either critical or non-critical.
Critical messages are messages such as crashes or sysrq output.
Critical messages should never be lost because generally they provide
important debugging information.
Since all console messages are output via a fully preemptible printk
kernel thread, it is possible that messages are not output because
that thread cannot be scheduled (BUG in scheduler, run-away RT task,
etc).
To allow critical messages to be output independent of the
schedulability of the printk task, introduce an emergency mechanism
that _immediately_ outputs the message to the consoles. To avoid
possible unbounded latency issues, the emergency mechanism only
outputs the printk line provided by the caller and ignores any
pending messages in the log buffer.
Critical messages are identified as messages (by default) with log
level LOGLEVEL_WARNING or more critical. This is configurable via the
kernel option CONSOLE_LOGLEVEL_EMERGENCY.
Any messages output as emergency messages are skipped by the printk
thread on those consoles that output the emergency message.
In order for a console driver to support emergency messages, the
write_atomic function must be implemented by the driver. If not
implemented, the emergency messages are handled like all other
messages and are printed by the printk thread.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/printk.h | 2
kernel/printk/printk.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++---
lib/Kconfig.debug | 17 +++++++
3 files changed, 124 insertions(+), 6 deletions(-)
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -58,6 +58,7 @@ static inline const char *printk_skip_he
*/
#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
#define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET
+#define CONSOLE_LOGLEVEL_EMERGENCY CONFIG_CONSOLE_LOGLEVEL_EMERGENCY
extern int console_printk[];
@@ -65,6 +66,7 @@ extern int console_printk[];
#define default_message_loglevel (console_printk[1])
#define minimum_console_loglevel (console_printk[2])
#define default_console_loglevel (console_printk[3])
+#define emergency_console_loglevel (console_printk[4])
static inline void console_silent(void)
{
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -46,6 +46,7 @@
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/kthread.h>
+#include <linux/clocksource.h>
#include <linux/printk_ringbuffer.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
@@ -62,11 +63,12 @@
#include "braille.h"
#include "internal.h"
-int console_printk[4] = {
+int console_printk[5] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
+ CONSOLE_LOGLEVEL_EMERGENCY, /* emergency_console_loglevel */
};
EXPORT_SYMBOL_GPL(console_printk);
@@ -498,6 +500,9 @@ static u32 log_next(u32 idx)
return idx + msg->len;
}
+static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu,
+ char *text, u16 text_len);
+
/* insert record into the buffer, discard old ones, update heads */
static int log_store(u32 caller_id, int facility, int level,
enum log_flags flags, u64 ts_nsec, u16 cpu,
@@ -1641,7 +1646,7 @@ static void printk_write_history(struct
* The console_lock must be held.
*/
static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len,
- const char *text, size_t len)
+ const char *text, size_t len, int level)
{
struct console *con;
@@ -1661,6 +1666,18 @@ static void call_console_drivers(u64 seq
con->wrote_history = 1;
con->printk_seq = seq - 1;
}
+ if (con->write_atomic && level < emergency_console_loglevel) {
+ /* skip emergency messages, already printed */
+ if (con->printk_seq < seq)
+ con->printk_seq = seq;
+ continue;
+ }
+ if (con->flags & CON_BOOT) {
+ /* skip emergency messages, already printed */
+ if (con->printk_seq < seq)
+ con->printk_seq = seq;
+ continue;
+ }
if (!con->write)
continue;
if (!cpu_online(raw_smp_processor_id()) &&
@@ -1780,8 +1797,12 @@ asmlinkage int vprintk_emit(int facility
cpu = raw_smp_processor_id();
- text = rbuf;
- text_len = vscnprintf(text, PRINTK_SPRINT_MAX, fmt, args);
+ /*
+ * If this turns out to be an emergency message, there
+ * may need to be a prefix added. Leave room for it.
+ */
+ text = rbuf + PREFIX_MAX;
+ text_len = vscnprintf(text, PRINTK_SPRINT_MAX - PREFIX_MAX, fmt, args);
/* strip and flag a trailing newline */
if (text_len && text[text_len-1] == '\n') {
@@ -1814,6 +1835,14 @@ asmlinkage int vprintk_emit(int facility
if (dict)
lflags |= LOG_NEWLINE;
+ /*
+ * NOTE:
+ * - rbuf points to beginning of allocated buffer
+ * - text points to beginning of text
+ * - there is room before text for prefix
+ */
+ printk_emergency(rbuf, level, ts_nsec, cpu, text, text_len);
+
printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu,
dict, dictlen, text, text_len);
@@ -1906,7 +1935,7 @@ static ssize_t msg_print_ext_body(char *
char *dict, size_t dict_len,
char *text, size_t text_len) { return 0; }
static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len,
- const char *text, size_t len) {}
+ const char *text, size_t len, int level) {}
static size_t msg_print_text(const struct printk_log *msg, bool syslog,
bool time, char *buf, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
@@ -2639,7 +2668,7 @@ static int printk_kthread_func(void *dat
console_lock();
call_console_drivers(master_seq, ext_text,
- ext_len, text, len);
+ ext_len, text, len, msg->level);
if (len > 0 || ext_len > 0)
printk_delay(msg->level);
console_unlock();
@@ -3043,6 +3072,76 @@ void kmsg_dump_rewind(struct kmsg_dumper
logbuf_unlock_irqrestore(flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+
+static bool console_can_emergency(int level)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (con->write_atomic && level < emergency_console_loglevel)
+ return true;
+ if (con->write && (con->flags & CON_BOOT))
+ return true;
+ }
+ return false;
+}
+
+static void call_emergency_console_drivers(int level, const char *text,
+ size_t text_len)
+{
+ struct console *con;
+
+ for_each_console(con) {
+ if (!(con->flags & CON_ENABLED))
+ continue;
+ if (con->write_atomic && level < emergency_console_loglevel) {
+ con->write_atomic(con, text, text_len);
+ continue;
+ }
+ if (con->write && (con->flags & CON_BOOT)) {
+ con->write(con, text, text_len);
+ continue;
+ }
+ }
+}
+
+static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu,
+ char *text, u16 text_len)
+{
+ struct printk_log msg;
+ size_t prefix_len;
+
+ if (!console_can_emergency(level))
+ return;
+
+ msg.level = level;
+ msg.ts_nsec = ts_nsec;
+ msg.cpu = cpu;
+ msg.facility = 0;
+
+ /* "text" must have PREFIX_MAX preceding bytes available */
+
+ prefix_len = print_prefix(&msg,
+ console_msg_format & MSG_FORMAT_SYSLOG,
+ printk_time, buffer);
+ /* move the prefix forward to the beginning of the message text */
+ text -= prefix_len;
+ memmove(text, buffer, prefix_len);
+ text_len += prefix_len;
+
+ text[text_len++] = '\n';
+
+ call_emergency_console_drivers(level, text, text_len);
+
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
+ touch_nmi_watchdog();
+
+ printk_delay(level);
+}
#endif
void console_atomic_lock(unsigned int *flags)
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -61,6 +61,23 @@ config CONSOLE_LOGLEVEL_QUIET
will be used as the loglevel. IOW passing "quiet" will be the
equivalent of passing "loglevel=<CONSOLE_LOGLEVEL_QUIET>"
+config CONSOLE_LOGLEVEL_EMERGENCY
+ int "Emergency console loglevel (1-15)"
+ range 1 15
+ default "5"
+ help
+ The loglevel to determine if a console message is an emergency
+ message.
+
+ If supported by the console driver, emergency messages will be
+ flushed to the console immediately. This can cause significant system
+ latencies so the value should be set such that only significant
+ messages are classified as emergency messages.
+
+ Setting a default here is equivalent to passing in
+ emergency_loglevel=<x> in the kernel bootargs. emergency_loglevel=<x>
+ continues to override whatever value is specified here as well.
+
config MESSAGE_LOGLEVEL_DEFAULT
int "Default message log level (1-7)"
range 1 7

View File

@ -0,0 +1,484 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:58 +0100
Subject: [PATCH 20/25] serial: 8250: implement write_atomic
Implement a non-sleeping NMI-safe write_atomic console function in
order to support emergency printk messages.
Since interrupts need to be disabled during transmit, all usage of
the IER register was wrapped with access functions that use the
console_atomic_lock function to synchronize register access while
tracking the state of the interrupts. This was necessary because
write_atomic can be called from an NMI context that has
preempted write_atomic.
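A sketch of the intended use, assuming it lives in 8250_port.c next to the
existing helpers (my_poll_putchar() is hypothetical; wait_for_xmitr(),
BOTH_EMPTY, clear_ier() and restore_ier() are the file-local helpers this
patch uses or adds):

  static void my_poll_putchar(struct uart_8250_port *up, unsigned char c)
  {
          /* mask UART interrupts; nesting is tracked by a counter so an
           * NMI printer that preempts us leaves IER consistent
           */
          clear_ier(up);

          wait_for_xmitr(up, BOTH_EMPTY);
          serial_port_out(&up->port, UART_TX, c);
          wait_for_xmitr(up, BOTH_EMPTY);

          /* the saved IER value is written back by the outermost caller only */
          restore_ier(up);
  }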
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/tty/serial/8250/8250.h | 22 +++++
drivers/tty/serial/8250/8250_core.c | 19 +++-
drivers/tty/serial/8250/8250_dma.c | 4
drivers/tty/serial/8250/8250_port.c | 154 ++++++++++++++++++++++++++----------
include/linux/serial_8250.h | 5 +
5 files changed, 157 insertions(+), 47 deletions(-)
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -96,6 +96,10 @@ struct serial8250_config {
#define SERIAL8250_SHARE_IRQS 0
#endif
+void set_ier(struct uart_8250_port *up, unsigned char ier);
+void clear_ier(struct uart_8250_port *up);
+void restore_ier(struct uart_8250_port *up);
+
#define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \
{ \
.iobase = _base, \
@@ -139,6 +143,15 @@ static inline bool serial8250_set_THRI(s
return true;
}
+static inline bool serial8250_set_THRI_sier(struct uart_8250_port *up)
+{
+ if (up->ier & UART_IER_THRI)
+ return false;
+ up->ier |= UART_IER_THRI;
+ set_ier(up, up->ier);
+ return true;
+}
+
static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
{
if (!(up->ier & UART_IER_THRI))
@@ -148,6 +161,15 @@ static inline bool serial8250_clear_THRI
return true;
}
+static inline bool serial8250_clear_THRI_sier(struct uart_8250_port *up)
+{
+ if (!(up->ier & UART_IER_THRI))
+ return false;
+ up->ier &= ~UART_IER_THRI;
+ set_ier(up, up->ier);
+ return true;
+}
+
struct uart_8250_port *serial8250_get_port(int line);
void serial8250_rpm_get(struct uart_8250_port *p);
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -265,7 +265,7 @@ static void serial8250_timeout(struct ti
static void serial8250_backup_timeout(struct timer_list *t)
{
struct uart_8250_port *up = from_timer(up, t, timer);
- unsigned int iir, ier = 0, lsr;
+ unsigned int iir, lsr;
unsigned long flags;
spin_lock_irqsave(&up->port.lock, flags);
@@ -274,10 +274,8 @@ static void serial8250_backup_timeout(st
* Must disable interrupts or else we risk racing with the interrupt
* based handler.
*/
- if (up->port.irq) {
- ier = serial_in(up, UART_IER);
- serial_out(up, UART_IER, 0);
- }
+ if (up->port.irq)
+ clear_ier(up);
iir = serial_in(up, UART_IIR);
@@ -300,7 +298,7 @@ static void serial8250_backup_timeout(st
serial8250_tx_chars(up);
if (up->port.irq)
- serial_out(up, UART_IER, ier);
+ restore_ier(up);
spin_unlock_irqrestore(&up->port.lock, flags);
@@ -578,6 +576,14 @@ serial8250_register_ports(struct uart_dr
#ifdef CONFIG_SERIAL_8250_CONSOLE
+static void univ8250_console_write_atomic(struct console *co, const char *s,
+ unsigned int count)
+{
+ struct uart_8250_port *up = &serial8250_ports[co->index];
+
+ serial8250_console_write_atomic(up, s, count);
+}
+
static void univ8250_console_write(struct console *co, const char *s,
unsigned int count)
{
@@ -663,6 +669,7 @@ static int univ8250_console_match(struct
static struct console univ8250_console = {
.name = "ttyS",
+ .write_atomic = univ8250_console_write_atomic,
.write = univ8250_console_write,
.device = uart_console_device,
.setup = univ8250_console_setup,
--- a/drivers/tty/serial/8250/8250_dma.c
+++ b/drivers/tty/serial/8250/8250_dma.c
@@ -35,7 +35,7 @@ static void __dma_tx_complete(void *para
ret = serial8250_tx_dma(p);
if (ret)
- serial8250_set_THRI(p);
+ serial8250_set_THRI_sier(p);
spin_unlock_irqrestore(&p->port.lock, flags);
}
@@ -98,7 +98,7 @@ int serial8250_tx_dma(struct uart_8250_p
dma_async_issue_pending(dma->txchan);
if (dma->tx_err) {
dma->tx_err = 0;
- serial8250_clear_THRI(p);
+ serial8250_clear_THRI_sier(p);
}
return 0;
err:
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -721,7 +721,7 @@ static void serial8250_set_sleep(struct
serial_out(p, UART_EFR, UART_EFR_ECB);
serial_out(p, UART_LCR, 0);
}
- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0);
+ set_ier(p, sleep ? UART_IERX_SLEEP : 0);
if (p->capabilities & UART_CAP_EFR) {
serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
serial_out(p, UART_EFR, efr);
@@ -1390,7 +1390,7 @@ static void serial8250_stop_rx(struct ua
up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
up->port.read_status_mask &= ~UART_LSR_DR;
- serial_port_out(port, UART_IER, up->ier);
+ set_ier(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1408,7 +1408,7 @@ static void __do_stop_tx_rs485(struct ua
serial8250_clear_and_reinit_fifos(p);
p->ier |= UART_IER_RLSI | UART_IER_RDI;
- serial_port_out(&p->port, UART_IER, p->ier);
+ set_ier(p, p->ier);
}
}
static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t)
@@ -1459,7 +1459,7 @@ static void __stop_tx_rs485(struct uart_
static inline void __do_stop_tx(struct uart_8250_port *p)
{
- if (serial8250_clear_THRI(p))
+ if (serial8250_clear_THRI_sier(p))
serial8250_rpm_put_tx(p);
}
@@ -1509,7 +1509,7 @@ static inline void __start_tx(struct uar
if (up->dma && !up->dma->tx_dma(up))
return;
- if (serial8250_set_THRI(up)) {
+ if (serial8250_set_THRI_sier(up)) {
if (up->bugs & UART_BUG_TXEN) {
unsigned char lsr;
@@ -1616,7 +1616,7 @@ static void serial8250_disable_ms(struct
mctrl_gpio_disable_ms(up->gpios);
up->ier &= ~UART_IER_MSI;
- serial_port_out(port, UART_IER, up->ier);
+ set_ier(up, up->ier);
}
static void serial8250_enable_ms(struct uart_port *port)
@@ -1632,7 +1632,7 @@ static void serial8250_enable_ms(struct
up->ier |= UART_IER_MSI;
serial8250_rpm_get(up);
- serial_port_out(port, UART_IER, up->ier);
+ set_ier(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1991,6 +1991,52 @@ static void wait_for_xmitr(struct uart_8
}
}
+static atomic_t ier_counter = ATOMIC_INIT(0);
+static atomic_t ier_value = ATOMIC_INIT(0);
+
+void set_ier(struct uart_8250_port *up, unsigned char ier)
+{
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+
+ console_atomic_lock(&flags);
+ if (atomic_read(&ier_counter) > 0)
+ atomic_set(&ier_value, ier);
+ else
+ serial_port_out(port, UART_IER, ier);
+ console_atomic_unlock(flags);
+}
+
+void clear_ier(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ unsigned int ier_cleared = 0;
+ unsigned int flags;
+ unsigned int ier;
+
+ console_atomic_lock(&flags);
+ atomic_inc(&ier_counter);
+ ier = serial_port_in(port, UART_IER);
+ if (up->capabilities & UART_CAP_UUE)
+ ier_cleared = UART_IER_UUE;
+ if (ier != ier_cleared) {
+ serial_port_out(port, UART_IER, ier_cleared);
+ atomic_set(&ier_value, ier);
+ }
+ console_atomic_unlock(flags);
+}
+
+void restore_ier(struct uart_8250_port *up)
+{
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+
+ console_atomic_lock(&flags);
+ if (atomic_fetch_dec(&ier_counter) == 1)
+ serial_port_out(port, UART_IER, atomic_read(&ier_value));
+ console_atomic_unlock(flags);
+}
+
#ifdef CONFIG_CONSOLE_POLL
/*
* Console polling routines for writing and reading from the uart while
@@ -2022,18 +2068,10 @@ static int serial8250_get_poll_char(stru
static void serial8250_put_poll_char(struct uart_port *port,
unsigned char c)
{
- unsigned int ier;
struct uart_8250_port *up = up_to_u8250p(port);
serial8250_rpm_get(up);
- /*
- * First save the IER then disable the interrupts
- */
- ier = serial_port_in(port, UART_IER);
- if (up->capabilities & UART_CAP_UUE)
- serial_port_out(port, UART_IER, UART_IER_UUE);
- else
- serial_port_out(port, UART_IER, 0);
+ clear_ier(up);
wait_for_xmitr(up, BOTH_EMPTY);
/*
@@ -2046,7 +2084,7 @@ static void serial8250_put_poll_char(str
* and restore the IER
*/
wait_for_xmitr(up, BOTH_EMPTY);
- serial_port_out(port, UART_IER, ier);
+ restore_ier(up);
serial8250_rpm_put(up);
}
@@ -2358,7 +2396,7 @@ void serial8250_do_shutdown(struct uart_
*/
spin_lock_irqsave(&port->lock, flags);
up->ier = 0;
- serial_port_out(port, UART_IER, 0);
+ set_ier(up, 0);
spin_unlock_irqrestore(&port->lock, flags);
synchronize_irq(port->irq);
@@ -2643,7 +2681,7 @@ serial8250_do_set_termios(struct uart_po
if (up->capabilities & UART_CAP_RTOIE)
up->ier |= UART_IER_RTOIE;
- serial_port_out(port, UART_IER, up->ier);
+ set_ier(up, up->ier);
if (up->capabilities & UART_CAP_EFR) {
unsigned char efr = 0;
@@ -3107,7 +3145,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_default
#ifdef CONFIG_SERIAL_8250_CONSOLE
-static void serial8250_console_putchar(struct uart_port *port, int ch)
+static void serial8250_console_putchar_locked(struct uart_port *port, int ch)
{
struct uart_8250_port *up = up_to_u8250p(port);
@@ -3115,6 +3153,18 @@ static void serial8250_console_putchar(s
serial_port_out(port, UART_TX, ch);
}
+static void serial8250_console_putchar(struct uart_port *port, int ch)
+{
+ struct uart_8250_port *up = up_to_u8250p(port);
+ unsigned int flags;
+
+ wait_for_xmitr(up, UART_LSR_THRE);
+
+ console_atomic_lock(&flags);
+ serial8250_console_putchar_locked(port, ch);
+ console_atomic_unlock(flags);
+}
+
/*
* Restore serial console when h/w power-off detected
*/
@@ -3136,6 +3186,42 @@ static void serial8250_console_restore(s
serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS);
}
+void serial8250_console_write_atomic(struct uart_8250_port *up,
+ const char *s, unsigned int count)
+{
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ bool locked;
+
+ console_atomic_lock(&flags);
+
+ /*
+ * If possible, keep any other CPUs from working with the
+ * UART until the atomic message is completed. This helps
+ * to keep the output more orderly.
+ */
+ locked = spin_trylock(&port->lock);
+
+ touch_nmi_watchdog();
+
+ clear_ier(up);
+
+ if (atomic_fetch_inc(&up->console_printing)) {
+ uart_console_write(port, "\n", 1,
+ serial8250_console_putchar_locked);
+ }
+ uart_console_write(port, s, count, serial8250_console_putchar_locked);
+ atomic_dec(&up->console_printing);
+
+ wait_for_xmitr(up, BOTH_EMPTY);
+ restore_ier(up);
+
+ if (locked)
+ spin_unlock(&port->lock);
+
+ console_atomic_unlock(flags);
+}
+
/*
* Print a string to the serial port trying not to disturb
* any possible real use of the port...
@@ -3147,27 +3233,13 @@ void serial8250_console_write(struct uar
{
struct uart_port *port = &up->port;
unsigned long flags;
- unsigned int ier;
- int locked = 1;
touch_nmi_watchdog();
serial8250_rpm_get(up);
+ spin_lock_irqsave(&port->lock, flags);
- if (oops_in_progress)
- locked = spin_trylock_irqsave(&port->lock, flags);
- else
- spin_lock_irqsave(&port->lock, flags);
-
- /*
- * First save the IER then disable the interrupts
- */
- ier = serial_port_in(port, UART_IER);
-
- if (up->capabilities & UART_CAP_UUE)
- serial_port_out(port, UART_IER, UART_IER_UUE);
- else
- serial_port_out(port, UART_IER, 0);
+ clear_ier(up);
/* check scratch reg to see if port powered off during system sleep */
if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
@@ -3175,14 +3247,16 @@ void serial8250_console_write(struct uar
up->canary = 0;
}
+ atomic_inc(&up->console_printing);
uart_console_write(port, s, count, serial8250_console_putchar);
+ atomic_dec(&up->console_printing);
/*
* Finally, wait for transmitter to become empty
* and restore the IER
*/
wait_for_xmitr(up, BOTH_EMPTY);
- serial_port_out(port, UART_IER, ier);
+ restore_ier(up);
/*
* The receive handling will happen properly because the
@@ -3194,8 +3268,7 @@ void serial8250_console_write(struct uar
if (up->msr_saved_flags)
serial8250_modem_status(up);
- if (locked)
- spin_unlock_irqrestore(&port->lock, flags);
+ spin_unlock_irqrestore(&port->lock, flags);
serial8250_rpm_put(up);
}
@@ -3216,6 +3289,7 @@ static unsigned int probe_baud(struct ua
int serial8250_console_setup(struct uart_port *port, char *options, bool probe)
{
+ struct uart_8250_port *up = up_to_u8250p(port);
int baud = 9600;
int bits = 8;
int parity = 'n';
@@ -3224,6 +3298,8 @@ int serial8250_console_setup(struct uart
if (!port->iobase && !port->membase)
return -ENODEV;
+ atomic_set(&up->console_printing, 0);
+
if (options)
uart_parse_options(options, &baud, &parity, &bits, &flow);
else if (probe)
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -7,6 +7,7 @@
#ifndef _LINUX_SERIAL_8250_H
#define _LINUX_SERIAL_8250_H
+#include <linux/atomic.h>
#include <linux/serial_core.h>
#include <linux/serial_reg.h>
#include <linux/platform_device.h>
@@ -123,6 +124,8 @@ struct uart_8250_port {
#define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA
unsigned char msr_saved_flags;
+ atomic_t console_printing;
+
struct uart_8250_dma *dma;
const struct uart_8250_ops *ops;
@@ -174,6 +177,8 @@ void serial8250_init_port(struct uart_82
void serial8250_set_defaults(struct uart_8250_port *up);
void serial8250_console_write(struct uart_8250_port *up, const char *s,
unsigned int count);
+void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s,
+ unsigned int count);
int serial8250_console_setup(struct uart_port *port, char *options, bool probe);
extern void serial8250_set_isa_configurator(void (*v)

View File

@ -0,0 +1,132 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:29:59 +0100
Subject: [PATCH 21/25] printk: implement KERN_CONT
Implement KERN_CONT based on the printing CPU rather than on the
printing task. As long as the KERN_CONT messages come from the same
CPU and no non-KERN_CONT message arrives in between, they are assumed
to belong together.
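For reference, a typical continuation sequence that the per-CPU tracking
is meant to keep together (standard printk usage; report_feature() is a
hypothetical caller, and the fragments are only merged if no other message
from the same CPU lands in between):

  #include <linux/printk.h>
  #include <linux/types.h>

  static void report_feature(bool ok)
  {
          printk(KERN_INFO "feature X:");
          /* same CPU, no intervening message: merged into one record */
          printk(KERN_CONT " %s\n", ok ? "ok" : "missing");
  }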
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 65 +++++++++++++++++++++++++++----------------------
1 file changed, 37 insertions(+), 28 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1709,8 +1709,6 @@ static inline u32 printk_caller_id(void)
0x80000000 + raw_smp_processor_id();
}
-/* FIXME: no support for LOG_CONT */
-#if 0
/*
* Continuation lines are buffered, and not committed to the record buffer
* until the line is complete, or a race forces it. The line fragments
@@ -1721,52 +1719,55 @@ static struct cont {
char buf[LOG_LINE_MAX];
size_t len; /* length == 0 means unused buffer */
u32 caller_id; /* printk_caller_id() of first print */
+ int cpu_owner; /* cpu of first print */
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
u8 facility; /* log facility of first message */
enum log_flags flags; /* prefix, newline flags */
-} cont;
+} cont[2];
-static void cont_flush(void)
+static void cont_flush(int ctx)
{
- if (cont.len == 0)
+ struct cont *c = &cont[ctx];
+
+ if (c->len == 0)
return;
- log_store(cont.caller_id, cont.facility, cont.level, cont.flags,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.len = 0;
+ log_store(c->caller_id, c->facility, c->level, c->flags,
+ c->ts_nsec, c->cpu_owner, NULL, 0, c->buf, c->len);
+ c->len = 0;
}
-static bool cont_add(u32 caller_id, int facility, int level,
+static void cont_add(int ctx, int cpu, u32 caller_id, int facility, int level,
enum log_flags flags, const char *text, size_t len)
{
+ struct cont *c = &cont[ctx];
+
+ if (cpu != c->cpu_owner || !(flags & LOG_CONT))
+ cont_flush(ctx);
+
/* If the line gets too long, split it up in separate records. */
- if (cont.len + len > sizeof(cont.buf)) {
- cont_flush();
- return false;
- }
+ while (c->len + len > sizeof(c->buf))
+ cont_flush(ctx);
- if (!cont.len) {
- cont.facility = facility;
- cont.level = level;
- cont.caller_id = caller_id;
- cont.ts_nsec = local_clock();
- cont.flags = flags;
+ if (!c->len) {
+ c->facility = facility;
+ c->level = level;
+ c->caller_id = caller_id;
+ c->ts_nsec = local_clock();
+ c->flags = flags;
+ c->cpu_owner = cpu;
}
- memcpy(cont.buf + cont.len, text, len);
- cont.len += len;
+ memcpy(c->buf + c->len, text, len);
+ c->len += len;
// The original flags come from the first line,
// but later continuations can add a newline.
if (flags & LOG_NEWLINE) {
- cont.flags |= LOG_NEWLINE;
- cont_flush();
+ c->flags |= LOG_NEWLINE;
}
-
- return true;
}
-#endif /* 0 */
/* ring buffer used as memory allocator for temporary sprint buffers */
DECLARE_STATIC_PRINTKRB(sprint_rb,
@@ -1778,6 +1779,7 @@ asmlinkage int vprintk_emit(int facility
const char *fmt, va_list args)
{
const u32 caller_id = printk_caller_id();
+ int ctx = !!in_nmi();
enum log_flags lflags = 0;
int printed_len = 0;
struct prb_handle h;
@@ -1843,8 +1845,15 @@ asmlinkage int vprintk_emit(int facility
*/
printk_emergency(rbuf, level, ts_nsec, cpu, text, text_len);
- printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu,
- dict, dictlen, text, text_len);
+ if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) {
+ cont_add(ctx, cpu, caller_id, facility, level, lflags, text, text_len);
+ printed_len = text_len;
+ } else {
+ if (cpu == cont[ctx].cpu_owner)
+ cont_flush(ctx);
+ printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu,
+ dict, dictlen, text, text_len);
+ }
prb_commit(&h);
return printed_len;

View File

@ -0,0 +1,304 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:30:00 +0100
Subject: [PATCH 22/25] printk: implement /dev/kmsg
Since printk messages are now logged to a new ring buffer, update
the /dev/kmsg functions to pull the messages from there.
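For context, a minimal userspace sketch of the read side whose behaviour
must be preserved (standard /dev/kmsg semantics; nothing here is specific
to the new ring buffer):

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
          char buf[8192];
          ssize_t n;
          int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

          if (fd < 0)
                  return 1;
          /* each read() returns exactly one record; with O_NONBLOCK the
           * loop ends on EAGAIN, and EPIPE signals overwritten records
           */
          while ((n = read(fd, buf, sizeof(buf))) > 0)
                  fwrite(buf, 1, n, stdout);
          close(fd);
          return 0;
  }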
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/proc/kmsg.c | 4 -
include/linux/printk.h | 1
kernel/printk/printk.c | 162 +++++++++++++++++++++++++++++++++----------------
3 files changed, 113 insertions(+), 54 deletions(-)
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -18,8 +18,6 @@
#include <linux/uaccess.h>
#include <asm/io.h>
-extern wait_queue_head_t log_wait;
-
static int kmsg_open(struct inode * inode, struct file * file)
{
return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC);
@@ -42,7 +40,7 @@ static ssize_t kmsg_read(struct file *fi
static __poll_t kmsg_poll(struct file *file, poll_table *wait)
{
- poll_wait(file, &log_wait, wait);
+ poll_wait(file, printk_wait_queue(), wait);
if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
return EPOLLIN | EPOLLRDNORM;
return 0;
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -192,6 +192,7 @@ void __init setup_log_buf(int early);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack(void) __cold;
+struct wait_queue_head *printk_wait_queue(void);
#else
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -673,10 +673,11 @@ static ssize_t msg_print_ext_body(char *
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
u64 seq;
- u32 idx;
+ struct prb_iterator iter;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
+ char msgbuf[PRINTK_RECORD_MAX];
};
static __printf(3, 4) __cold
@@ -759,9 +760,11 @@ static ssize_t devkmsg_read(struct file
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
+ struct prb_iterator backup_iter;
struct printk_log *msg;
- size_t len;
ssize_t ret;
+ size_t len;
+ u64 seq;
if (!user)
return -EBADF;
@@ -770,52 +773,67 @@ static ssize_t devkmsg_read(struct file
if (ret)
return ret;
- logbuf_lock_irq();
- while (user->seq == log_next_seq) {
- if (file->f_flags & O_NONBLOCK) {
- ret = -EAGAIN;
- logbuf_unlock_irq();
- goto out;
- }
+ /* make a backup copy in case there is a problem */
+ prb_iter_copy(&backup_iter, &user->iter);
- logbuf_unlock_irq();
- ret = wait_event_interruptible(log_wait,
- user->seq != log_next_seq);
- if (ret)
- goto out;
- logbuf_lock_irq();
+ if (file->f_flags & O_NONBLOCK) {
+ ret = prb_iter_next(&user->iter, &user->msgbuf[0],
+ sizeof(user->msgbuf), &seq);
+ } else {
+ ret = prb_iter_wait_next(&user->iter, &user->msgbuf[0],
+ sizeof(user->msgbuf), &seq);
}
-
- if (user->seq < log_first_seq) {
- /* our last seen message is gone, return error and reset */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ if (ret == 0) {
+ /* end of list */
+ ret = -EAGAIN;
+ goto out;
+ } else if (ret == -EINVAL) {
+ /* iterator invalid, return error and reset */
ret = -EPIPE;
- logbuf_unlock_irq();
+ prb_iter_init(&user->iter, &printk_rb, &user->seq);
+ goto out;
+ } else if (ret < 0) {
+ /* interrupted by signal */
goto out;
}
- msg = log_from_idx(user->idx);
+ if (user->seq == 0) {
+ user->seq = seq;
+ } else {
+ user->seq++;
+ if (user->seq < seq) {
+ ret = -EPIPE;
+ goto restore_out;
+ }
+ }
+
+ msg = (struct printk_log *)&user->msgbuf[0];
len = msg_print_ext_header(user->buf, sizeof(user->buf),
msg, user->seq);
len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
log_dict(msg), msg->dict_len,
log_text(msg), msg->text_len);
- user->idx = log_next(user->idx);
- user->seq++;
- logbuf_unlock_irq();
-
if (len > count) {
ret = -EINVAL;
- goto out;
+ goto restore_out;
}
if (copy_to_user(buf, user->buf, len)) {
ret = -EFAULT;
- goto out;
+ goto restore_out;
}
+
ret = len;
+ goto out;
+restore_out:
+ /*
+ * There was an error, but this message should not be
+ * lost because of it. Restore the backup and setup
+ * seq so that it will work with the next read.
+ */
+ prb_iter_copy(&user->iter, &backup_iter);
+ user->seq = seq - 1;
out:
mutex_unlock(&user->lock);
return ret;
@@ -824,19 +842,21 @@ static ssize_t devkmsg_read(struct file
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
struct devkmsg_user *user = file->private_data;
- loff_t ret = 0;
+ loff_t ret;
if (!user)
return -EBADF;
if (offset)
return -ESPIPE;
- logbuf_lock_irq();
+ ret = mutex_lock_interruptible(&user->lock);
+ if (ret)
+ return ret;
+
switch (whence) {
case SEEK_SET:
/* the first record */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ prb_iter_init(&user->iter, &printk_rb, &user->seq);
break;
case SEEK_DATA:
/*
@@ -844,40 +864,83 @@ static loff_t devkmsg_llseek(struct file
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->idx = clear_idx;
- user->seq = clear_seq;
+ for (;;) {
+ prb_iter_init(&user->iter, &printk_rb, NULL);
+ ret = prb_iter_seek(&user->iter, clear_seq);
+ if (ret > 0) {
+ /* seeked to clear seq */
+ user->seq = clear_seq;
+ break;
+ } else if (ret == 0) {
+ /*
+ * The end of the list was hit without
+ * ever seeing the clear seq. Just
+ * seek to the beginning of the list.
+ */
+ prb_iter_init(&user->iter, &printk_rb,
+ &user->seq);
+ break;
+ }
+ /* iterator invalid, start over */
+ }
+ ret = 0;
break;
case SEEK_END:
/* after the last record */
- user->idx = log_next_idx;
- user->seq = log_next_seq;
+ for (;;) {
+ ret = prb_iter_next(&user->iter, NULL, 0, &user->seq);
+ if (ret == 0)
+ break;
+ else if (ret > 0)
+ continue;
+ /* iterator invalid, start over */
+ prb_iter_init(&user->iter, &printk_rb, &user->seq);
+ }
+ ret = 0;
break;
default:
ret = -EINVAL;
}
- logbuf_unlock_irq();
+
+ mutex_unlock(&user->lock);
return ret;
}
+struct wait_queue_head *printk_wait_queue(void)
+{
+ /* FIXME: using prb internals! */
+ return printk_rb.wq;
+}
+
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
struct devkmsg_user *user = file->private_data;
+ struct prb_iterator iter;
__poll_t ret = 0;
+ int rbret;
+ u64 seq;
if (!user)
return EPOLLERR|EPOLLNVAL;
- poll_wait(file, &log_wait, wait);
+ poll_wait(file, printk_wait_queue(), wait);
- logbuf_lock_irq();
- if (user->seq < log_next_seq) {
- /* return error when data has vanished underneath us */
- if (user->seq < log_first_seq)
- ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
- else
- ret = EPOLLIN|EPOLLRDNORM;
- }
- logbuf_unlock_irq();
+ mutex_lock(&user->lock);
+
+ /* use copy so no actual iteration takes place */
+ prb_iter_copy(&iter, &user->iter);
+
+ rbret = prb_iter_next(&iter, &user->msgbuf[0],
+ sizeof(user->msgbuf), &seq);
+ if (rbret == 0)
+ goto out;
+
+ ret = EPOLLIN|EPOLLRDNORM;
+
+ if (rbret < 0 || (seq - user->seq) != 1)
+ ret |= EPOLLERR|EPOLLPRI;
+out:
+ mutex_unlock(&user->lock);
return ret;
}
@@ -907,10 +970,7 @@ static int devkmsg_open(struct inode *in
mutex_init(&user->lock);
- logbuf_lock_irq();
- user->idx = log_first_idx;
- user->seq = log_first_seq;
- logbuf_unlock_irq();
+ prb_iter_init(&user->iter, &printk_rb, &user->seq);
file->private_data = user;
return 0;

View File

@ -0,0 +1,493 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:30:01 +0100
Subject: [PATCH 23/25] printk: implement syslog
Since printk messages are now logged to a new ring buffer, update
the syslog functions to pull the messages from there.
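For context, the userspace view that must keep working is syslog(2); a
minimal sketch using glibc's klogctl() wrapper (action 10 is
SYSLOG_ACTION_SIZE_BUFFER, action 3 is SYSLOG_ACTION_READ_ALL):

  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/klog.h>

  int main(void)
  {
          int size = klogctl(10, NULL, 0);   /* size of the log buffer */
          char *buf;
          int len;

          if (size <= 0)
                  return 1;
          buf = malloc(size);
          if (!buf)
                  return 1;
          len = klogctl(3, buf, size);       /* non-destructive read of all records */
          if (len > 0)
                  fwrite(buf, 1, len, stdout);
          free(buf);
          return 0;
  }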
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 342 +++++++++++++++++++++++++++++++++----------------
1 file changed, 236 insertions(+), 106 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -407,10 +407,12 @@ DECLARE_STATIC_PRINTKRB_CPULOCK(printk_c
/* record buffer */
DECLARE_STATIC_PRINTKRB(printk_rb, CONFIG_LOG_BUF_SHIFT, &printk_cpulock);
+static DEFINE_MUTEX(syslog_lock);
+DECLARE_STATIC_PRINTKRB_ITER(syslog_iter, &printk_rb);
+
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
-static u32 syslog_idx;
static size_t syslog_partial;
static bool syslog_time;
@@ -1303,30 +1305,42 @@ static size_t msg_print_text(const struc
return len;
}
-static int syslog_print(char __user *buf, int size)
+static int syslog_print(char __user *buf, int size, char *text,
+ char *msgbuf, int *locked)
{
- char *text;
+ struct prb_iterator iter;
struct printk_log *msg;
int len = 0;
-
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
- if (!text)
- return -ENOMEM;
+ u64 seq;
+ int ret;
while (size > 0) {
size_t n;
size_t skip;
- logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_partial = 0;
+ for (;;) {
+ prb_iter_copy(&iter, &syslog_iter);
+ ret = prb_iter_next(&iter, msgbuf,
+ PRINTK_RECORD_MAX, &seq);
+ if (ret < 0) {
+ /* messages are gone, move to first one */
+ prb_iter_init(&syslog_iter, &printk_rb,
+ &syslog_seq);
+ syslog_partial = 0;
+ continue;
+ }
+ break;
}
- if (syslog_seq == log_next_seq) {
- logbuf_unlock_irq();
+ if (ret == 0)
break;
+
+ /*
+ * If messages have been missed, the partial tracker
+ * is no longer valid and must be reset.
+ */
+ if (syslog_seq > 0 && seq - 1 != syslog_seq) {
+ syslog_seq = seq - 1;
+ syslog_partial = 0;
}
/*
@@ -1336,131 +1350,212 @@ static int syslog_print(char __user *buf
if (!syslog_partial)
syslog_time = printk_time;
+ msg = (struct printk_log *)msgbuf;
+
skip = syslog_partial;
- msg = log_from_idx(syslog_idx);
n = msg_print_text(msg, true, syslog_time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ PRINTK_SPRINT_MAX);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
- syslog_idx = log_next(syslog_idx);
- syslog_seq++;
+ prb_iter_next(&syslog_iter, NULL, 0, &syslog_seq);
n -= syslog_partial;
syslog_partial = 0;
- } else if (!len){
+ } else if (!len) {
/* partial read(), remember position */
n = size;
syslog_partial += n;
} else
n = 0;
- logbuf_unlock_irq();
if (!n)
break;
+ mutex_unlock(&syslog_lock);
if (copy_to_user(buf, text + skip, n)) {
if (!len)
len = -EFAULT;
+ *locked = 0;
break;
}
+ ret = mutex_lock_interruptible(&syslog_lock);
len += n;
size -= n;
buf += n;
+
+ if (ret) {
+ if (!len)
+ len = ret;
+ *locked = 0;
+ break;
+ }
}
- kfree(text);
return len;
}
-static int syslog_print_all(char __user *buf, int size, bool clear)
+static int count_remaining(struct prb_iterator *iter, u64 until_seq,
+ char *msgbuf, int size, bool records, bool time)
{
- char *text;
+ struct prb_iterator local_iter;
+ struct printk_log *msg;
int len = 0;
- u64 next_seq;
u64 seq;
- u32 idx;
+ int ret;
+
+ prb_iter_copy(&local_iter, iter);
+ for (;;) {
+ ret = prb_iter_next(&local_iter, msgbuf, size, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ /* the iter is invalid, restart from head */
+ prb_iter_init(&local_iter, &printk_rb, NULL);
+ len = 0;
+ continue;
+ }
+
+ if (until_seq && seq >= until_seq)
+ break;
+
+ if (records) {
+ len++;
+ } else {
+ msg = (struct printk_log *)msgbuf;
+ len += msg_print_text(msg, true, time, NULL, 0);
+ }
+ }
+
+ return len;
+}
+
+static void syslog_clear(void)
+{
+ struct prb_iterator iter;
+ int ret;
+
+ prb_iter_init(&iter, &printk_rb, &clear_seq);
+ for (;;) {
+ ret = prb_iter_next(&iter, NULL, 0, &clear_seq);
+ if (ret == 0)
+ break;
+ else if (ret < 0)
+ prb_iter_init(&iter, &printk_rb, &clear_seq);
+ }
+}
+
+static int syslog_print_all(char __user *buf, int size, bool clear)
+{
+ struct prb_iterator iter;
+ struct printk_log *msg;
+ char *msgbuf = NULL;
+ char *text = NULL;
+ int textlen;
+ u64 seq = 0;
+ int len = 0;
bool time;
+ int ret;
- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
+ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
+ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!msgbuf) {
+ kfree(text);
+ return -ENOMEM;
+ }
time = printk_time;
- logbuf_lock_irq();
+
/*
- * Find first record that fits, including all following records,
- * into the user-provided buffer for this dump.
+ * Setup iter to last event before clear. Clear may
+ * be lost, but keep going with a best effort.
*/
- seq = clear_seq;
- idx = clear_idx;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ prb_iter_init(&iter, &printk_rb, NULL);
+ prb_iter_seek(&iter, clear_seq);
- /* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
- while (len > size && seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /* count the total bytes after clear */
+ len = count_remaining(&iter, 0, msgbuf, PRINTK_RECORD_MAX,
+ false, time);
+
+ /* move iter forward until length fits into the buffer */
+ while (len > size) {
+ ret = prb_iter_next(&iter, msgbuf,
+ PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ /*
+ * The iter is now invalid so clear will
+ * also be invalid. Restart from the head.
+ */
+ prb_iter_init(&iter, &printk_rb, NULL);
+ len = count_remaining(&iter, 0, msgbuf,
+ PRINTK_RECORD_MAX, false, time);
+ continue;
+ }
+ msg = (struct printk_log *)msgbuf;
len -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
- /* last message fitting into this dump */
- next_seq = log_next_seq;
+ if (clear)
+ clear_seq = seq;
+ }
+ /* copy messages to buffer */
len = 0;
- while (len >= 0 && seq < next_seq) {
- struct printk_log *msg = log_from_idx(idx);
- int textlen = msg_print_text(msg, true, time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ while (len >= 0 && len < size) {
+ if (clear)
+ clear_seq = seq;
- idx = log_next(idx);
- seq++;
+ ret = prb_iter_next(&iter, msgbuf,
+ PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ /*
+ * The iter is now invalid. Make a best
+ * effort to grab the rest of the log
+ * from the new head.
+ */
+ prb_iter_init(&iter, &printk_rb, NULL);
+ continue;
+ }
+
+ msg = (struct printk_log *)msgbuf;
+ textlen = msg_print_text(msg, true, time, text,
+ PRINTK_SPRINT_MAX);
+ if (textlen < 0) {
+ len = textlen;
+ break;
+ }
- logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
len += textlen;
- logbuf_lock_irq();
-
- if (seq < log_first_seq) {
- /* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
- }
}
- if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
- }
- logbuf_unlock_irq();
+ if (clear && !seq)
+ syslog_clear();
- kfree(text);
+ if (text)
+ kfree(text);
+ if (msgbuf)
+ kfree(msgbuf);
return len;
}
-static void syslog_clear(void)
-{
- logbuf_lock_irq();
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
- logbuf_unlock_irq();
-}
-
int do_syslog(int type, char __user *buf, int len, int source)
{
bool clear = false;
static int saved_console_loglevel = LOGLEVEL_DEFAULT;
+ struct prb_iterator iter;
+ char *msgbuf = NULL;
+ char *text = NULL;
+ int locked;
int error;
+ int ret;
error = check_syslog_permissions(type, source);
if (error)
@@ -1478,11 +1573,49 @@ int do_syslog(int type, char __user *buf
return 0;
if (!access_ok(buf, len))
return -EFAULT;
- error = wait_event_interruptible(log_wait,
- syslog_seq != log_next_seq);
+
+ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL);
+ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!text || !msgbuf) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = mutex_lock_interruptible(&syslog_lock);
if (error)
- return error;
- error = syslog_print(buf, len);
+ goto out;
+
+ /*
+ * Wait until a first message is available. Use a copy
+ * because no iteration should occur for syslog now.
+ */
+ for (;;) {
+ prb_iter_copy(&iter, &syslog_iter);
+
+ mutex_unlock(&syslog_lock);
+ ret = prb_iter_wait_next(&iter, NULL, 0, NULL);
+ if (ret == -ERESTARTSYS) {
+ error = ret;
+ goto out;
+ }
+ error = mutex_lock_interruptible(&syslog_lock);
+ if (error)
+ goto out;
+
+ if (ret == -EINVAL) {
+ prb_iter_init(&syslog_iter, &printk_rb,
+ &syslog_seq);
+ syslog_partial = 0;
+ continue;
+ }
+ break;
+ }
+
+ /* print as much as will fit in the user buffer */
+ locked = 1;
+ error = syslog_print(buf, len, text, msgbuf, &locked);
+ if (locked)
+ mutex_unlock(&syslog_lock);
break;
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
@@ -1527,47 +1660,45 @@ int do_syslog(int type, char __user *buf
break;
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
- logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_partial = 0;
- }
+ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL);
+ if (!msgbuf)
+ return -ENOMEM;
+
+ error = mutex_lock_interruptible(&syslog_lock);
+ if (error)
+ goto out;
+
if (source == SYSLOG_FROM_PROC) {
/*
* Short-cut for poll(/"proc/kmsg") which simply checks
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_seq - syslog_seq;
+ error = count_remaining(&syslog_iter, 0, msgbuf,
+ PRINTK_RECORD_MAX, true,
+ printk_time);
} else {
- u64 seq = syslog_seq;
- u32 idx = syslog_idx;
- bool time = syslog_partial ? syslog_time : printk_time;
-
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- error += msg_print_text(msg, true, time, NULL,
- 0);
- time = printk_time;
- idx = log_next(idx);
- seq++;
- }
+ error = count_remaining(&syslog_iter, 0, msgbuf,
+ PRINTK_RECORD_MAX, false,
+ printk_time);
error -= syslog_partial;
}
- logbuf_unlock_irq();
+
+ mutex_unlock(&syslog_lock);
break;
/* Size of the log buffer */
case SYSLOG_ACTION_SIZE_BUFFER:
- error = log_buf_len;
+ error = prb_buffer_size(&printk_rb);
break;
default:
error = -EINVAL;
break;
}
-
+out:
+ if (msgbuf)
+ kfree(msgbuf);
+ if (text)
+ kfree(text);
return error;
}
@@ -1989,7 +2120,6 @@ EXPORT_SYMBOL(printk);
#define printk_time false
static u64 syslog_seq;
-static u32 syslog_idx;
static u64 log_first_seq;
static u32 log_first_idx;
static u64 log_next_seq;

View File

@ -0,0 +1,397 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:30:02 +0100
Subject: [PATCH 24/25] printk: implement kmsg_dump
Since printk messages are now logged to a new ring buffer, update
the kmsg_dump functions to pull the messages from there.
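For context, a minimal sketch of a kmsg_dump consumer that these functions
must keep serving (my_dump and my_dumper are hypothetical; kmsg_dump_register()
and kmsg_dump_get_line() are the existing kernel API):

  #include <linux/kmsg_dump.h>

  static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
  {
          static char line[1024];
          size_t len;

          /* walk the records one line at a time from the ring buffer */
          while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
                  ;       /* hand line/len to some persistent store here */
  }

  static struct kmsg_dumper my_dumper = {
          .dump = my_dump,
  };

  /* in module init: kmsg_dump_register(&my_dumper); */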
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/kmsg_dump.h | 6 -
kernel/printk/printk.c | 258 ++++++++++++++++++++++++----------------------
2 files changed, 139 insertions(+), 125 deletions(-)
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -46,10 +46,8 @@ struct kmsg_dumper {
bool registered;
/* private state of the kmsg iterator */
- u32 cur_idx;
- u32 next_idx;
- u64 cur_seq;
- u64 next_seq;
+ u64 line_seq;
+ u64 buffer_end_seq;
};
#ifdef CONFIG_PRINTK
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -417,13 +417,13 @@ static size_t syslog_partial;
static bool syslog_time;
/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
static u32 log_first_idx;
/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
static u32 log_next_idx;
+static DEFINE_MUTEX(kmsg_dump_lock);
+
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
static u32 clear_idx;
@@ -470,38 +470,6 @@ static char *log_dict(const struct print
return (char *)msg + sizeof(struct printk_log) + msg->text_len;
}
-/* get record by index; idx must point to valid msg */
-static struct printk_log *log_from_idx(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer.
- */
- if (!msg->len)
- return (struct printk_log *)log_buf;
- return msg;
-}
-
-/* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /* length == 0 indicates the end of the buffer; wrap */
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer as *this* one, and
- * return the one after that.
- */
- if (!msg->len) {
- msg = (struct printk_log *)log_buf;
- return msg->len;
- }
- return idx + msg->len;
-}
-
static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu,
char *text, u16 text_len);
@@ -2120,9 +2088,7 @@ EXPORT_SYMBOL(printk);
#define printk_time false
static u64 syslog_seq;
-static u64 log_first_seq;
static u32 log_first_idx;
-static u64 log_next_seq;
static char *log_text(const struct printk_log *msg) { return NULL; }
static char *log_dict(const struct printk_log *msg) { return NULL; }
static struct printk_log *log_from_idx(u32 idx) { return NULL; }
@@ -3032,7 +2998,6 @@ module_param_named(always_kmsg_dump, alw
void kmsg_dump(enum kmsg_dump_reason reason)
{
struct kmsg_dumper *dumper;
- unsigned long flags;
if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
return;
@@ -3045,12 +3010,7 @@ void kmsg_dump(enum kmsg_dump_reason rea
/* initialize iterator with data about the stored records */
dumper->active = true;
- logbuf_lock_irqsave(flags);
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
- logbuf_unlock_irqrestore(flags);
+ kmsg_dump_rewind(dumper);
/* invoke dumper which will iterate over records */
dumper->dump(dumper, reason);
@@ -3083,33 +3043,67 @@ void kmsg_dump(enum kmsg_dump_reason rea
bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
char *line, size_t size, size_t *len)
{
+ struct prb_iterator iter;
struct printk_log *msg;
- size_t l = 0;
- bool ret = false;
+ struct prb_handle h;
+ bool cont = false;
+ char *msgbuf;
+ char *rbuf;
+ size_t l;
+ u64 seq;
+ int ret;
if (!dumper->active)
- goto out;
+ return cont;
+
+ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_RECORD_MAX);
+ if (!rbuf)
+ return cont;
+ msgbuf = rbuf;
+retry:
+ for (;;) {
+ prb_iter_init(&iter, &printk_rb, &seq);
+
+ if (dumper->line_seq == seq) {
+ /* already where we want to be */
+ break;
+ } else if (dumper->line_seq < seq) {
+ /* messages are gone, move to first available one */
+ dumper->line_seq = seq;
+ break;
+ }
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ ret = prb_iter_seek(&iter, dumper->line_seq);
+ if (ret > 0) {
+ /* seeked to line_seq */
+ break;
+ } else if (ret == 0) {
+ /*
+ * The end of the list was hit without ever seeing
+ * line_seq. Reset it to the beginning of the list.
+ */
+ prb_iter_init(&iter, &printk_rb, &dumper->line_seq);
+ break;
+ }
+ /* iterator invalid, start over */
}
- /* last entry */
- if (dumper->cur_seq >= log_next_seq)
+ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX,
+ &dumper->line_seq);
+ if (ret == 0)
goto out;
+ else if (ret < 0)
+ goto retry;
- msg = log_from_idx(dumper->cur_idx);
+ msg = (struct printk_log *)msgbuf;
l = msg_print_text(msg, syslog, printk_time, line, size);
- dumper->cur_idx = log_next(dumper->cur_idx);
- dumper->cur_seq++;
- ret = true;
-out:
if (len)
*len = l;
- return ret;
+ cont = true;
+out:
+ prb_commit(&h);
+ return cont;
}
/**
@@ -3132,12 +3126,11 @@ bool kmsg_dump_get_line_nolock(struct km
bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog,
char *line, size_t size, size_t *len)
{
- unsigned long flags;
bool ret;
- logbuf_lock_irqsave(flags);
+ mutex_lock(&kmsg_dump_lock);
ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- logbuf_unlock_irqrestore(flags);
+ mutex_unlock(&kmsg_dump_lock);
return ret;
}
@@ -3165,74 +3158,101 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
char *buf, size_t size, size_t *len)
{
- unsigned long flags;
- u64 seq;
- u32 idx;
- u64 next_seq;
- u32 next_idx;
- size_t l = 0;
- bool ret = false;
+ struct prb_iterator iter;
bool time = printk_time;
+ struct printk_log *msg;
+ u64 new_end_seq = 0;
+ struct prb_handle h;
+ bool cont = false;
+ char *msgbuf;
+ u64 end_seq;
+ int textlen;
+ u64 seq = 0;
+ char *rbuf;
+ int l = 0;
+ int ret;
if (!dumper->active)
- goto out;
+ return cont;
- logbuf_lock_irqsave(flags);
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
- }
+ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_RECORD_MAX);
+ if (!rbuf)
+ return cont;
+ msgbuf = rbuf;
- /* last entry */
- if (dumper->cur_seq >= dumper->next_seq) {
- logbuf_unlock_irqrestore(flags);
- goto out;
- }
-
- /* calculate length of entire buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ prb_iter_init(&iter, &printk_rb, NULL);
- l += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ /*
+ * seek to the start record, which is set/modified
+ * by kmsg_dump_get_line_nolock()
+ */
+ ret = prb_iter_seek(&iter, dumper->line_seq);
+ if (ret <= 0)
+ prb_iter_init(&iter, &printk_rb, &seq);
+
+ /* work with a local end seq to have a constant value */
+ end_seq = dumper->buffer_end_seq;
+ if (!end_seq) {
+ /* initialize end seq to "infinity" */
+ end_seq = -1;
+ dumper->buffer_end_seq = end_seq;
}
+retry:
+ if (seq >= end_seq)
+ goto out;
- /* move first record forward until length fits into the buffer */
- seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (l >= size && seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /* count the total bytes after seq */
+ textlen = count_remaining(&iter, end_seq, msgbuf,
+ PRINTK_RECORD_MAX, 0, time);
- l -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ /* move iter forward until length fits into the buffer */
+ while (textlen > size) {
+ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ prb_iter_init(&iter, &printk_rb, &seq);
+ goto retry;
+ }
+
+ msg = (struct printk_log *)msgbuf;
+ textlen -= msg_print_text(msg, true, time, NULL, 0);
}
- /* last message in next interation */
- next_seq = seq;
- next_idx = idx;
+ /* save end seq for the next interation */
+ new_end_seq = seq + 1;
- l = 0;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ /* copy messages to buffer */
+ while (l < size) {
+ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq);
+ if (ret == 0) {
+ break;
+ } else if (ret < 0) {
+ /*
+ * iterator (and thus also the start position)
+ * invalid, start over from beginning of list
+ */
+ prb_iter_init(&iter, &printk_rb, NULL);
+ continue;
+ }
- l += msg_print_text(msg, syslog, time, buf + l, size - l);
- idx = log_next(idx);
- seq++;
+ if (seq >= end_seq)
+ break;
+
+ msg = (struct printk_log *)msgbuf;
+ textlen = msg_print_text(msg, syslog, time, buf + l, size - l);
+ if (textlen > 0)
+ l += textlen;
+ cont = true;
}
- dumper->next_seq = next_seq;
- dumper->next_idx = next_idx;
- ret = true;
- logbuf_unlock_irqrestore(flags);
-out:
- if (len)
+ if (cont && len)
*len = l;
- return ret;
+out:
+ prb_commit(&h);
+ if (new_end_seq)
+ dumper->buffer_end_seq = new_end_seq;
+ return cont;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
@@ -3248,10 +3268,8 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
*/
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
{
- dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ dumper->line_seq = 0;
+ dumper->buffer_end_seq = 0;
}
/**
@@ -3264,11 +3282,9 @@ void kmsg_dump_rewind_nolock(struct kmsg
*/
void kmsg_dump_rewind(struct kmsg_dumper *dumper)
{
- unsigned long flags;
-
- logbuf_lock_irqsave(flags);
+ mutex_lock(&kmsg_dump_lock);
kmsg_dump_rewind_nolock(dumper);
- logbuf_unlock_irqrestore(flags);
+ mutex_unlock(&kmsg_dump_lock);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);

View File

@ -0,0 +1,346 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 12 Feb 2019 15:30:03 +0100
Subject: [PATCH 25/25] printk: remove unused code
Code relating to the safe context and anything dealing with the
previous log buffer implementation is no longer in use. Remove it.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/internal.h | 41 -----------
kernel/printk/printk.c | 161 ++++-------------------------------------------
lib/bust_spinlocks.c | 3
3 files changed, 16 insertions(+), 189 deletions(-)
delete mode 100644 kernel/printk/internal.h
--- a/kernel/printk/internal.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * internal.h - printk internal definitions
- */
-#include <linux/percpu.h>
-
-#ifdef CONFIG_PRINTK
-
-#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000
-#define PRINTK_NMI_CONTEXT_MASK 0x80000000
-
-extern raw_spinlock_t logbuf_lock;
-
-__printf(5, 0)
-int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
- const char *fmt, va_list args);
-
-__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
-
-void defer_console_output(void);
-
-#else
-
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; }
-
-/*
- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem
- * semaphore and some of console functions (console_unlock()/etc.), so
- * printk-safe must preserve the existing local IRQ guarantees.
- */
-#endif /* CONFIG_PRINTK */
-
-#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
-#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)
-
-#define printk_safe_enter_irq() local_irq_disable()
-#define printk_safe_exit_irq() local_irq_enable()
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -61,7 +61,6 @@
#include "console_cmdline.h"
#include "braille.h"
-#include "internal.h"
int console_printk[5] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
@@ -366,41 +365,6 @@ struct printk_log {
#endif
;
-/*
- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
- * within the scheduler's rq lock. It must be released before calling
- * console_unlock() or anything else that might wake up a process.
- */
-DEFINE_RAW_SPINLOCK(logbuf_lock);
-
-/*
- * Helper macros to lock/unlock logbuf_lock and switch between
- * printk-safe/unsafe modes.
- */
-#define logbuf_lock_irq() \
- do { \
- printk_safe_enter_irq(); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irq() \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irq(); \
- } while (0)
-
-#define logbuf_lock_irqsave(flags) \
- do { \
- printk_safe_enter_irqsave(flags); \
- raw_spin_lock(&logbuf_lock); \
- } while (0)
-
-#define logbuf_unlock_irqrestore(flags) \
- do { \
- raw_spin_unlock(&logbuf_lock); \
- printk_safe_exit_irqrestore(flags); \
- } while (0)
-
DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock);
#ifdef CONFIG_PRINTK
@@ -410,23 +374,15 @@ DECLARE_STATIC_PRINTKRB(printk_rb, CONFI
static DEFINE_MUTEX(syslog_lock);
DECLARE_STATIC_PRINTKRB_ITER(syslog_iter, &printk_rb);
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-/* the next printk record to read by syslog(READ) or /proc/kmsg */
+/* the last printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
-/* index and sequence number of the first record stored in the buffer */
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u32 log_next_idx;
-
static DEFINE_MUTEX(kmsg_dump_lock);
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
-static u32 clear_idx;
#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
@@ -438,24 +394,16 @@ static u32 clear_idx;
#define LOG_LEVEL(v) ((v) & 0x07)
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
-/* record buffer */
-#define LOG_ALIGN __alignof__(struct printk_log)
-#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-#define LOG_BUF_LEN_MAX (u32)(1 << 31)
-static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
-static char *log_buf = __log_buf;
-static u32 log_buf_len = __LOG_BUF_LEN;
-
/* Return log buffer address */
char *log_buf_addr_get(void)
{
- return log_buf;
+ return printk_rb.buffer;
}
/* Return log buffer size */
u32 log_buf_len_get(void)
{
- return log_buf_len;
+ return (1 << printk_rb.size_bits);
}
/* human readable text of the record */
@@ -980,11 +928,6 @@ const struct file_operations kmsg_fops =
*/
void log_buf_vmcoreinfo_setup(void)
{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(log_first_idx);
- VMCOREINFO_SYMBOL(clear_idx);
- VMCOREINFO_SYMBOL(log_next_idx);
/*
* Export struct printk_log size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
@@ -1000,6 +943,8 @@ void log_buf_vmcoreinfo_setup(void)
}
#endif
+/* FIXME: no support for buffer resizing */
+#if 0
/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;
@@ -1065,9 +1010,12 @@ static void __init log_buf_add_cpu(void)
#else /* !CONFIG_SMP */
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */
+#endif /* 0 */
void __init setup_log_buf(int early)
{
+/* FIXME: no support for buffer resizing */
+#if 0
unsigned long flags;
char *new_log_buf;
unsigned int free;
@@ -1099,6 +1047,7 @@ void __init setup_log_buf(int early)
pr_info("log_buf_len: %u bytes\n", log_buf_len);
pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
+#endif
}
static bool __read_mostly ignore_loglevel;
@@ -2019,7 +1968,7 @@ asmlinkage int vprintk_emit(int facility
}
EXPORT_SYMBOL(vprintk_emit);
-__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
{
return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
@@ -2080,31 +2029,6 @@ asmlinkage __visible int printk(const ch
return r;
}
EXPORT_SYMBOL(printk);
-
-#else /* CONFIG_PRINTK */
-
-#define LOG_LINE_MAX 0
-#define PREFIX_MAX 0
-#define printk_time false
-
-static u64 syslog_seq;
-static u32 log_first_idx;
-static char *log_text(const struct printk_log *msg) { return NULL; }
-static char *log_dict(const struct printk_log *msg) { return NULL; }
-static struct printk_log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg,
- u64 seq) { return 0; }
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len) { return 0; }
-static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len,
- const char *text, size_t len, int level) {}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size) { return 0; }
-static bool suppress_message_printing(int level) { return false; }
-
#endif /* CONFIG_PRINTK */
#ifdef CONFIG_EARLY_PRINTK
@@ -2401,15 +2325,10 @@ void console_unblank(void)
void console_flush_on_panic(enum con_flush_mode mode)
{
/*
- * If someone else is holding the console lock, trylock will fail
- * and may_schedule may be set. Ignore and proceed to unlock so
- * that messages are flushed out. As this can be called from any
- * context and we don't want to get preempted while flushing,
- * ensure may_schedule is cleared.
+ * FIXME: This is currently a NOP. Emergency messages will have been
+ * printed, but what about if write_atomic is not available on the
+ * console? What if the printk kthread is still alive?
*/
- console_trylock();
- console_may_schedule = 0;
- console_unlock();
}
/*
@@ -2758,43 +2677,6 @@ static int __init printk_late_init(void)
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
-/*
- * Delayed printk version, for scheduler-internal messages:
- */
-#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_OUTPUT 0x02
-
-static DEFINE_PER_CPU(int, printk_pending);
-
-static void wake_up_klogd_work_func(struct irq_work *irq_work)
-{
- int pending = __this_cpu_xchg(printk_pending, 0);
-
- if (pending & PRINTK_PENDING_OUTPUT) {
- /* If trylock fails, someone else is doing the printing */
- if (console_trylock())
- console_unlock();
- }
-
- if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
-}
-
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
- .func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
-};
-
-void wake_up_klogd(void)
-{
- preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- }
- preempt_enable();
-}
-
static int printk_kthread_func(void *data)
{
struct prb_iterator iter;
@@ -2860,22 +2742,9 @@ static int __init init_printk_kthread(vo
}
late_initcall(init_printk_kthread);
-void defer_console_output(void)
-{
- preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
- preempt_enable();
-}
-
-int vprintk_deferred(const char *fmt, va_list args)
+static int vprintk_deferred(const char *fmt, va_list args)
{
- int r;
-
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
- defer_console_output();
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
}
int printk_deferred(const char *fmt, ...)
--- a/lib/bust_spinlocks.c
+++ b/lib/bust_spinlocks.c
@@ -26,7 +26,6 @@ void bust_spinlocks(int yes)
unblank_screen();
#endif
console_unblank();
- if (--oops_in_progress == 0)
- wake_up_klogd();
+ --oops_in_progress;
}
}

View File

@ -0,0 +1,38 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Thu, 14 Feb 2019 23:13:30 +0100
Subject: [PATCH] printk: set deferred to default loglevel, enforce mask
All messages printed via vprintk_deferred() were being
automatically treated as emergency messages.
Messages printed via vprintk_deferred() should be set to the
default loglevel. LOGLEVEL_SCHED is no longer relevant.
Also, enforce the loglevel mask for emergency messages.
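For context, a one-line sketch of a call site that benefits (illustrative only;
printk_deferred() is the existing interface, the message text is made up):

  /* A context that must not touch the console directly, for example code
   * running under the scheduler's runqueue lock, uses the deferred variant;
   * with this patch the record is stored at the default loglevel instead of
   * being treated as an emergency message.
   */
  printk_deferred(KERN_INFO "illustrative deferred message\n");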
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1951,7 +1951,7 @@ asmlinkage int vprintk_emit(int facility
* - text points to beginning of text
* - there is room before text for prefix
*/
- printk_emergency(rbuf, level, ts_nsec, cpu, text, text_len);
+ printk_emergency(rbuf, level & 7, ts_nsec, cpu, text, text_len);
if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) {
cont_add(ctx, cpu, caller_id, facility, level, lflags, text, text_len);
@@ -2744,7 +2744,7 @@ late_initcall(init_printk_kthread);
static int vprintk_deferred(const char *fmt, va_list args)
{
- return vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
int printk_deferred(const char *fmt, ...)

View File

@ -0,0 +1,43 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 14 Feb 2019 17:38:24 +0100
Subject: [PATCH] serial: 8250: remove that trylock in
serial8250_console_write_atomic()
This does not work on PREEMPT_RT, where the port lock is an rtmutex that
cannot be taken in NMI context. As per John, the trylock is not needed.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/tty/serial/8250/8250_port.c | 11 -----------
1 file changed, 11 deletions(-)
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -3191,17 +3191,9 @@ void serial8250_console_write_atomic(str
{
struct uart_port *port = &up->port;
unsigned int flags;
- bool locked;
console_atomic_lock(&flags);
- /*
- * If possible, keep any other CPUs from working with the
- * UART until the atomic message is completed. This helps
- * to keep the output more orderly.
- */
- locked = spin_trylock(&port->lock);
-
touch_nmi_watchdog();
clear_ier(up);
@@ -3216,9 +3208,6 @@ void serial8250_console_write_atomic(str
wait_for_xmitr(up, BOTH_EMPTY);
restore_ier(up);
- if (locked)
- spin_unlock(&port->lock);
-
console_atomic_unlock(flags);
}

View File

@ -0,0 +1,38 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat, 16 Feb 2019 09:02:00 +0100
Subject: [PATCH] serial: 8250: export symbols which are used by modules
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/tty/serial/8250/8250_port.c | 2 ++
kernel/printk/printk.c | 1 +
2 files changed, 3 insertions(+)
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2025,6 +2025,7 @@ void clear_ier(struct uart_8250_port *up
}
console_atomic_unlock(flags);
}
+EXPORT_SYMBOL_GPL(clear_ier);
void restore_ier(struct uart_8250_port *up)
{
@@ -2036,6 +2037,7 @@ void restore_ier(struct uart_8250_port *
serial_port_out(port, UART_IER, atomic_read(&ier_value));
console_atomic_unlock(flags);
}
+EXPORT_SYMBOL_GPL(restore_ier);
#ifdef CONFIG_CONSOLE_POLL
/*
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2257,6 +2257,7 @@ int is_console_locked(void)
{
return console_locked;
}
+EXPORT_SYMBOL(is_console_locked);
/**
* console_unlock - unlock the console system

View File

@ -0,0 +1,25 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 15 Feb 2019 14:34:20 +0100
Subject: [PATCH] arm: remove printk_nmi_.*()
It is no longer provided by the printk core code.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/arm/kernel/smp.c | 2 --
1 file changed, 2 deletions(-)
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -682,11 +682,9 @@ void handle_IPI(int ipinr, struct pt_reg
break;
case IPI_CPU_BACKTRACE:
- printk_nmi_enter();
irq_enter();
nmi_cpu_backtrace(regs);
irq_exit();
- printk_nmi_exit();
break;
default:

View File

@ -0,0 +1,67 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Sun, 17 Feb 2019 03:11:20 +0100
Subject: [PATCH] printk: only allow kernel to create emergency messages
Emergency messages exist as a mechanism for the kernel to
communicate critical information to users. They are not meant for
use by userspace. Only allow facility=0 messages to be
processed by the emergency message code.
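For background, a small sketch (hypothetical helper, not part of the patch; it
only mirrors the standard syslog encoding) showing how level and facility are
carried together and why facility 0 identifies the kernel:

  static void split_prival(unsigned int prival,
                           unsigned int *level, unsigned int *facility)
  {
          *level    = prival & 7;   /* e.g. 8 * 3 + 6 -> level 6 (info)      */
          *facility = prival >> 3;  /*                -> facility 3 (daemon) */
  }
  /* Messages injected from userspace normally carry a non-zero facility
   * (LOG_USER by default), so with this patch they no longer reach
   * printk_emergency(). */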
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1754,7 +1754,8 @@ static void printk_write_history(struct
* The console_lock must be held.
*/
static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len,
- const char *text, size_t len, int level)
+ const char *text, size_t len, int level,
+ int facility)
{
struct console *con;
@@ -1774,13 +1775,14 @@ static void call_console_drivers(u64 seq
con->wrote_history = 1;
con->printk_seq = seq - 1;
}
- if (con->write_atomic && level < emergency_console_loglevel) {
+ if (con->write_atomic && level < emergency_console_loglevel &&
+ facility == 0) {
/* skip emergency messages, already printed */
if (con->printk_seq < seq)
con->printk_seq = seq;
continue;
}
- if (con->flags & CON_BOOT) {
+ if (con->flags & CON_BOOT && facility == 0) {
/* skip emergency messages, already printed */
if (con->printk_seq < seq)
con->printk_seq = seq;
@@ -1951,7 +1953,10 @@ asmlinkage int vprintk_emit(int facility
* - text points to beginning of text
* - there is room before text for prefix
*/
- printk_emergency(rbuf, level & 7, ts_nsec, cpu, text, text_len);
+ if (facility == 0) {
+ /* only the kernel can create emergency messages */
+ printk_emergency(rbuf, level & 7, ts_nsec, cpu, text, text_len);
+ }
if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) {
cont_add(ctx, cpu, caller_id, facility, level, lflags, text, text_len);
@@ -2715,8 +2720,8 @@ static int printk_kthread_func(void *dat
&len, printk_time);
console_lock();
- call_console_drivers(master_seq, ext_text,
- ext_len, text, len, msg->level);
+ call_console_drivers(master_seq, ext_text, ext_len, text, len,
+ msg->level, msg->facility);
if (len > 0 || ext_len > 0)
printk_delay(msg->level);
console_unlock();

View File

@ -0,0 +1,45 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 22 Feb 2019 23:02:44 +0100
Subject: [PATCH] printk: devkmsg: llseek: reset clear if it is lost
SEEK_DATA will seek to the last clear record. If this clear record
is no longer in the ring buffer, devkmsg_llseek() will go into an
infinite loop. Fix that by resetting the clear sequence if the old
clear record is no longer in the ring buffer.
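A user-space sketch of the operation that could previously loop forever inside
the kernel (illustrative snippet, not part of the patch):

  #include <fcntl.h>
  #include <unistd.h>

  static void seek_past_clear(void)
  {
          int fd = open("/dev/kmsg", O_RDONLY | O_NONBLOCK);

          if (fd < 0)
                  return;
          /* SEEK_DATA positions the reader after the last "clear" command;
           * devkmsg_llseek() is what serves this request in the kernel. */
          lseek(fd, 0, SEEK_DATA);
          close(fd);
  }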
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -761,6 +761,7 @@ static loff_t devkmsg_llseek(struct file
{
struct devkmsg_user *user = file->private_data;
loff_t ret;
+ u64 seq;
if (!user)
return -EBADF;
@@ -783,7 +784,7 @@ static loff_t devkmsg_llseek(struct file
* changes no global state, and does not clear anything.
*/
for (;;) {
- prb_iter_init(&user->iter, &printk_rb, NULL);
+ prb_iter_init(&user->iter, &printk_rb, &seq);
ret = prb_iter_seek(&user->iter, clear_seq);
if (ret > 0) {
/* seeked to clear seq */
@@ -800,6 +801,10 @@ static loff_t devkmsg_llseek(struct file
break;
}
/* iterator invalid, start over */
+
+ /* reset clear_seq if it is no longer available */
+ if (seq > clear_seq)
+ clear_seq = 0;
}
ret = 0;
break;

View File

@ -0,0 +1,24 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 22 Feb 2019 12:47:13 +0100
Subject: [PATCH] printk: print "rate-limited" message as info
If messages which are injected via kmsg are dropped, they don't need
to be printed as warnings. This avoids latency spikes if the
interface injects a large number of messages.
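For context, a minimal sketch of how such a ratelimit state is typically used
(illustrative; DEFINE_RATELIMIT_STATE() and __ratelimit() are existing
interfaces, the names and numbers are made up):

  #include <linux/ratelimit.h>

  static DEFINE_RATELIMIT_STATE(my_rs, 5 * HZ, 10);  /* at most 10 per 5s */

  static void emit(void)
  {
          if (__ratelimit(&my_rs))
                  pr_info("accepted message\n");
          /* dropped messages are counted in my_rs.missed; the hunk below
           * lowers the severity of the message that reports this count
           * when the state is released */
  }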
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/ratelimit.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/include/linux/ratelimit.h
+++ b/include/linux/ratelimit.h
@@ -59,7 +59,7 @@ static inline void ratelimit_state_exit(
return;
if (rs->missed) {
- pr_warn("%s: %d output lines suppressed due to ratelimiting\n",
+ pr_info("%s: %d output lines suppressed due to ratelimiting\n",
current->comm, rs->missed);
rs->missed = 0;
}

View File

@ -0,0 +1,84 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Wed, 24 Apr 2019 16:36:04 +0200
Subject: [PATCH] printk: kmsg_dump: remove mutex usage
The kmsg dumper can be called from any context, but the dumping
helpers were using a mutex to synchronize the iterator against
concurrent dumps.
Rather than trying to synchronize the iterator, use a local copy
of the iterator during the dump. Then no synchronization is
required.
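For reference, a sketch of a dump client to show which side the local copy
protects (illustrative; struct kmsg_dumper, kmsg_dump_register() and
kmsg_dump_get_line() are the existing interfaces):

  #include <linux/kmsg_dump.h>

  static void my_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
  {
          char line[256];
          size_t len;

          /* with this patch 'dumper' points at a per-call copy, so two
           * concurrent dumps no longer race on the iterator state */
          while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
                  ;  /* push 'line' to persistent storage */
  }

  static struct kmsg_dumper my_dumper = {
          .dump       = my_dump,
          .max_reason = KMSG_DUMP_OOPS,
  };
  /* registered once with kmsg_dump_register(&my_dumper); */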
Reported-by: Scott Wood <swood@redhat.com>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 23 ++++++++++-------------
1 file changed, 10 insertions(+), 13 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -379,8 +379,6 @@ static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;
-static DEFINE_MUTEX(kmsg_dump_lock);
-
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
@@ -2877,6 +2875,7 @@ module_param_named(always_kmsg_dump, alw
*/
void kmsg_dump(enum kmsg_dump_reason reason)
{
+ struct kmsg_dumper dumper_local;
struct kmsg_dumper *dumper;
if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
@@ -2887,16 +2886,18 @@ void kmsg_dump(enum kmsg_dump_reason rea
if (dumper->max_reason && reason > dumper->max_reason)
continue;
- /* initialize iterator with data about the stored records */
- dumper->active = true;
+ /*
+ * use a local copy to avoid modifying the
+ * iterator used by any other cpus/contexts
+ */
+ memcpy(&dumper_local, dumper, sizeof(dumper_local));
- kmsg_dump_rewind(dumper);
+ /* initialize iterator with data about the stored records */
+ dumper_local.active = true;
+ kmsg_dump_rewind(&dumper_local);
/* invoke dumper which will iterate over records */
- dumper->dump(dumper, reason);
-
- /* reset iterator */
- dumper->active = false;
+ dumper_local.dump(&dumper_local, reason);
}
rcu_read_unlock();
}
@@ -3008,9 +3009,7 @@ bool kmsg_dump_get_line(struct kmsg_dump
{
bool ret;
- mutex_lock(&kmsg_dump_lock);
ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len);
- mutex_unlock(&kmsg_dump_lock);
return ret;
}
@@ -3162,9 +3161,7 @@ void kmsg_dump_rewind_nolock(struct kmsg
*/
void kmsg_dump_rewind(struct kmsg_dumper *dumper)
{
- mutex_lock(&kmsg_dump_lock);
kmsg_dump_rewind_nolock(dumper);
- mutex_unlock(&kmsg_dump_lock);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);

View File

@ -0,0 +1,43 @@
From: He Zhe <zhe.he@windriver.com>
Date: Tue, 24 Sep 2019 15:26:39 +0800
Subject: [PATCH] printk: devkmsg: read: Return EPIPE when the first
message user-space wants has gone
When user-space wants to read the first message, that is when user->seq
is 0, and that message has gone, the code currently resets user->seq to
the current first seq. This diverges from the mainline kernel behaviour.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/ABI/testing/dev-kmsg#n39
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/printk/printk.c#n899
We should inform user-space that what it wants has gone by returning EPIPE
in this scenario.
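A user-space sketch of the behaviour this enforces (illustrative, not part of
the patch):

  #include <errno.h>
  #include <unistd.h>

  static void read_one_record(int fd)
  {
          char buf[8192];
          ssize_t n = read(fd, buf, sizeof(buf));

          if (n < 0 && errno == EPIPE) {
                  /* the record we were positioned on has been overwritten;
                   * the next read() continues with the oldest available one */
          }
  }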
Link: https://lore.kernel.org/r/20190924072639.25986-1-zhe.he@windriver.com
Signed-off-by: He Zhe <zhe.he@windriver.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -713,14 +713,10 @@ static ssize_t devkmsg_read(struct file
goto out;
}
- if (user->seq == 0) {
- user->seq = seq;
- } else {
- user->seq++;
- if (user->seq < seq) {
- ret = -EPIPE;
- goto restore_out;
- }
+ user->seq++;
+ if (user->seq < seq) {
+ ret = -EPIPE;
+ goto restore_out;
}
msg = (struct printk_log *)&user->msgbuf[0];

View File

@ -0,0 +1,43 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Mon, 7 Oct 2019 16:20:39 +0200
Subject: [PATCH] printk: handle iterating while buffer changing
The syslog and kmsg_dump readers are provided buffers to fill.
Both try to maximize the provided buffer usage by calculating the
maximum number of messages that can fit. However, if messages are
dropped and new messages are added after that calculation, the
calculation will no longer match.
For syslog, add a check to make sure the provided buffer is not
overfilled.
For kmsg_dump, start over by recalculating the messages
available.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1446,6 +1446,9 @@ static int syslog_print_all(char __user
break;
}
+ if (len + textlen > size)
+ break;
+
if (copy_to_user(buf + len, text, textlen))
len = -EFAULT;
else
@@ -3085,7 +3088,7 @@ bool kmsg_dump_get_buffer(struct kmsg_du
ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq);
if (ret == 0) {
break;
- } else if (ret < 0) {
+ } else if (ret < 0 || seq >= end_seq) {
prb_iter_init(&iter, &printk_rb, &seq);
goto retry;
}

View File

@ -0,0 +1,52 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Tue, 3 Dec 2019 09:14:57 +0100
Subject: [PATCH] printk: hack out emergency loglevel usage
Instead of using an emergency loglevel to determine if atomic
messages should be printed, use oops_in_progress. This conforms
to the decision that latency-causing atomic messages never be
generated during normal operation.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/printk/printk.c | 13 +++----------
1 file changed, 3 insertions(+), 10 deletions(-)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1777,15 +1777,8 @@ static void call_console_drivers(u64 seq
con->wrote_history = 1;
con->printk_seq = seq - 1;
}
- if (con->write_atomic && level < emergency_console_loglevel &&
- facility == 0) {
- /* skip emergency messages, already printed */
- if (con->printk_seq < seq)
- con->printk_seq = seq;
- continue;
- }
if (con->flags & CON_BOOT && facility == 0) {
- /* skip emergency messages, already printed */
+ /* skip boot messages, already printed */
if (con->printk_seq < seq)
con->printk_seq = seq;
continue;
@@ -3171,7 +3164,7 @@ static bool console_can_emergency(int le
for_each_console(con) {
if (!(con->flags & CON_ENABLED))
continue;
- if (con->write_atomic && level < emergency_console_loglevel)
+ if (con->write_atomic && oops_in_progress)
return true;
if (con->write && (con->flags & CON_BOOT))
return true;
@@ -3187,7 +3180,7 @@ static void call_emergency_console_drive
for_each_console(con) {
if (!(con->flags & CON_ENABLED))
continue;
- if (con->write_atomic && level < emergency_console_loglevel) {
+ if (con->write_atomic && oops_in_progress) {
con->write_atomic(con, text, text_len);
continue;
}

View File

@ -0,0 +1,384 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 10 Jan 2020 16:45:31 +0106
Subject: [PATCH] serial: 8250: only atomic lock for console
The atomic console implementation requires that IER is synchronized
between atomic and non-atomic usage. However, it was implemented such
that the console_atomic_lock was performed for all IER access, even
if that port was not a console.
The implementation also used a usage counter to keep track of IER
clear/restore windows. However, this is not needed because the
console_atomic_lock synchronization of IER access will prevent any
situations where IER is prematurely restored or left cleared.
Move the IER access functions to inline macros. They will only
console_atomic_lock if the port is a console. Remove the
restore_ier() function by having clear_ier() return the prior IER
value so that the caller can restore it using set_ier(). Rename the
IER access functions to match other 8250 wrapper macros.
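The resulting call-site idiom, sketched for illustration
(serial8250_clear_IER() and serial8250_set_IER() are the helpers introduced by
this patch):

  unsigned int ier;

  ier = serial8250_clear_IER(up);   /* returns the prior IER value         */
  /* ... program the UART with its interrupts masked ... */
  serial8250_set_IER(up, ier);      /* restore the caller's IER afterwards */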
Suggested-by: Dick Hollenbeck <dick@softplc.com>
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/tty/serial/8250/8250.h | 65 +++++++++++++++++++---------
drivers/tty/serial/8250/8250_core.c | 6 +-
drivers/tty/serial/8250/8250_dma.c | 4 -
drivers/tty/serial/8250/8250_port.c | 81 ++++++++----------------------------
4 files changed, 66 insertions(+), 90 deletions(-)
--- a/drivers/tty/serial/8250/8250.h
+++ b/drivers/tty/serial/8250/8250.h
@@ -96,10 +96,6 @@ struct serial8250_config {
#define SERIAL8250_SHARE_IRQS 0
#endif
-void set_ier(struct uart_8250_port *up, unsigned char ier);
-void clear_ier(struct uart_8250_port *up);
-void restore_ier(struct uart_8250_port *up);
-
#define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \
{ \
.iobase = _base, \
@@ -134,39 +130,64 @@ static inline void serial_dl_write(struc
up->dl_write(up, value);
}
-static inline bool serial8250_set_THRI(struct uart_8250_port *up)
+static inline void serial8250_set_IER(struct uart_8250_port *up,
+ unsigned char ier)
{
- if (up->ier & UART_IER_THRI)
- return false;
- up->ier |= UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
- return true;
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ serial_out(up, UART_IER, ier);
+
+ if (is_console)
+ console_atomic_unlock(flags);
}
-static inline bool serial8250_set_THRI_sier(struct uart_8250_port *up)
+static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up)
{
- if (up->ier & UART_IER_THRI)
- return false;
- up->ier |= UART_IER_THRI;
- set_ier(up, up->ier);
- return true;
+ struct uart_port *port = &up->port;
+ unsigned int clearval = 0;
+ unsigned int prior;
+ unsigned int flags;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (up->capabilities & UART_CAP_UUE)
+ clearval = UART_IER_UUE;
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ prior = serial_port_in(port, UART_IER);
+ serial_port_out(port, UART_IER, clearval);
+
+ if (is_console)
+ console_atomic_unlock(flags);
+
+ return prior;
}
-static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
+static inline bool serial8250_set_THRI(struct uart_8250_port *up)
{
- if (!(up->ier & UART_IER_THRI))
+ if (up->ier & UART_IER_THRI)
return false;
- up->ier &= ~UART_IER_THRI;
- serial_out(up, UART_IER, up->ier);
+ up->ier |= UART_IER_THRI;
+ serial8250_set_IER(up, up->ier);
return true;
}
-static inline bool serial8250_clear_THRI_sier(struct uart_8250_port *up)
+static inline bool serial8250_clear_THRI(struct uart_8250_port *up)
{
if (!(up->ier & UART_IER_THRI))
return false;
up->ier &= ~UART_IER_THRI;
- set_ier(up, up->ier);
+ serial8250_set_IER(up, up->ier);
return true;
}
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -265,7 +265,7 @@ static void serial8250_timeout(struct ti
static void serial8250_backup_timeout(struct timer_list *t)
{
struct uart_8250_port *up = from_timer(up, t, timer);
- unsigned int iir, lsr;
+ unsigned int iir, ier = 0, lsr;
unsigned long flags;
spin_lock_irqsave(&up->port.lock, flags);
@@ -275,7 +275,7 @@ static void serial8250_backup_timeout(st
* based handler.
*/
if (up->port.irq)
- clear_ier(up);
+ ier = serial8250_clear_IER(up);
iir = serial_in(up, UART_IIR);
@@ -298,7 +298,7 @@ static void serial8250_backup_timeout(st
serial8250_tx_chars(up);
if (up->port.irq)
- restore_ier(up);
+ serial8250_set_IER(up, ier);
spin_unlock_irqrestore(&up->port.lock, flags);
--- a/drivers/tty/serial/8250/8250_dma.c
+++ b/drivers/tty/serial/8250/8250_dma.c
@@ -35,7 +35,7 @@ static void __dma_tx_complete(void *para
ret = serial8250_tx_dma(p);
if (ret)
- serial8250_set_THRI_sier(p);
+ serial8250_set_THRI(p);
spin_unlock_irqrestore(&p->port.lock, flags);
}
@@ -98,7 +98,7 @@ int serial8250_tx_dma(struct uart_8250_p
dma_async_issue_pending(dma->txchan);
if (dma->tx_err) {
dma->tx_err = 0;
- serial8250_clear_THRI_sier(p);
+ serial8250_clear_THRI(p);
}
return 0;
err:
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -721,7 +721,7 @@ static void serial8250_set_sleep(struct
serial_out(p, UART_EFR, UART_EFR_ECB);
serial_out(p, UART_LCR, 0);
}
- set_ier(p, sleep ? UART_IERX_SLEEP : 0);
+ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0);
if (p->capabilities & UART_CAP_EFR) {
serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B);
serial_out(p, UART_EFR, efr);
@@ -1390,7 +1390,7 @@ static void serial8250_stop_rx(struct ua
up->ier &= ~(UART_IER_RLSI | UART_IER_RDI);
up->port.read_status_mask &= ~UART_LSR_DR;
- set_ier(up, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1408,7 +1408,7 @@ static void __do_stop_tx_rs485(struct ua
serial8250_clear_and_reinit_fifos(p);
p->ier |= UART_IER_RLSI | UART_IER_RDI;
- set_ier(p, p->ier);
+ serial8250_set_IER(p, p->ier);
}
}
static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t)
@@ -1459,7 +1459,7 @@ static void __stop_tx_rs485(struct uart_
static inline void __do_stop_tx(struct uart_8250_port *p)
{
- if (serial8250_clear_THRI_sier(p))
+ if (serial8250_clear_THRI(p))
serial8250_rpm_put_tx(p);
}
@@ -1509,7 +1509,7 @@ static inline void __start_tx(struct uar
if (up->dma && !up->dma->tx_dma(up))
return;
- if (serial8250_set_THRI_sier(up)) {
+ if (serial8250_set_THRI(up)) {
if (up->bugs & UART_BUG_TXEN) {
unsigned char lsr;
@@ -1616,7 +1616,7 @@ static void serial8250_disable_ms(struct
mctrl_gpio_disable_ms(up->gpios);
up->ier &= ~UART_IER_MSI;
- set_ier(up, up->ier);
+ serial8250_set_IER(up, up->ier);
}
static void serial8250_enable_ms(struct uart_port *port)
@@ -1632,7 +1632,7 @@ static void serial8250_enable_ms(struct
up->ier |= UART_IER_MSI;
serial8250_rpm_get(up);
- set_ier(up, up->ier);
+ serial8250_set_IER(up, up->ier);
serial8250_rpm_put(up);
}
@@ -1991,54 +1991,6 @@ static void wait_for_xmitr(struct uart_8
}
}
-static atomic_t ier_counter = ATOMIC_INIT(0);
-static atomic_t ier_value = ATOMIC_INIT(0);
-
-void set_ier(struct uart_8250_port *up, unsigned char ier)
-{
- struct uart_port *port = &up->port;
- unsigned int flags;
-
- console_atomic_lock(&flags);
- if (atomic_read(&ier_counter) > 0)
- atomic_set(&ier_value, ier);
- else
- serial_port_out(port, UART_IER, ier);
- console_atomic_unlock(flags);
-}
-
-void clear_ier(struct uart_8250_port *up)
-{
- struct uart_port *port = &up->port;
- unsigned int ier_cleared = 0;
- unsigned int flags;
- unsigned int ier;
-
- console_atomic_lock(&flags);
- atomic_inc(&ier_counter);
- ier = serial_port_in(port, UART_IER);
- if (up->capabilities & UART_CAP_UUE)
- ier_cleared = UART_IER_UUE;
- if (ier != ier_cleared) {
- serial_port_out(port, UART_IER, ier_cleared);
- atomic_set(&ier_value, ier);
- }
- console_atomic_unlock(flags);
-}
-EXPORT_SYMBOL_GPL(clear_ier);
-
-void restore_ier(struct uart_8250_port *up)
-{
- struct uart_port *port = &up->port;
- unsigned int flags;
-
- console_atomic_lock(&flags);
- if (atomic_fetch_dec(&ier_counter) == 1)
- serial_port_out(port, UART_IER, atomic_read(&ier_value));
- console_atomic_unlock(flags);
-}
-EXPORT_SYMBOL_GPL(restore_ier);
-
#ifdef CONFIG_CONSOLE_POLL
/*
* Console polling routines for writing and reading from the uart while
@@ -2070,10 +2022,11 @@ static int serial8250_get_poll_char(stru
static void serial8250_put_poll_char(struct uart_port *port,
unsigned char c)
{
+ unsigned int ier;
struct uart_8250_port *up = up_to_u8250p(port);
serial8250_rpm_get(up);
- clear_ier(up);
+ ier = serial8250_clear_IER(up);
wait_for_xmitr(up, BOTH_EMPTY);
/*
@@ -2086,7 +2039,7 @@ static void serial8250_put_poll_char(str
* and restore the IER
*/
wait_for_xmitr(up, BOTH_EMPTY);
- restore_ier(up);
+ serial8250_set_IER(up, ier);
serial8250_rpm_put(up);
}
@@ -2398,7 +2351,7 @@ void serial8250_do_shutdown(struct uart_
*/
spin_lock_irqsave(&port->lock, flags);
up->ier = 0;
- set_ier(up, 0);
+ serial8250_set_IER(up, 0);
spin_unlock_irqrestore(&port->lock, flags);
synchronize_irq(port->irq);
@@ -2683,7 +2636,7 @@ serial8250_do_set_termios(struct uart_po
if (up->capabilities & UART_CAP_RTOIE)
up->ier |= UART_IER_RTOIE;
- set_ier(up, up->ier);
+ serial8250_set_IER(up, up->ier);
if (up->capabilities & UART_CAP_EFR) {
unsigned char efr = 0;
@@ -3193,12 +3146,13 @@ void serial8250_console_write_atomic(str
{
struct uart_port *port = &up->port;
unsigned int flags;
+ unsigned int ier;
console_atomic_lock(&flags);
touch_nmi_watchdog();
- clear_ier(up);
+ ier = serial8250_clear_IER(up);
if (atomic_fetch_inc(&up->console_printing)) {
uart_console_write(port, "\n", 1,
@@ -3208,7 +3162,7 @@ void serial8250_console_write_atomic(str
atomic_dec(&up->console_printing);
wait_for_xmitr(up, BOTH_EMPTY);
- restore_ier(up);
+ serial8250_set_IER(up, ier);
console_atomic_unlock(flags);
}
@@ -3224,13 +3178,14 @@ void serial8250_console_write(struct uar
{
struct uart_port *port = &up->port;
unsigned long flags;
+ unsigned int ier;
touch_nmi_watchdog();
serial8250_rpm_get(up);
spin_lock_irqsave(&port->lock, flags);
- clear_ier(up);
+ ier = serial8250_clear_IER(up);
/* check scratch reg to see if port powered off during system sleep */
if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) {
@@ -3247,7 +3202,7 @@ void serial8250_console_write(struct uar
* and restore the IER
*/
wait_for_xmitr(up, BOTH_EMPTY);
- restore_ier(up);
+ serial8250_set_IER(up, ier);
/*
* The receive handling will happen properly because the

View File

@ -0,0 +1,102 @@
From: John Ogness <john.ogness@linutronix.de>
Date: Fri, 10 Jan 2020 16:45:32 +0106
Subject: [PATCH] serial: 8250: fsl/ingenic/mtk: fix atomic console
A few 8250 implementations have their own IER access. If the port
is a console, wrap the accesses with console_atomic_lock.
Signed-off-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/tty/serial/8250/8250_fsl.c | 9 +++++++++
drivers/tty/serial/8250/8250_ingenic.c | 7 +++++++
drivers/tty/serial/8250/8250_mtk.c | 29 +++++++++++++++++++++++++++--
3 files changed, 43 insertions(+), 2 deletions(-)
--- a/drivers/tty/serial/8250/8250_fsl.c
+++ b/drivers/tty/serial/8250/8250_fsl.c
@@ -57,9 +57,18 @@ int fsl8250_handle_irq(struct uart_port
/* Stop processing interrupts on input overrun */
if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) {
+ unsigned int ca_flags;
unsigned long delay;
+ bool is_console;
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&ca_flags);
up->ier = port->serial_in(port, UART_IER);
+ if (is_console)
+ console_atomic_unlock(ca_flags);
+
if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) {
port->ops->stop_rx(port);
} else {
--- a/drivers/tty/serial/8250/8250_ingenic.c
+++ b/drivers/tty/serial/8250/8250_ingenic.c
@@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic
static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value)
{
+ unsigned int flags;
+ bool is_console;
int ier;
switch (offset) {
@@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(stru
* If we have enabled modem status IRQs we should enable
* modem mode.
*/
+ is_console = uart_console(p);
+ if (is_console)
+ console_atomic_lock(&flags);
ier = p->serial_in(p, UART_IER);
+ if (is_console)
+ console_atomic_unlock(flags);
if (ier & UART_IER_MSI)
value |= UART_MCR_MDCE | UART_MCR_FCM;
--- a/drivers/tty/serial/8250/8250_mtk.c
+++ b/drivers/tty/serial/8250/8250_mtk.c
@@ -212,12 +212,37 @@ static void mtk8250_shutdown(struct uart
static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask));
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ unsigned int ier;
+ bool is_console;
+
+ is_console = uart_console(port);
+
+ if (is_console)
+ console_atomic_lock(&flags);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier & (~mask));
+
+ if (is_console)
+ console_atomic_unlock(flags);
}
static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask)
{
- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask);
+ struct uart_port *port = &up->port;
+ unsigned int flags;
+ unsigned int ier;
+
+ if (uart_console(port))
+ console_atomic_lock(&flags);
+
+ ier = serial_in(up, UART_IER);
+ serial_out(up, UART_IER, ier | mask);
+
+ if (uart_console(port))
+ console_atomic_unlock(flags);
}
static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode)

View File

@ -0,0 +1,217 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 Jan 2020 16:07:04 +0100
Subject: [PATCH 1/7] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its
own lockdep_map
As preparation for replacing the embedded rwsem, give percpu-rwsem its
own lockdep_map.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/percpu-rwsem.h | 29 +++++++++++++++++++----------
kernel/cpu.c | 4 ++--
kernel/locking/percpu-rwsem.c | 16 ++++++++++++----
kernel/locking/rwsem.c | 4 ++--
kernel/locking/rwsem.h | 2 ++
5 files changed, 37 insertions(+), 18 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -15,8 +15,17 @@ struct percpu_rw_semaphore {
struct rw_semaphore rw_sem; /* slowpath */
struct rcuwait writer; /* blocked writer */
int readers_block;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
};
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname },
+#else
+#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
+#endif
+
#define __DEFINE_PERCPU_RWSEM(name, is_static) \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \
is_static struct percpu_rw_semaphore name = { \
@@ -24,7 +33,9 @@ is_static struct percpu_rw_semaphore nam
.read_count = &__percpu_rwsem_rc_##name, \
.rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
.writer = __RCUWAIT_INITIALIZER(name.writer), \
+ __PERCPU_RWSEM_DEP_MAP_INIT(name) \
}
+
#define DEFINE_PERCPU_RWSEM(name) \
__DEFINE_PERCPU_RWSEM(name, /* not static */)
#define DEFINE_STATIC_PERCPU_RWSEM(name) \
@@ -37,7 +48,7 @@ static inline void percpu_down_read(stru
{
might_sleep();
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_);
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
preempt_disable();
/*
@@ -76,13 +87,15 @@ static inline int percpu_down_read_trylo
*/
if (ret)
- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_);
+ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
return ret;
}
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
preempt_disable();
/*
* Same as in percpu_down_read().
@@ -92,8 +105,6 @@ static inline void percpu_up_read(struct
else
__percpu_up_read(sem); /* Unconditional memory barrier */
preempt_enable();
-
- rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_);
}
extern void percpu_down_write(struct percpu_rw_semaphore *);
@@ -110,15 +121,13 @@ extern void percpu_free_rwsem(struct per
__percpu_init_rwsem(sem, #sem, &rwsem_key); \
})
-#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem)
-
-#define percpu_rwsem_assert_held(sem) \
- lockdep_assert_held(&(sem)->rw_sem)
+#define percpu_rwsem_is_held(sem) lockdep_is_held(sem)
+#define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem)
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
- lock_release(&sem->rw_sem.dep_map, 1, ip);
+ lock_release(&sem->dep_map, 1, ip);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
if (!read)
atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
@@ -128,7 +137,7 @@ static inline void percpu_rwsem_release(
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
- lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
+ lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
if (!read)
atomic_long_set(&sem->rw_sem.owner, (long)current);
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void)
static void lockdep_acquire_cpus_lock(void)
{
- rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+ rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}
static void lockdep_release_cpus_lock(void)
{
- rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
+ rwsem_release(&cpu_hotplug_lock.dep_map, 1, _THIS_IP_);
}
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -11,7 +11,7 @@
#include "rwsem.h"
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
- const char *name, struct lock_class_key *rwsem_key)
+ const char *name, struct lock_class_key *key)
{
sem->read_count = alloc_percpu(int);
if (unlikely(!sem->read_count))
@@ -19,9 +19,13 @@ int __percpu_init_rwsem(struct percpu_rw
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
rcu_sync_init(&sem->rss);
- __init_rwsem(&sem->rw_sem, name, rwsem_key);
+ init_rwsem(&sem->rw_sem);
rcuwait_init(&sem->writer);
sem->readers_block = 0;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
@@ -142,10 +146,12 @@ static bool readers_active_check(struct
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
- down_write(&sem->rw_sem);
+ __down_write(&sem->rw_sem);
/*
* Notify new readers to block; up until now, and thus throughout the
@@ -168,6 +174,8 @@ EXPORT_SYMBOL_GPL(percpu_down_write);
void percpu_up_write(struct percpu_rw_semaphore *sem)
{
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
/*
* Signal the writer is done, no fast path yet.
*
@@ -183,7 +191,7 @@ void percpu_up_write(struct percpu_rw_se
/*
* Release the write lock, this will allow readers back in the game.
*/
- up_write(&sem->rw_sem);
+ __up_write(&sem->rw_sem);
/*
* Once this completes (at least one RCU-sched grace period hence) the
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1383,7 +1383,7 @@ static inline int __down_read_trylock(st
/*
* lock for writing
*/
-static inline void __down_write(struct rw_semaphore *sem)
+inline void __down_write(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
@@ -1446,7 +1446,7 @@ inline void __up_read(struct rw_semaphor
/*
* unlock after writing
*/
-static inline void __up_write(struct rw_semaphore *sem)
+inline void __up_write(struct rw_semaphore *sem)
{
long tmp;
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -6,5 +6,7 @@
extern void __down_read(struct rw_semaphore *sem);
extern void __up_read(struct rw_semaphore *sem);
+extern void __down_write(struct rw_semaphore *sem);
+extern void __up_write(struct rw_semaphore *sem);
#endif /* __INTERNAL_RWSEM_H */

View File

@ -0,0 +1,75 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 Jan 2020 16:07:05 +0100
Subject: [PATCH 2/7] locking/percpu-rwsem: Convert to bool
Use bool where possible.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/percpu-rwsem.h | 6 +++---
kernel/locking/percpu-rwsem.c | 8 ++++----
2 files changed, 7 insertions(+), 7 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -41,7 +41,7 @@ is_static struct percpu_rw_semaphore nam
#define DEFINE_STATIC_PERCPU_RWSEM(name) \
__DEFINE_PERCPU_RWSEM(name, static)
-extern int __percpu_down_read(struct percpu_rw_semaphore *, int);
+extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
extern void __percpu_up_read(struct percpu_rw_semaphore *);
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
@@ -69,9 +69,9 @@ static inline void percpu_down_read(stru
preempt_enable();
}
-static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
+static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- int ret = 1;
+ bool ret = true;
preempt_disable();
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
/*
* Due to having preemption disabled the decrement happens on
@@ -69,7 +69,7 @@ int __percpu_down_read(struct percpu_rw_
* release in percpu_up_write().
*/
if (likely(!smp_load_acquire(&sem->readers_block)))
- return 1;
+ return true;
/*
* Per the above comment; we still have preemption disabled and
@@ -78,7 +78,7 @@ int __percpu_down_read(struct percpu_rw_
__percpu_up_read(sem);
if (try)
- return 0;
+ return false;
/*
* We either call schedule() in the wait, or we'll fall through
@@ -94,7 +94,7 @@ int __percpu_down_read(struct percpu_rw_
__up_read(&sem->rw_sem);
preempt_disable();
- return 1;
+ return true;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);

View File

@ -0,0 +1,53 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 Jan 2020 16:07:06 +0100
Subject: [PATCH 3/7] locking/percpu-rwsem: Move __this_cpu_inc() into the
slowpath
As preparation to rework __percpu_down_read() move the
__this_cpu_inc() into it.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/percpu-rwsem.h | 10 ++++++----
kernel/locking/percpu-rwsem.c | 2 ++
2 files changed, 8 insertions(+), 4 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -59,8 +59,9 @@ static inline void percpu_down_read(stru
* and that once the synchronize_rcu() is done, the writer will see
* anything we did within this RCU-sched read-size critical section.
*/
- __this_cpu_inc(*sem->read_count);
- if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ if (likely(rcu_sync_is_idle(&sem->rss)))
+ __this_cpu_inc(*sem->read_count);
+ else
__percpu_down_read(sem, false); /* Unconditional memory barrier */
/*
* The preempt_enable() prevents the compiler from
@@ -77,8 +78,9 @@ static inline bool percpu_down_read_tryl
/*
* Same as in percpu_down_read().
*/
- __this_cpu_inc(*sem->read_count);
- if (unlikely(!rcu_sync_is_idle(&sem->rss)))
+ if (likely(rcu_sync_is_idle(&sem->rss)))
+ __this_cpu_inc(*sem->read_count);
+ else
ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
preempt_enable();
/*
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -47,6 +47,8 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
+ __this_cpu_inc(*sem->read_count);
+
/*
* Due to having preemption disabled the decrement happens on
* the same CPU as the increment, avoiding the

View File

@ -0,0 +1,50 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 Jan 2020 16:07:07 +0100
Subject: [PATCH 4/7] locking/percpu-rwsem: Extract
__percpu_down_read_trylock()
In preparation for removing the embedded rwsem and building a custom
lock, extract the read-trylock primitive.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/locking/percpu-rwsem.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_
}
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
__this_cpu_inc(*sem->read_count);
@@ -73,11 +73,18 @@ bool __percpu_down_read(struct percpu_rw
if (likely(!smp_load_acquire(&sem->readers_block)))
return true;
- /*
- * Per the above comment; we still have preemption disabled and
- * will thus decrement on the same CPU as we incremented.
- */
- __percpu_up_read(sem);
+ __this_cpu_dec(*sem->read_count);
+
+ /* Prod writer to re-evaluate readers_active_check() */
+ rcuwait_wake_up(&sem->writer);
+
+ return false;
+}
+
+bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+{
+ if (__percpu_down_read_trylock(sem))
+ return true;
if (try)
return false;

View File

@ -0,0 +1,433 @@
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 Jan 2020 16:07:08 +0100
Subject: [PATCH 5/7] locking/percpu-rwsem: Remove the embedded rwsem
The filesystem freezer uses percpu-rwsem in a way that is effectively
write_non_owner() and achieves this with a few horrible hacks that
rely on the rwsem (!percpu) implementation.
When PREEMPT_RT replaces the rwsem implementation with a PI-aware
variant, this comes apart.
Remove the embedded rwsem and implement it using a waitqueue and an
atomic_t.
- make readers_block an atomic, and use it, with the waitqueue
for a blocking test-and-set write-side.
- have the read-side wait for the 'lock' state to clear.
Have the waiters use FIFO queueing and mark them (reader/writer) with
a new WQ_FLAG. Use a custom wake_function to wake either a single
writer or all readers until a writer.
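The caller-visible API is unchanged by this rework; a minimal usage sketch
(illustrative only, using the existing percpu-rwsem interfaces):

  #include <linux/percpu-rwsem.h>

  static DEFINE_STATIC_PERCPU_RWSEM(my_sem);

  static void reader(void)
  {
          percpu_down_read(&my_sem);      /* per-CPU fast path when no writer */
          /* ... read-side critical section ... */
          percpu_up_read(&my_sem);
  }

  static void writer(void)
  {
          percpu_down_write(&my_sem);     /* now blocks on the waitqueue plus
                                             atomic_t instead of the rwsem */
          /* ... write-side critical section ... */
          percpu_up_write(&my_sem);
  }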
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/percpu-rwsem.h | 19 +----
include/linux/rwsem.h | 6 -
include/linux/wait.h | 1
kernel/locking/percpu-rwsem.c | 153 ++++++++++++++++++++++++++++++------------
kernel/locking/rwsem.c | 11 +--
kernel/locking/rwsem.h | 12 ---
6 files changed, 123 insertions(+), 79 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -3,18 +3,18 @@
#define _LINUX_PERCPU_RWSEM_H
#include <linux/atomic.h>
-#include <linux/rwsem.h>
#include <linux/percpu.h>
#include <linux/rcuwait.h>
+#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>
struct percpu_rw_semaphore {
struct rcu_sync rss;
unsigned int __percpu *read_count;
- struct rw_semaphore rw_sem; /* slowpath */
- struct rcuwait writer; /* blocked writer */
- int readers_block;
+ struct rcuwait writer;
+ wait_queue_head_t waiters;
+ atomic_t block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -31,8 +31,9 @@ static DEFINE_PER_CPU(unsigned int, __pe
is_static struct percpu_rw_semaphore name = { \
.rss = __RCU_SYNC_INITIALIZER(name.rss), \
.read_count = &__percpu_rwsem_rc_##name, \
- .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \
.writer = __RCUWAIT_INITIALIZER(name.writer), \
+ .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters), \
+ .block = ATOMIC_INIT(0), \
__PERCPU_RWSEM_DEP_MAP_INIT(name) \
}
@@ -130,20 +131,12 @@ static inline void percpu_rwsem_release(
bool read, unsigned long ip)
{
lock_release(&sem->dep_map, 1, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- if (!read)
- atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
-#endif
}
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
bool read, unsigned long ip)
{
lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- if (!read)
- atomic_long_set(&sem->rw_sem.owner, (long)current);
-#endif
}
#endif
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -53,12 +53,6 @@ struct rw_semaphore {
#endif
};
-/*
- * Setting all bits of the owner field except bit 0 will indicate
- * that the rwsem is writer-owned with an unknown owner.
- */
-#define RWSEM_OWNER_UNKNOWN (-2L)
-
/* In all implementations count != 0 means locked */
static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -20,6 +20,7 @@ int default_wake_function(struct wait_qu
#define WQ_FLAG_EXCLUSIVE 0x01
#define WQ_FLAG_WOKEN 0x02
#define WQ_FLAG_BOOKMARK 0x04
+#define WQ_FLAG_CUSTOM 0x08
/*
* A single wait-queue entry structure:
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,15 +1,14 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
-#include <linux/rwsem.h>
#include <linux/percpu.h>
+#include <linux/wait.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
+#include <linux/sched/task.h>
#include <linux/errno.h>
-#include "rwsem.h"
-
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
{
@@ -17,11 +16,10 @@ int __percpu_init_rwsem(struct percpu_rw
if (unlikely(!sem->read_count))
return -ENOMEM;
- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
rcu_sync_init(&sem->rss);
- init_rwsem(&sem->rw_sem);
rcuwait_init(&sem->writer);
- sem->readers_block = 0;
+ init_waitqueue_head(&sem->waiters);
+ atomic_set(&sem->block, 0);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
@@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(s
* the same CPU as the increment, avoiding the
* increment-on-one-CPU-and-decrement-on-another problem.
*
- * If the reader misses the writer's assignment of readers_block, then
- * the writer is guaranteed to see the reader's increment.
+ * If the reader misses the writer's assignment of sem->block, then the
+ * writer is guaranteed to see the reader's increment.
*
* Conversely, any readers that increment their sem->read_count after
- * the writer looks are guaranteed to see the readers_block value,
- * which in turn means that they are guaranteed to immediately
- * decrement their sem->read_count, so that it doesn't matter that the
- * writer missed them.
+ * the writer looks are guaranteed to see the sem->block value, which
+ * in turn means that they are guaranteed to immediately decrement
+ * their sem->read_count, so that it doesn't matter that the writer
+ * missed them.
*/
smp_mb(); /* A matches D */
/*
- * If !readers_block the critical section starts here, matched by the
+ * If !sem->block the critical section starts here, matched by the
* release in percpu_up_write().
*/
- if (likely(!smp_load_acquire(&sem->readers_block)))
+ if (likely(!atomic_read_acquire(&sem->block)))
return true;
__this_cpu_dec(*sem->read_count);
@@ -81,6 +79,88 @@ static bool __percpu_down_read_trylock(s
return false;
}
+static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem)
+{
+ if (atomic_read(&sem->block))
+ return false;
+
+ return atomic_xchg(&sem->block, 1) == 0;
+}
+
+static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader)
+{
+ if (reader) {
+ bool ret;
+
+ preempt_disable();
+ ret = __percpu_down_read_trylock(sem);
+ preempt_enable();
+
+ return ret;
+ }
+ return __percpu_down_write_trylock(sem);
+}
+
+/*
+ * The return value of wait_queue_entry::func means:
+ *
+ * <0 - error, wakeup is terminated and the error is returned
+ * 0 - no wakeup, a next waiter is tried
+ * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive.
+ *
+ * We use EXCLUSIVE for both readers and writers to preserve FIFO order,
+ * and play games with the return value to allow waking multiple readers.
+ *
+ * Specifically, we wake readers until we've woken a single writer, or until a
+ * trylock fails.
+ */
+static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int wake_flags,
+ void *key)
+{
+ struct task_struct *p = get_task_struct(wq_entry->private);
+ bool reader = wq_entry->flags & WQ_FLAG_CUSTOM;
+ struct percpu_rw_semaphore *sem = key;
+
+ /* concurrent against percpu_down_write(), can get stolen */
+ if (!__percpu_rwsem_trylock(sem, reader))
+ return 1;
+
+ list_del_init(&wq_entry->entry);
+ smp_store_release(&wq_entry->private, NULL);
+
+ wake_up_process(p);
+ put_task_struct(p);
+
+ return !reader; /* wake (readers until) 1 writer */
+}
+
+static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
+{
+ DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function);
+ bool wait;
+
+ spin_lock_irq(&sem->waiters.lock);
+ /*
+ * Serialize against the wakeup in percpu_up_write(), if we fail
+ * the trylock, the wakeup must see us on the list.
+ */
+ wait = !__percpu_rwsem_trylock(sem, reader);
+ if (wait) {
+ wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM;
+ __add_wait_queue_entry_tail(&sem->waiters, &wq_entry);
+ }
+ spin_unlock_irq(&sem->waiters.lock);
+
+ while (wait) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!smp_load_acquire(&wq_entry.private))
+ break;
+ schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+}
+
bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
if (__percpu_down_read_trylock(sem))
@@ -89,20 +169,10 @@ bool __percpu_down_read(struct percpu_rw
if (try)
return false;
- /*
- * We either call schedule() in the wait, or we'll fall through
- * and reschedule on the preempt_enable() in percpu_down_read().
- */
- preempt_enable_no_resched();
-
- /*
- * Avoid lockdep for the down/up_read() we already have them.
- */
- __down_read(&sem->rw_sem);
- this_cpu_inc(*sem->read_count);
- __up_read(&sem->rw_sem);
-
+ preempt_enable();
+ percpu_rwsem_wait(sem, /* .reader = */ true);
preempt_disable();
+
return true;
}
EXPORT_SYMBOL_GPL(__percpu_down_read);
@@ -117,7 +187,7 @@ void __percpu_up_read(struct percpu_rw_s
*/
__this_cpu_dec(*sem->read_count);
- /* Prod writer to recheck readers_active */
+ /* Prod writer to re-evaluate readers_active_check() */
rcuwait_wake_up(&sem->writer);
}
EXPORT_SYMBOL_GPL(__percpu_up_read);
@@ -137,6 +207,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read);
* zero. If this sum is zero, then it is stable due to the fact that if any
* newly arriving readers increment a given counter, they will immediately
* decrement that same counter.
+ *
+ * Assumes sem->block is set.
*/
static bool readers_active_check(struct percpu_rw_semaphore *sem)
{
@@ -160,23 +232,22 @@ void percpu_down_write(struct percpu_rw_
/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
- __down_write(&sem->rw_sem);
-
/*
- * Notify new readers to block; up until now, and thus throughout the
- * longish rcu_sync_enter() above, new readers could still come in.
+ * Try set sem->block; this provides writer-writer exclusion.
+ * Having sem->block set makes new readers block.
*/
- WRITE_ONCE(sem->readers_block, 1);
+ if (!__percpu_down_write_trylock(sem))
+ percpu_rwsem_wait(sem, /* .reader = */ false);
- smp_mb(); /* D matches A */
+ /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */
/*
- * If they don't see our writer of readers_block, then we are
- * guaranteed to see their sem->read_count increment, and therefore
- * will wait for them.
+ * If they don't see our store of sem->block, then we are guaranteed to
+ * see their sem->read_count increment, and therefore will wait for
+ * them.
*/
- /* Wait for all now active readers to complete. */
+ /* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem));
}
EXPORT_SYMBOL_GPL(percpu_down_write);
@@ -195,12 +266,12 @@ void percpu_up_write(struct percpu_rw_se
* Therefore we force it through the slow path which guarantees an
* acquire and thereby guarantees the critical section's consistency.
*/
- smp_store_release(&sem->readers_block, 0);
+ atomic_set_release(&sem->block, 0);
/*
- * Release the write lock, this will allow readers back in the game.
+ * Prod any pending reader/writer to make progress.
*/
- __up_write(&sem->rw_sem);
+ __wake_up(&sem->waiters, TASK_NORMAL, 1, sem);
/*
* Once this completes (at least one RCU-sched grace period hence) the
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,7 +28,6 @@
#include <linux/rwsem.h>
#include <linux/atomic.h>
-#include "rwsem.h"
#include "lock_events.h"
/*
@@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_own
unsigned long flags;
bool ret = true;
- BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
-
if (need_resched()) {
lockevent_inc(rwsem_opt_fail);
return false;
@@ -1338,7 +1335,7 @@ static struct rw_semaphore *rwsem_downgr
/*
* lock for reading
*/
-inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct rw_semaphore *sem)
{
if (!rwsem_read_trylock(sem)) {
rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
@@ -1383,7 +1380,7 @@ static inline int __down_read_trylock(st
/*
* lock for writing
*/
-inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct rw_semaphore *sem)
{
long tmp = RWSEM_UNLOCKED_VALUE;
@@ -1426,7 +1423,7 @@ static inline int __down_write_trylock(s
/*
* unlock after reading
*/
-inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct rw_semaphore *sem)
{
long tmp;
@@ -1446,7 +1443,7 @@ inline void __up_read(struct rw_semaphor
/*
* unlock after writing
*/
-inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct rw_semaphore *sem)
{
long tmp;
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __INTERNAL_RWSEM_H
-#define __INTERNAL_RWSEM_H
-#include <linux/rwsem.h>
-
-extern void __down_read(struct rw_semaphore *sem);
-extern void __up_read(struct rw_semaphore *sem);
-extern void __down_write(struct rw_semaphore *sem);
-extern void __up_write(struct rw_semaphore *sem);
-
-#endif /* __INTERNAL_RWSEM_H */
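For orientation, a minimal usage sketch of the per-CPU rwsem API that this rework keeps unchanged; the names example_sem, example_reader and example_writer are illustrative and not part of the patch:

/* Illustrative sketch only; not part of the patch. */
#include <linux/percpu-rwsem.h>

static DEFINE_STATIC_PERCPU_RWSEM(example_sem);

static void example_reader(void)
{
	/* Fast path: a per-CPU counter increment while rcu_sync is idle;
	 * otherwise __percpu_down_read() may sleep in percpu_rwsem_wait(). */
	percpu_down_read(&example_sem);
	/* read-side critical section */
	percpu_up_read(&example_sem);
}

static void example_writer(void)
{
	/* Sets sem->block for writer-writer exclusion, then waits until
	 * readers_active_check() sees no active readers. */
	percpu_down_write(&example_sem);
	/* write-side critical section */
	percpu_up_write(&example_sem);
}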

View File

@ -0,0 +1,85 @@
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Fri, 31 Jan 2020 16:07:09 +0100
Subject: [PATCH 6/7] locking/percpu-rwsem: Fold __percpu_up_read()
Now that __percpu_up_read() is only ever used from percpu_up_read(),
merge them; it's a small function.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/percpu-rwsem.h | 19 +++++++++++++++----
kernel/exit.c | 1 +
kernel/locking/percpu-rwsem.c | 15 ---------------
3 files changed, 16 insertions(+), 19 deletions(-)
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -43,7 +43,6 @@ is_static struct percpu_rw_semaphore nam
__DEFINE_PERCPU_RWSEM(name, static)
extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);
-extern void __percpu_up_read(struct percpu_rw_semaphore *);
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
@@ -103,10 +102,22 @@ static inline void percpu_up_read(struct
/*
* Same as in percpu_down_read().
*/
- if (likely(rcu_sync_is_idle(&sem->rss)))
+ if (likely(rcu_sync_is_idle(&sem->rss))) {
__this_cpu_dec(*sem->read_count);
- else
- __percpu_up_read(sem); /* Unconditional memory barrier */
+ } else {
+ /*
+ * slowpath; reader will only ever wake a single blocked
+ * writer.
+ */
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to
+ * aggregate zero, as that is the only time it matters) they
+ * will also see our critical section.
+ */
+ __this_cpu_dec(*sem->read_count);
+ rcuwait_wake_up(&sem->writer);
+ }
preempt_enable();
}
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -258,6 +258,7 @@ void rcuwait_wake_up(struct rcuwait *w)
wake_up_process(task);
rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(rcuwait_wake_up);
/*
* Determine if a process group is "orphaned", according to the POSIX
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -177,21 +177,6 @@ bool __percpu_down_read(struct percpu_rw
}
EXPORT_SYMBOL_GPL(__percpu_down_read);
-void __percpu_up_read(struct percpu_rw_semaphore *sem)
-{
- smp_mb(); /* B matches C */
- /*
- * In other words, if they see our decrement (presumably to aggregate
- * zero, as that is the only time it matters) they will also see our
- * critical section.
- */
- __this_cpu_dec(*sem->read_count);
-
- /* Prod writer to re-evaluate readers_active_check() */
- rcuwait_wake_up(&sem->writer);
-}
-EXPORT_SYMBOL_GPL(__percpu_up_read);
-
#define per_cpu_sum(var) \
({ \
typeof(var) __sum = 0; \

View File

@ -0,0 +1,26 @@
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Fri, 31 Jan 2020 16:07:10 +0100
Subject: [PATCH 7/7] locking/percpu-rwsem: Add might_sleep() for writer
locking
We are missing this annotation in percpu_down_write(). Correct
this.
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200108013305.7732-1-dave@stgolabs.net
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/locking/percpu-rwsem.c | 1 +
1 file changed, 1 insertion(+)
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -212,6 +212,7 @@ static bool readers_active_check(struct
void percpu_down_write(struct percpu_rw_semaphore *sem)
{
+ might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
/* Notify readers to take the slow path. */
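To illustrate what the annotation buys, here is a hypothetical broken caller (not from the patch): with CONFIG_DEBUG_ATOMIC_SLEEP enabled, the new might_sleep() reports it even when the semaphore is uncontended and the slow path would never run.

/* Hypothetical caller, shown only to illustrate the annotation. */
static DEFINE_SPINLOCK(example_lock);

static void example_broken_caller(struct percpu_rw_semaphore *sem)
{
	spin_lock(&example_lock);	/* atomic context on !PREEMPT_RT */
	percpu_down_write(sem);		/* might_sleep() now warns here */
	/* ... */
	percpu_up_write(sem);
	spin_unlock(&example_lock);
}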

View File

@ -0,0 +1,192 @@
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 15 Nov 2019 18:54:20 +0100
Subject: [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular
spinlock_t
Bit spinlocks are problematic if PREEMPT_RT is enabled, because they
disable preemption, which is undesired for latency reasons and breaks when
regular spinlocks are taken within the bit_spinlock locked region because
regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT
replaces the bit spinlocks with regular spinlocks to avoid this problem.
Bit spinlocks are also not covered by lock debugging, e.g. lockdep.
Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[bigeasy: remove the wrapper and use always spinlock_t and move it into
the padding hole]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
v2…v3: rename uptodate_lock to b_uptodate_lock.
v1…v2: Move the spinlock_t to the padding hole as per Jan Kara. pahole says
its total size remained unchanged, before
| atomic_t b_count; /* 96 4 */
|
| /* size: 104, cachelines: 2, members: 12 */
| /* padding: 4 */
| /* last cacheline: 40 bytes */
after
| atomic_t b_count; /* 96 4 */
| spinlock_t uptodate_lock; /* 100 4 */
|
| /* size: 104, cachelines: 2, members: 13 */
| /* last cacheline: 40 bytes */
fs/buffer.c | 19 +++++++------------
fs/ext4/page-io.c | 8 +++-----
fs/ntfs/aops.c | 9 +++------
include/linux/buffer_head.h | 6 +++---
4 files changed, 16 insertions(+), 26 deletions(-)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,8 +275,7 @@ static void end_buffer_async_read(struct
* decide that the page is now completely done.
*/
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -289,8 +288,7 @@ static void end_buffer_async_read(struct
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors and they are all
@@ -302,8 +300,7 @@ static void end_buffer_async_read(struct
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
@@ -331,8 +328,7 @@ void end_buffer_async_write(struct buffe
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_write(bh);
unlock_buffer(bh);
@@ -344,14 +340,12 @@ void end_buffer_async_write(struct buffe
}
tmp = tmp->b_this_page;
}
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
end_page_writeback(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
EXPORT_SYMBOL(end_buffer_async_write);
@@ -3345,6 +3339,7 @@ struct buffer_head *alloc_buffer_head(gf
struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
if (ret) {
INIT_LIST_HEAD(&ret->b_assoc_buffers);
+ spin_lock_init(&ret->b_uptodate_lock);
preempt_disable();
__this_cpu_inc(bh_accounting.nr);
recalc_bh_state();
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -87,11 +87,10 @@ static void ext4_finish_bio(struct bio *
}
bh = head = page_buffers(page);
/*
- * We check all buffers in the page under BH_Uptodate_Lock
+ * We check all buffers in the page under b_uptodate_lock
* to avoid races with other end io clearing async_write flags
*/
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+ spin_lock_irqsave(&head->b_uptodate_lock, flags);
do {
if (bh_offset(bh) < bio_start ||
bh_offset(bh) + bh->b_size > bio_end) {
@@ -103,8 +102,7 @@ static void ext4_finish_bio(struct bio *
if (bio->bi_status)
buffer_io_error(bh);
} while ((bh = bh->b_this_page) != head);
- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
fscrypt_free_bounce_page(bounce_page);
end_page_writeback(page);
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(s
"0x%llx.", (unsigned long long)bh->b_blocknr);
}
first = page_buffers(page);
- local_irq_save(flags);
- bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
clear_buffer_async_read(bh);
unlock_buffer(bh);
tmp = bh;
@@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(s
}
tmp = tmp->b_this_page;
} while (tmp != bh);
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
/*
* If none of the buffers had errors then we can set the page uptodate,
* but we first have to perform the post read mst fixups, if the
@@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(s
unlock_page(page);
return;
still_busy:
- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
- local_irq_restore(flags);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -22,9 +22,6 @@ enum bh_state_bits {
BH_Dirty, /* Is dirty */
BH_Lock, /* Is locked */
BH_Req, /* Has been submitted for I/O */
- BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
- * IO completion of other buffers in the page
- */
BH_Mapped, /* Has a disk mapping */
BH_New, /* Disk mapping was newly created by get_block */
@@ -76,6 +73,9 @@ struct buffer_head {
struct address_space *b_assoc_map; /* mapping this buffer is
associated with */
atomic_t b_count; /* users using this buffer_head */
+ spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to
+ * serialise IO completion of other
+ * buffers in the page */
};
/*
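A schematic of the RT problem described in the commit message, using the old buffer_head locking as the example (illustrative only; nested_lock is a made-up stand-in for any regular spinlock taken inside the region):

/* Illustrative only: why the bit spinlock breaks on PREEMPT_RT.
 * bit_spin_lock() disables preemption; a spinlock_t nested inside is a
 * sleeping lock on RT, so it would schedule with preemption disabled. */
bit_spin_lock(BH_Uptodate_Lock, &first->b_state);	/* preempt_disable() */
spin_lock(&nested_lock);				/* sleeps on RT -> invalid */
/* ... */
spin_unlock(&nested_lock);
bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);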

View File

@ -0,0 +1,109 @@
From: Clark Williams <williams@redhat.com>
Date: Mon, 15 Jul 2019 15:25:00 -0500
Subject: [PATCH] thermal/x86_pkg_temp: Make pkg_temp_lock a raw_spinlock_t
The spinlock pkg_temp_lock has the potential of being taken in atomic
context because it can be acquired from the thermal IRQ vector.
It is static and of limited scope, so go ahead and make it a raw spinlock.
Signed-off-by: Clark Williams <williams@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/thermal/intel/x86_pkg_temp_thermal.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
--- a/drivers/thermal/intel/x86_pkg_temp_thermal.c
+++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c
@@ -63,7 +63,7 @@ static int max_id __read_mostly;
/* Array of zone pointers */
static struct zone_device **zones;
/* Serializes interrupt notification, work and hotplug */
-static DEFINE_SPINLOCK(pkg_temp_lock);
+static DEFINE_RAW_SPINLOCK(pkg_temp_lock);
/* Protects zone operation in the work function against hotplug removal */
static DEFINE_MUTEX(thermal_zone_mutex);
@@ -266,12 +266,12 @@ static void pkg_temp_thermal_threshold_w
u64 msr_val, wr_val;
mutex_lock(&thermal_zone_mutex);
- spin_lock_irq(&pkg_temp_lock);
+ raw_spin_lock_irq(&pkg_temp_lock);
++pkg_work_cnt;
zonedev = pkg_temp_thermal_get_dev(cpu);
if (!zonedev) {
- spin_unlock_irq(&pkg_temp_lock);
+ raw_spin_unlock_irq(&pkg_temp_lock);
mutex_unlock(&thermal_zone_mutex);
return;
}
@@ -285,7 +285,7 @@ static void pkg_temp_thermal_threshold_w
}
enable_pkg_thres_interrupt();
- spin_unlock_irq(&pkg_temp_lock);
+ raw_spin_unlock_irq(&pkg_temp_lock);
/*
* If tzone is not NULL, then thermal_zone_mutex will prevent the
@@ -310,7 +310,7 @@ static int pkg_thermal_notify(u64 msr_va
struct zone_device *zonedev;
unsigned long flags;
- spin_lock_irqsave(&pkg_temp_lock, flags);
+ raw_spin_lock_irqsave(&pkg_temp_lock, flags);
++pkg_interrupt_cnt;
disable_pkg_thres_interrupt();
@@ -322,7 +322,7 @@ static int pkg_thermal_notify(u64 msr_va
pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work);
}
- spin_unlock_irqrestore(&pkg_temp_lock, flags);
+ raw_spin_unlock_irqrestore(&pkg_temp_lock, flags);
return 0;
}
@@ -368,9 +368,9 @@ static int pkg_temp_thermal_device_add(u
zonedev->msr_pkg_therm_high);
cpumask_set_cpu(cpu, &zonedev->cpumask);
- spin_lock_irq(&pkg_temp_lock);
+ raw_spin_lock_irq(&pkg_temp_lock);
zones[id] = zonedev;
- spin_unlock_irq(&pkg_temp_lock);
+ raw_spin_unlock_irq(&pkg_temp_lock);
return 0;
}
@@ -407,7 +407,7 @@ static int pkg_thermal_cpu_offline(unsig
}
/* Protect against work and interrupts */
- spin_lock_irq(&pkg_temp_lock);
+ raw_spin_lock_irq(&pkg_temp_lock);
/*
* Check whether this cpu was the current target and store the new
@@ -439,9 +439,9 @@ static int pkg_thermal_cpu_offline(unsig
* To cancel the work we need to drop the lock, otherwise
* we might deadlock if the work needs to be flushed.
*/
- spin_unlock_irq(&pkg_temp_lock);
+ raw_spin_unlock_irq(&pkg_temp_lock);
cancel_delayed_work_sync(&zonedev->work);
- spin_lock_irq(&pkg_temp_lock);
+ raw_spin_lock_irq(&pkg_temp_lock);
/*
* If this is not the last cpu in the package and the work
* did not run after we dropped the lock above, then we
@@ -452,7 +452,7 @@ static int pkg_thermal_cpu_offline(unsig
pkg_thermal_schedule_work(target, &zonedev->work);
}
- spin_unlock_irq(&pkg_temp_lock);
+ raw_spin_unlock_irq(&pkg_temp_lock);
/* Final cleanup if this is the last cpu */
if (lastcpu)
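The underlying rule, sketched with hypothetical names (not part of the patch): a lock that can be taken from hard interrupt context must be a raw_spinlock_t on PREEMPT_RT, because spinlock_t becomes a sleeping lock there.

/* Illustrative sketch with hypothetical names. */
static DEFINE_RAW_SPINLOCK(example_lock);

static irqreturn_t example_irq_handler(int irq, void *data)
{
	/* hard IRQ context: a spinlock_t would sleep on PREEMPT_RT */
	raw_spin_lock(&example_lock);
	/* ... */
	raw_spin_unlock(&example_lock);
	return IRQ_HANDLED;
}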

View File

@ -0,0 +1,30 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 15 Nov 2019 18:04:07 +0100
Subject: [PATCH] perf/core: Add SRCU annotation for pmus list walk
Since commit
28875945ba98d ("rcu: Add support for consolidated-RCU reader checking")
there is an additional check to ensure that an RCU related lock is held
while the RCU list is iterated.
This section holds the SRCU reader lock instead.
Add annotation to list_for_each_entry_rcu() that pmus_srcu must be
acquired during the list traversal.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/events/core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10264,7 +10264,7 @@ static struct pmu *perf_init_event(struc
goto unlock;
}
- list_for_each_entry_rcu(pmu, &pmus, entry) {
+ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
goto unlock;
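The general pattern looks roughly like this (illustrative; in the real code the SRCU read lock is taken by the caller of perf_init_event()):

/* Illustrative pattern: SRCU-protected list walk with a lockdep condition
 * so CONFIG_PROVE_RCU_LIST does not report a false positive. */
int idx;

idx = srcu_read_lock(&pmus_srcu);
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
	/* ... */
}
srcu_read_unlock(&pmus_srcu, idx);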

View File

@ -0,0 +1,411 @@
From: He Zhe <zhe.he@windriver.com>
Date: Wed, 19 Dec 2018 16:30:57 +0100
Subject: [PATCH] kmemleak: Turn kmemleak_lock and object->lock to
raw_spinlock_t
kmemleak_lock as a rwlock on RT can possibly be acquired in atomic context,
which does not work on RT.
Since the kmemleak operation is performed in atomic context, make it a
raw_spinlock_t so it can also be acquired on RT. This is used for
debugging and is not enabled by default in a production-like environment
(where performance/latency matters), so it makes sense to make it a
raw_spinlock_t instead of trying to get rid of the atomic context.
Turn also the kmemleak_object->lock into raw_spinlock_t which is
acquired (nested) while the kmemleak_lock is held.
The time spent in "echo scan > kmemleak" slightly improved on 64core box
with this patch applied after boot.
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lkml.kernel.org/r/20181218150744.GB20197@arrakis.emea.arm.com
Link: https://lkml.kernel.org/r/1542877459-144382-1-git-send-email-zhe.he@windriver.com
Link: https://lkml.kernel.org/r/20190927082230.34152-1-yongxin.liu@windriver.com
Signed-off-by: He Zhe <zhe.he@windriver.com>
Signed-off-by: Liu Haitao <haitao.liu@windriver.com>
Signed-off-by: Yongxin Liu <yongxin.liu@windriver.com>
[bigeasy: Redo the description. Merge the individual bits: He Zhe did
the kmemleak_lock, Liu Haitao the ->lock and Yongxin Liu forwarded the
patch.]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/kmemleak.c | 112 +++++++++++++++++++++++++++++-----------------------------
1 file changed, 56 insertions(+), 56 deletions(-)
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -13,7 +13,7 @@
*
* The following locks and mutexes are used by kmemleak:
*
- * - kmemleak_lock (rwlock): protects the object_list modifications and
+ * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
* blocks. The object_tree_root is a red black tree used to look-up
@@ -22,13 +22,13 @@
* object_tree_root in the create_object() function called from the
* kmemleak_alloc() callback and removed in delete_object() called from the
* kmemleak_free() callback
- * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
- * the metadata (e.g. count) are protected by this lock. Note that some
- * members of this structure may be protected by other means (atomic or
- * kmemleak_lock). This lock is also held when scanning the corresponding
- * memory block to avoid the kernel freeing it via the kmemleak_free()
- * callback. This is less heavyweight than holding a global lock like
- * kmemleak_lock during scanning
+ * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
+ * Accesses to the metadata (e.g. count) are protected by this lock. Note
+ * that some members of this structure may be protected by other means
+ * (atomic or kmemleak_lock). This lock is also held when scanning the
+ * corresponding memory block to avoid the kernel freeing it via the
+ * kmemleak_free() callback. This is less heavyweight than holding a global
+ * lock like kmemleak_lock during scanning.
* - scan_mutex (mutex): ensures that only one thread may scan the memory for
* unreferenced objects at a time. The gray_list contains the objects which
* are already referenced or marked as false positives and need to be
@@ -135,7 +135,7 @@ struct kmemleak_scan_area {
* (use_count) and freed using the RCU mechanism.
*/
struct kmemleak_object {
- spinlock_t lock;
+ raw_spinlock_t lock;
unsigned int flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
@@ -191,8 +191,8 @@ static int mem_pool_free_count = ARRAY_S
static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
-/* rw_lock protecting the access to object_list and object_tree_root */
-static DEFINE_RWLOCK(kmemleak_lock);
+/* protecting the access to object_list and object_tree_root */
+static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
static struct kmem_cache *object_cache;
@@ -426,7 +426,7 @@ static struct kmemleak_object *mem_pool_
}
/* slab allocation failed, try the memory pool */
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = list_first_entry_or_null(&mem_pool_free_list,
typeof(*object), object_list);
if (object)
@@ -435,7 +435,7 @@ static struct kmemleak_object *mem_pool_
object = &mem_pool[--mem_pool_free_count];
else
pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -453,9 +453,9 @@ static void mem_pool_free(struct kmemlea
}
/* add the object to the memory pool free list */
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
list_add(&object->object_list, &mem_pool_free_list);
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -514,9 +514,9 @@ static struct kmemleak_object *find_and_
struct kmemleak_object *object;
rcu_read_lock();
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
if (object && !get_object(object))
@@ -546,11 +546,11 @@ static struct kmemleak_object *find_and_
unsigned long flags;
struct kmemleak_object *object;
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
if (object)
__remove_object(object);
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -585,7 +585,7 @@ static struct kmemleak_object *create_ob
INIT_LIST_HEAD(&object->object_list);
INIT_LIST_HEAD(&object->gray_list);
INIT_HLIST_HEAD(&object->area_list);
- spin_lock_init(&object->lock);
+ raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
@@ -617,7 +617,7 @@ static struct kmemleak_object *create_ob
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
min_addr = min(min_addr, untagged_ptr);
@@ -649,7 +649,7 @@ static struct kmemleak_object *create_ob
list_add_tail_rcu(&object->object_list, &object_list);
out:
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -667,9 +667,9 @@ static void __delete_object(struct kmeml
* Locking here also ensures that the corresponding memory block
* cannot be freed when it is being scanned.
*/
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->flags &= ~OBJECT_ALLOCATED;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -739,9 +739,9 @@ static void paint_it(struct kmemleak_obj
{
unsigned long flags;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
__paint_it(object, color);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
static void paint_ptr(unsigned long ptr, int color)
@@ -798,7 +798,7 @@ static void add_scan_area(unsigned long
if (scan_area_cache)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (!area) {
pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
/* mark the object for full scan to avoid false positives */
@@ -820,7 +820,7 @@ static void add_scan_area(unsigned long
hlist_add_head(&area->node, &object->area_list);
out_unlock:
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -842,9 +842,9 @@ static void object_set_excess_ref(unsign
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->excess_ref = excess_ref;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -864,9 +864,9 @@ static void object_no_scan(unsigned long
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->flags |= OBJECT_NO_SCAN;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1026,9 +1026,9 @@ void __ref kmemleak_update_trace(const v
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->trace_len = __save_stack_trace(object->trace);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1233,7 +1233,7 @@ static void scan_block(void *_start, voi
unsigned long flags;
unsigned long untagged_ptr;
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
@@ -1268,7 +1268,7 @@ static void scan_block(void *_start, voi
* previously acquired in scan_object(). These locks are
* enclosed by scan_mutex.
*/
- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
/* only pass surplus references (object already gray) */
if (color_gray(object)) {
excess_ref = object->excess_ref;
@@ -1277,7 +1277,7 @@ static void scan_block(void *_start, voi
excess_ref = 0;
update_refs(object);
}
- spin_unlock(&object->lock);
+ raw_spin_unlock(&object->lock);
if (excess_ref) {
object = lookup_object(excess_ref, 0);
@@ -1286,12 +1286,12 @@ static void scan_block(void *_start, voi
if (object == scanned)
/* circular reference, ignore */
continue;
- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
update_refs(object);
- spin_unlock(&object->lock);
+ raw_spin_unlock(&object->lock);
}
}
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -1324,7 +1324,7 @@ static void scan_object(struct kmemleak_
* Once the object->lock is acquired, the corresponding memory block
* cannot be freed (the same lock is acquired in delete_object).
*/
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (object->flags & OBJECT_NO_SCAN)
goto out;
if (!(object->flags & OBJECT_ALLOCATED))
@@ -1344,9 +1344,9 @@ static void scan_object(struct kmemleak_
if (start >= end)
break;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
cond_resched();
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
} while (object->flags & OBJECT_ALLOCATED);
} else
hlist_for_each_entry(area, &object->area_list, node)
@@ -1354,7 +1354,7 @@ static void scan_object(struct kmemleak_
(void *)(area->start + area->size),
object);
out:
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
/*
@@ -1407,7 +1407,7 @@ static void kmemleak_scan(void)
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
@@ -1424,7 +1424,7 @@ static void kmemleak_scan(void)
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1492,14 +1492,14 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1519,7 +1519,7 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
@@ -1529,7 +1529,7 @@ static void kmemleak_scan(void)
new_leaks++;
}
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1681,10 +1681,10 @@ static int kmemleak_seq_show(struct seq_
struct kmemleak_object *object = v;
unsigned long flags;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
print_unreferenced(seq, object);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
return 0;
}
@@ -1714,9 +1714,9 @@ static int dump_str_object_info(const ch
return -EINVAL;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
dump_object_info(object);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
return 0;
@@ -1735,11 +1735,11 @@ static void kmemleak_clear(void)
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();

View File

@ -0,0 +1,99 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 16 Jan 2020 12:00:31 +0100
Subject: [PATCH] smp: Use smp_cond_func_t as type for the conditional
function
Use a typedef for the conditional function instead of defining it each time in
the function prototype.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/smp.h | 14 +++++++-------
kernel/smp.c | 11 +++++------
kernel/up.c | 11 +++++------
3 files changed, 17 insertions(+), 19 deletions(-)
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -15,6 +15,7 @@
#include <linux/llist.h>
typedef void (*smp_call_func_t)(void *info);
+typedef bool (*smp_cond_func_t)(int cpu, void *info);
struct __call_single_data {
struct llist_node llist;
smp_call_func_t func;
@@ -49,13 +50,12 @@ void on_each_cpu_mask(const struct cpuma
* cond_func returns a positive value. This may include the local
* processor.
*/
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags);
-
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask);
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, gfp_t gfp_flags);
+
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, gfp_t gfp_flags,
+ const struct cpumask *mask);
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -680,9 +680,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* You must not call this function with disabled interrupts or
* from a hardware interrupt handler or from a bottom half handler.
*/
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask)
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, gfp_t gfp_flags,
+ const struct cpumask *mask)
{
cpumask_var_t cpus;
int cpu, ret;
@@ -714,9 +714,8 @@ void on_each_cpu_cond_mask(bool (*cond_f
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
cpu_online_mask);
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* Preemption is disabled here to make sure the cond_func is called under the
* same condtions in UP and SMP.
*/
-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags, const struct cpumask *mask)
+void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, gfp_t gfp_flags,
+ const struct cpumask *mask)
{
unsigned long flags;
@@ -84,9 +84,8 @@ void on_each_cpu_cond_mask(bool (*cond_f
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);
-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
- smp_call_func_t func, void *info, bool wait,
- gfp_t gfp_flags)
+void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
+ void *info, bool wait, gfp_t gfp_flags)
{
on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
}
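A conditional callback only has to match the new typedef; a hypothetical example (the per-CPU flag and all names are made up for illustration, and the gfp argument is still present at this point in the series):

/* Hypothetical smp_cond_func_t callback and caller. */
static DEFINE_PER_CPU(bool, example_pending);

static bool example_cpu_has_work(int cpu, void *info)
{
	return per_cpu(example_pending, cpu);
}

static void example_do_work(void *info)
{
	/* runs on each CPU for which the condition returned true */
}

static void example_kick_cpus(void)
{
	on_each_cpu_cond(example_cpu_has_work, example_do_work, NULL, true,
			 GFP_ATOMIC);
}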

View File

@ -0,0 +1,139 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 16 Jan 2020 12:14:38 +0100
Subject: [PATCH] smp: Add a smp_cond_func_t argument to
smp_call_function_many()
on_each_cpu_cond_mask() allocates a new CPU mask. The newly allocated
mask is a subset of the provided mask based on the conditional function.
This memory allocation could be avoided by extending
smp_call_function_many() with the conditional function and performing the
remote function call based on the mask and the conditional function.
Rename smp_call_function_many() to smp_call_function_many_cond() and add
the smp_cond_func_t argument. If smp_cond_func_t is provided then it is
used before invoking the function.
Provide smp_call_function_many() with cond_func set to NULL.
Let on_each_cpu_cond_mask() use smp_call_function_many_cond().
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/smp.c | 77 +++++++++++++++++++++++++++--------------------------------
1 file changed, 36 insertions(+), 41 deletions(-)
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -395,22 +395,9 @@ int smp_call_function_any(const struct c
}
EXPORT_SYMBOL_GPL(smp_call_function_any);
-/**
- * smp_call_function_many(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on (only runs on online subset).
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- * on other CPUs.
- *
- * If @wait is true, then returns once @func has returned.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler. Preemption
- * must be disabled when calling this function.
- */
-void smp_call_function_many(const struct cpumask *mask,
- smp_call_func_t func, void *info, bool wait)
+static void smp_call_function_many_cond(const struct cpumask *mask,
+ smp_call_func_t func, void *info,
+ bool wait, smp_cond_func_t cond_func)
{
struct call_function_data *cfd;
int cpu, next_cpu, this_cpu = smp_processor_id();
@@ -448,7 +435,8 @@ void smp_call_function_many(const struct
/* Fastpath: do that cpu by itself. */
if (next_cpu >= nr_cpu_ids) {
- smp_call_function_single(cpu, func, info, wait);
+ if (!cond_func || cond_func(cpu, info))
+ smp_call_function_single(cpu, func, info, wait);
return;
}
@@ -465,6 +453,9 @@ void smp_call_function_many(const struct
for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
+ if (cond_func && !cond_func(cpu, info))
+ continue;
+
csd_lock(csd);
if (wait)
csd->flags |= CSD_FLAG_SYNCHRONOUS;
@@ -486,6 +477,26 @@ void smp_call_function_many(const struct
}
}
}
+
+/**
+ * smp_call_function_many(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler. Preemption
+ * must be disabled when calling this function.
+ */
+void smp_call_function_many(const struct cpumask *mask,
+ smp_call_func_t func, void *info, bool wait)
+{
+ smp_call_function_many_cond(mask, func, info, wait, NULL);
+}
EXPORT_SYMBOL(smp_call_function_many);
/**
@@ -684,33 +695,17 @@ void on_each_cpu_cond_mask(smp_cond_func
void *info, bool wait, gfp_t gfp_flags,
const struct cpumask *mask)
{
- cpumask_var_t cpus;
- int cpu, ret;
+ int cpu = get_cpu();
- might_sleep_if(gfpflags_allow_blocking(gfp_flags));
+ smp_call_function_many_cond(mask, func, info, wait, cond_func);
+ if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) {
+ unsigned long flags;
- if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
- preempt_disable();
- for_each_cpu(cpu, mask)
- if (cond_func(cpu, info))
- __cpumask_set_cpu(cpu, cpus);
- on_each_cpu_mask(cpus, func, info, wait);
- preempt_enable();
- free_cpumask_var(cpus);
- } else {
- /*
- * No free cpumask, bother. No matter, we'll
- * just have to IPI them one by one.
- */
- preempt_disable();
- for_each_cpu(cpu, mask)
- if (cond_func(cpu, info)) {
- ret = smp_call_function_single(cpu, func,
- info, wait);
- WARN_ON_ONCE(ret);
- }
- preempt_enable();
+ local_irq_save(flags);
+ func(info);
+ local_irq_restore(flags);
}
+ put_cpu();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);

View File

@ -0,0 +1,127 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 16 Jan 2020 13:13:41 +0100
Subject: [PATCH] smp: Remove allocation mask from on_each_cpu_cond.*()
The allocation mask is no longer used by on_each_cpu_cond() and
on_each_cpu_cond_mask() and can be removed.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
arch/x86/mm/tlb.c | 2 +-
fs/buffer.c | 2 +-
include/linux/smp.h | 5 ++---
kernel/smp.c | 13 +++----------
kernel/up.c | 7 +++----
mm/slub.c | 2 +-
6 files changed, 11 insertions(+), 20 deletions(-)
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -708,7 +708,7 @@ void native_flush_tlb_others(const struc
(void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
- (void *)info, 1, GFP_ATOMIC, cpumask);
+ (void *)info, 1, cpumask);
}
/*
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1387,7 +1387,7 @@ static bool has_bh_in_lru(int cpu, void
void invalidate_bh_lrus(void)
{
- on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
+ on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -51,11 +51,10 @@ void on_each_cpu_mask(const struct cpuma
* processor.
*/
void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait, gfp_t gfp_flags);
+ void *info, bool wait);
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait, gfp_t gfp_flags,
- const struct cpumask *mask);
+ void *info, bool wait, const struct cpumask *mask);
int smp_call_function_single_async(int cpu, call_single_data_t *csd);
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -679,11 +679,6 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* @info: An arbitrary pointer to pass to both functions.
* @wait: If true, wait (atomically) until function has
* completed on other CPUs.
- * @gfp_flags: GFP flags to use when allocating the cpumask
- * used internally by the function.
- *
- * The function might sleep if the GFP flags indicates a non
- * atomic allocation is allowed.
*
* Preemption is disabled to protect against CPUs going offline but not online.
* CPUs going online during the call will not be seen or sent an IPI.
@@ -692,8 +687,7 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* from a hardware interrupt handler or from a bottom half handler.
*/
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait, gfp_t gfp_flags,
- const struct cpumask *mask)
+ void *info, bool wait, const struct cpumask *mask)
{
int cpu = get_cpu();
@@ -710,10 +704,9 @@ void on_each_cpu_cond_mask(smp_cond_func
EXPORT_SYMBOL(on_each_cpu_cond_mask);
void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait, gfp_t gfp_flags)
+ void *info, bool wait)
{
- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
- cpu_online_mask);
+ on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask);
}
EXPORT_SYMBOL(on_each_cpu_cond);
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -69,8 +69,7 @@ EXPORT_SYMBOL(on_each_cpu_mask);
* same condtions in UP and SMP.
*/
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait, gfp_t gfp_flags,
- const struct cpumask *mask)
+ void *info, bool wait, const struct cpumask *mask)
{
unsigned long flags;
@@ -85,9 +84,9 @@ void on_each_cpu_cond_mask(smp_cond_func
EXPORT_SYMBOL(on_each_cpu_cond_mask);
void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func,
- void *info, bool wait, gfp_t gfp_flags)
+ void *info, bool wait)
{
- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
+ on_each_cpu_cond_mask(cond_func, func, info, wait, NULL);
}
EXPORT_SYMBOL(on_each_cpu_cond);
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2338,7 +2338,7 @@ static bool has_cpu_slab(int cpu, void *
static void flush_all(struct kmem_cache *s)
{
- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
+ on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
}
/*

View File

@ -0,0 +1,35 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 21 Feb 2020 18:57:11 +0100
Subject: [PATCH] drm/vmwgfx: Drop preempt_disable() in
vmw_fifo_ping_host()
vmw_fifo_ping_host() disables preemption around a test and a register
write via vmw_write(). The write function acquires a spinlock_t typed
lock which is not allowed in a preempt_disable()ed section on
PREEMPT_RT. This has been reported in the bugzilla.
It has been explained by Thomas Hellstrom that this preempt_disable()ed
section is not required for correctness.
Remove the preempt_disable() section.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=206591
Link: https://lkml.kernel.org/r/0b5e1c65d89951de993deab06d1d197b40fd67aa.camel@vmware.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 --
1 file changed, 2 deletions(-)
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c
@@ -169,10 +169,8 @@ void vmw_fifo_ping_host(struct vmw_priva
{
u32 *fifo_mem = dev_priv->mmio_virt;
- preempt_disable();
if (cmpxchg(fifo_mem + SVGA_FIFO_BUSY, 0, 1) == 0)
vmw_write(dev_priv, SVGA_REG_SYNC, reason);
- preempt_enable();
}
void vmw_fifo_release(struct vmw_private *dev_priv, struct vmw_fifo_state *fifo)

View File

@ -0,0 +1,32 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 3 Mar 2020 13:43:25 +0100
Subject: [PATCH] mm/compaction: Really limit compact_unevictable_allowed to 0…1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The proc file `compact_unevictable_allowed' should allow 0 and 1 only,
the `extra*' attributes have been set properly, but without
proc_dointvec_minmax() as the `proc_handler' the limit will not be
enforced.
Use proc_dointvec_minmax() as the `proc_handler' to enforce the valid
specified range.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/sysctl.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1493,7 +1493,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_compact_unevictable_allowed,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},

View File

@ -0,0 +1,102 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Nov 2019 12:55:47 +0100
Subject: [PATCH] mm/compaction: Disable compact_unevictable_allowed on RT
Since commit
5bbe3547aa3ba ("mm: allow compaction of unevictable pages")
it is allowed to examine mlocked pages and compact them by default.
On -RT even minor pagefaults are problematic because it may take a few
100us to resolve them and until then the task is blocked.
Make compact_unevictable_allowed = 0 default and issue a warning on RT
if it is changed.
Link: https://lore.kernel.org/linux-mm/20190710144138.qyn4tuttdq6h7kqx@linutronix.de/
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
Documentation/admin-guide/sysctl/vm.rst | 3 +++
kernel/sysctl.c | 29 ++++++++++++++++++++++++++++-
mm/compaction.c | 4 ++++
3 files changed, 35 insertions(+), 1 deletion(-)
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -128,6 +128,9 @@ allowed to examine the unevictable lru (
This should be used on systems where stalls for minor page faults are an
acceptable trade for large contiguous free memory. Set to 0 to prevent
compaction from moving pages that are unevictable. Default value is 1.
+On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due
+to compaction, which would block the task from becomming active until the fault
+is resolved.
dirty_background_bytes
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -212,6 +212,11 @@ static int proc_do_cad_pid(struct ctl_ta
void __user *buffer, size_t *lenp, loff_t *ppos);
static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_COMPACTION
+static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+ int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos);
+#endif
#endif
#ifdef CONFIG_PRINTK
@@ -1493,7 +1498,7 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_compact_unevictable_allowed,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dointvec_minmax_warn_RT_change,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
@@ -2581,6 +2586,28 @@ int proc_dointvec(struct ctl_table *tabl
return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
}
+#ifdef CONFIG_COMPACTION
+static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
+ int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret, old;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ old = *(int *)table->data;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+ if (old != *(int *)table->data)
+ pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
+ table->procname, current->comm,
+ task_pid_nr(current));
+ return ret;
+}
+#endif
+
/**
* proc_douintvec - read a vector of unsigned integers
* @table: the sysctl table
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1590,7 +1590,11 @@ typedef enum {
* Allow userspace to control policy on scanning the unevictable LRU for
* compactable pages.
*/
+#ifdef CONFIG_PREEMPT_RT
+int sysctl_compact_unevictable_allowed __read_mostly = 0;
+#else
int sysctl_compact_unevictable_allowed __read_mostly = 1;
+#endif
static inline void
update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)

File diff suppressed because it is too large

View File

@ -0,0 +1,35 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 11 Jun 2019 11:21:02 +0200
Subject: [PATCH 1/4] workqueue: Don't assume that the callback has interrupts
disabled
Due to the TIMER_IRQSAFE flag, the timer callback is invoked with
disabled interrupts. On -RT the callback is invoked in softirq context
with enabled interrupts. Since the interrupts are threaded, there are
no in_irq() users. The local_bh_disable() around the threaded
handler ensures that there is either a timer or a threaded handler
active on the CPU.
Disable interrupts before __queue_work() is invoked from the timer
callback.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/workqueue.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1614,9 +1614,11 @@ EXPORT_SYMBOL_GPL(queue_work_node);
void delayed_work_timer_fn(struct timer_list *t)
{
struct delayed_work *dwork = from_timer(dwork, t, timer);
+ unsigned long flags;
- /* should have been called from irqsafe timer with irq already off */
+ local_irq_save(flags);
__queue_work(dwork->cpu, dwork->wq, &dwork->work);
+ local_irq_restore(flags);
}
EXPORT_SYMBOL(delayed_work_timer_fn);

View File

@ -0,0 +1,33 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 22 May 2019 12:42:26 +0200
Subject: [PATCH 2/4] sched/swait: Add swait_event_lock_irq()
The swait_event_lock_irq() is inspired by wait_event_lock_irq(). This is
required by the workqueue code once it switches to swait.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/swait.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -297,4 +297,18 @@ do { \
__ret; \
})
+#define __swait_event_lock_irq(wq, condition, lock, cmd) \
+ ___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
+ raw_spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
+ raw_spin_lock_irq(&lock))
+
+#define swait_event_lock_irq(wq_head, condition, lock) \
+ do { \
+ if (condition) \
+ break; \
+ __swait_event_lock_irq(wq_head, condition, lock, ); \
+ } while (0)
+
#endif /* _LINUX_SWAIT_H */
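A hedged usage sketch with hypothetical names (the real user is the workqueue conversion in patch 3/4): the caller holds a raw_spinlock_t with interrupts disabled; the macro drops the lock around schedule() and re-takes it before re-checking the condition.

/* Hypothetical usage sketch of swait_event_lock_irq(). */
static DEFINE_RAW_SPINLOCK(example_lock);
static DECLARE_SWAIT_QUEUE_HEAD(example_wait);
static bool example_done;

static void example_waiter(void)
{
	raw_spin_lock_irq(&example_lock);
	swait_event_lock_irq(example_wait, example_done, example_lock);
	/* example_done is true here; example_lock held, IRQs disabled */
	raw_spin_unlock_irq(&example_lock);
}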

View File

@ -0,0 +1,53 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 11 Jun 2019 11:21:09 +0200
Subject: [PATCH 3/4] workqueue: Use swait for wq_manager_wait
In order for the workqueue code to use raw_spinlock_t typed locking, no
spinlock_t typed lock may be acquired underneath it. A wait_queue_head uses
a spinlock_t lock for its list protection.
Use a swait based queue head to avoid raw_spinlock_t -> spinlock_t
locking.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/workqueue.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -50,6 +50,7 @@
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/nmi.h>
+#include <linux/swait.h>
#include "workqueue_internal.h"
@@ -301,7 +302,7 @@ static struct workqueue_attrs *wq_update
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
-static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
+static DECLARE_SWAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
@@ -2146,7 +2147,7 @@ static bool manage_workers(struct worker
pool->manager = NULL;
pool->flags &= ~POOL_MANAGER_ACTIVE;
- wake_up(&wq_manager_wait);
+ swake_up_one(&wq_manager_wait);
return true;
}
@@ -3547,7 +3548,7 @@ static void put_unbound_pool(struct work
* manager and @pool gets freed with the flag set.
*/
spin_lock_irq(&pool->lock);
- wait_event_lock_irq(wq_manager_wait,
+ swait_event_lock_irq(wq_manager_wait,
!(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
pool->flags |= POOL_MANAGER_ACTIVE;

View File

@ -0,0 +1,680 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 22 May 2019 12:43:56 +0200
Subject: [PATCH 4/4] workqueue: Convert the locks to raw type
After all the workqueue and the timer rework, we can finally make the
worker_pool lock raw.
The lock is not held over an unbounded period of time/iterations.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/workqueue.c | 168 ++++++++++++++++++++++++++---------------------------
1 file changed, 84 insertions(+), 84 deletions(-)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -146,7 +146,7 @@ enum {
/* struct worker is defined in workqueue_internal.h */
struct worker_pool {
- spinlock_t lock; /* the pool lock */
+ raw_spinlock_t lock; /* the pool lock */
int cpu; /* I: the associated cpu */
int node; /* I: the associated node ID */
int id; /* I: pool ID */
@@ -301,7 +301,7 @@ static struct workqueue_attrs *wq_update
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
-static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+static DEFINE_RAW_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
static DECLARE_SWAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
@@ -833,7 +833,7 @@ static struct worker *first_idle_worker(
* Wake up the first idle worker of @pool.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void wake_up_worker(struct worker_pool *pool)
{
@@ -886,7 +886,7 @@ void wq_worker_sleeping(struct task_stru
return;
worker->sleeping = 1;
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* The counterpart of the following dec_and_test, implied mb,
@@ -905,7 +905,7 @@ void wq_worker_sleeping(struct task_stru
if (next)
wake_up_process(next->task);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
/**
@@ -916,7 +916,7 @@ void wq_worker_sleeping(struct task_stru
* the scheduler to get a worker's last known identity.
*
* CONTEXT:
- * spin_lock_irq(rq->lock)
+ * raw_spin_lock_irq(rq->lock)
*
* This function is called during schedule() when a kworker is going
* to sleep. It's used by psi to identify aggregation workers during
@@ -947,7 +947,7 @@ work_func_t wq_worker_last_func(struct t
* Set @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(pool->lock)
+ * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
@@ -972,7 +972,7 @@ static inline void worker_set_flags(stru
* Clear @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
- * spin_lock_irq(pool->lock)
+ * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
@@ -1020,7 +1020,7 @@ static inline void worker_clr_flags(stru
* actually occurs, it should be easy to locate the culprit work function.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*
* Return:
* Pointer to worker which is executing @work if found, %NULL
@@ -1055,7 +1055,7 @@ static struct worker *find_worker_execut
* nested inside outer list_for_each_entry_safe().
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void move_linked_works(struct work_struct *work, struct list_head *head,
struct work_struct **nextp)
@@ -1133,9 +1133,9 @@ static void put_pwq_unlocked(struct pool
* As both pwqs and pools are RCU protected, the
* following lock operations are safe.
*/
- spin_lock_irq(&pwq->pool->lock);
+ raw_spin_lock_irq(&pwq->pool->lock);
put_pwq(pwq);
- spin_unlock_irq(&pwq->pool->lock);
+ raw_spin_unlock_irq(&pwq->pool->lock);
}
}
@@ -1168,7 +1168,7 @@ static void pwq_activate_first_delayed(s
* decrement nr_in_flight of its pwq and handle workqueue flushing.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
@@ -1267,7 +1267,7 @@ static int try_to_grab_pending(struct wo
if (!pool)
goto fail;
- spin_lock(&pool->lock);
+ raw_spin_lock(&pool->lock);
/*
* work->data is guaranteed to point to pwq only while the work
* item is queued on pwq->wq, and both updating work->data to point
@@ -1296,11 +1296,11 @@ static int try_to_grab_pending(struct wo
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
- spin_unlock(&pool->lock);
+ raw_spin_unlock(&pool->lock);
rcu_read_unlock();
return 1;
}
- spin_unlock(&pool->lock);
+ raw_spin_unlock(&pool->lock);
fail:
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1321,7 +1321,7 @@ static int try_to_grab_pending(struct wo
* work_struct flags.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct list_head *head, unsigned int extra_flags)
@@ -1438,7 +1438,7 @@ static void __queue_work(int cpu, struct
if (last_pool && last_pool != pwq->pool) {
struct worker *worker;
- spin_lock(&last_pool->lock);
+ raw_spin_lock(&last_pool->lock);
worker = find_worker_executing_work(last_pool, work);
@@ -1446,11 +1446,11 @@ static void __queue_work(int cpu, struct
pwq = worker->current_pwq;
} else {
/* meh... not running there, queue here */
- spin_unlock(&last_pool->lock);
- spin_lock(&pwq->pool->lock);
+ raw_spin_unlock(&last_pool->lock);
+ raw_spin_lock(&pwq->pool->lock);
}
} else {
- spin_lock(&pwq->pool->lock);
+ raw_spin_lock(&pwq->pool->lock);
}
/*
@@ -1463,7 +1463,7 @@ static void __queue_work(int cpu, struct
*/
if (unlikely(!pwq->refcnt)) {
if (wq->flags & WQ_UNBOUND) {
- spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pwq->pool->lock);
cpu_relax();
goto retry;
}
@@ -1495,7 +1495,7 @@ static void __queue_work(int cpu, struct
insert_work(pwq, work, worklist, work_flags);
out:
- spin_unlock(&pwq->pool->lock);
+ raw_spin_unlock(&pwq->pool->lock);
rcu_read_unlock();
}
@@ -1766,7 +1766,7 @@ EXPORT_SYMBOL(queue_rcu_work);
* necessary.
*
* LOCKING:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void worker_enter_idle(struct worker *worker)
{
@@ -1806,7 +1806,7 @@ static void worker_enter_idle(struct wor
* @worker is leaving idle state. Update stats.
*
* LOCKING:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void worker_leave_idle(struct worker *worker)
{
@@ -1944,11 +1944,11 @@ static struct worker *create_worker(stru
worker_attach_to_pool(worker, pool);
/* start the newly created worker */
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker->task);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
return worker;
@@ -1967,7 +1967,7 @@ static struct worker *create_worker(stru
* be idle.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void destroy_worker(struct worker *worker)
{
@@ -1993,7 +1993,7 @@ static void idle_worker_timeout(struct t
{
struct worker_pool *pool = from_timer(pool, t, idle_timer);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
while (too_many_workers(pool)) {
struct worker *worker;
@@ -2011,7 +2011,7 @@ static void idle_worker_timeout(struct t
destroy_worker(worker);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
static void send_mayday(struct work_struct *work)
@@ -2042,8 +2042,8 @@ static void pool_mayday_timeout(struct t
struct worker_pool *pool = from_timer(pool, t, mayday_timer);
struct work_struct *work;
- spin_lock_irq(&pool->lock);
- spin_lock(&wq_mayday_lock); /* for wq->maydays */
+ raw_spin_lock_irq(&pool->lock);
+ raw_spin_lock(&wq_mayday_lock); /* for wq->maydays */
if (need_to_create_worker(pool)) {
/*
@@ -2056,8 +2056,8 @@ static void pool_mayday_timeout(struct t
send_mayday(work);
}
- spin_unlock(&wq_mayday_lock);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock(&wq_mayday_lock);
+ raw_spin_unlock_irq(&pool->lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -2076,7 +2076,7 @@ static void pool_mayday_timeout(struct t
* may_start_working() %true.
*
* LOCKING:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations. Called only from
* manager.
*/
@@ -2085,7 +2085,7 @@ static void maybe_create_worker(struct w
__acquires(&pool->lock)
{
restart:
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
/* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
@@ -2101,7 +2101,7 @@ static void maybe_create_worker(struct w
}
del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* This is necessary even after a new worker was just successfully
* created as @pool->lock was dropped and the new worker might have
@@ -2124,7 +2124,7 @@ static void maybe_create_worker(struct w
* and may_start_working() is true.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times. Does GFP_KERNEL allocations.
*
* Return:
@@ -2163,7 +2163,7 @@ static bool manage_workers(struct worker
* call this function to process a work.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which is released and regrabbed.
+ * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
*/
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
@@ -2245,7 +2245,7 @@ static void process_one_work(struct work
*/
set_work_pool_and_clear_pending(work, pool->id);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
lock_map_acquire(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
@@ -2300,7 +2300,7 @@ static void process_one_work(struct work
*/
cond_resched();
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
if (unlikely(cpu_intensive))
@@ -2326,7 +2326,7 @@ static void process_one_work(struct work
* fetches a work from the top and executes it.
*
* CONTEXT:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
+ * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
* multiple times.
*/
static void process_scheduled_works(struct worker *worker)
@@ -2368,11 +2368,11 @@ static int worker_thread(void *__worker)
/* tell the scheduler that this is a workqueue worker */
set_pf_worker(true);
woke_up:
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* am I supposed to die? */
if (unlikely(worker->flags & WORKER_DIE)) {
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
WARN_ON_ONCE(!list_empty(&worker->entry));
set_pf_worker(false);
@@ -2438,7 +2438,7 @@ static int worker_thread(void *__worker)
*/
worker_enter_idle(worker);
__set_current_state(TASK_IDLE);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
schedule();
goto woke_up;
}
@@ -2492,7 +2492,7 @@ static int rescuer_thread(void *__rescue
should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
- spin_lock_irq(&wq_mayday_lock);
+ raw_spin_lock_irq(&wq_mayday_lock);
while (!list_empty(&wq->maydays)) {
struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
@@ -2504,11 +2504,11 @@ static int rescuer_thread(void *__rescue
__set_current_state(TASK_RUNNING);
list_del_init(&pwq->mayday_node);
- spin_unlock_irq(&wq_mayday_lock);
+ raw_spin_unlock_irq(&wq_mayday_lock);
worker_attach_to_pool(rescuer, pool);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* Slurp in all works issued via this workqueue and
@@ -2537,7 +2537,7 @@ static int rescuer_thread(void *__rescue
* incur MAYDAY_INTERVAL delay inbetween.
*/
if (need_to_create_worker(pool)) {
- spin_lock(&wq_mayday_lock);
+ raw_spin_lock(&wq_mayday_lock);
/*
* Queue iff we aren't racing destruction
* and somebody else hasn't queued it already.
@@ -2546,7 +2546,7 @@ static int rescuer_thread(void *__rescue
get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
}
- spin_unlock(&wq_mayday_lock);
+ raw_spin_unlock(&wq_mayday_lock);
}
}
@@ -2564,14 +2564,14 @@ static int rescuer_thread(void *__rescue
if (need_more_worker(pool))
wake_up_worker(pool);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
worker_detach_from_pool(rescuer);
- spin_lock_irq(&wq_mayday_lock);
+ raw_spin_lock_irq(&wq_mayday_lock);
}
- spin_unlock_irq(&wq_mayday_lock);
+ raw_spin_unlock_irq(&wq_mayday_lock);
if (should_stop) {
__set_current_state(TASK_RUNNING);
@@ -2651,7 +2651,7 @@ static void wq_barrier_func(struct work_
* underneath us, so we can't reliably determine pwq from @target.
*
* CONTEXT:
- * spin_lock_irq(pool->lock).
+ * raw_spin_lock_irq(pool->lock).
*/
static void insert_wq_barrier(struct pool_workqueue *pwq,
struct wq_barrier *barr,
@@ -2738,7 +2738,7 @@ static bool flush_workqueue_prep_pwqs(st
for_each_pwq(pwq, wq) {
struct worker_pool *pool = pwq->pool;
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
if (flush_color >= 0) {
WARN_ON_ONCE(pwq->flush_color != -1);
@@ -2755,7 +2755,7 @@ static bool flush_workqueue_prep_pwqs(st
pwq->work_color = work_color;
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
@@ -2955,9 +2955,9 @@ void drain_workqueue(struct workqueue_st
for_each_pwq(pwq, wq) {
bool drained;
- spin_lock_irq(&pwq->pool->lock);
+ raw_spin_lock_irq(&pwq->pool->lock);
drained = !pwq->nr_active && list_empty(&pwq->delayed_works);
- spin_unlock_irq(&pwq->pool->lock);
+ raw_spin_unlock_irq(&pwq->pool->lock);
if (drained)
continue;
@@ -2993,7 +2993,7 @@ static bool start_flush_work(struct work
return false;
}
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/* see the comment in try_to_grab_pending() with the same code */
pwq = get_work_pwq(work);
if (pwq) {
@@ -3009,7 +3009,7 @@ static bool start_flush_work(struct work
check_flush_dependency(pwq->wq, work);
insert_wq_barrier(pwq, barr, work, worker);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
/*
* Force a lock recursion deadlock when using flush_work() inside a
@@ -3028,7 +3028,7 @@ static bool start_flush_work(struct work
rcu_read_unlock();
return true;
already_gone:
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
rcu_read_unlock();
return false;
}
@@ -3421,7 +3421,7 @@ static bool wqattrs_equal(const struct w
*/
static int init_worker_pool(struct worker_pool *pool)
{
- spin_lock_init(&pool->lock);
+ raw_spin_lock_init(&pool->lock);
pool->id = -1;
pool->cpu = -1;
pool->node = NUMA_NO_NODE;
@@ -3547,7 +3547,7 @@ static void put_unbound_pool(struct work
* @pool's workers from blocking on attach_mutex. We're the last
* manager and @pool gets freed with the flag set.
*/
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
swait_event_lock_irq(wq_manager_wait,
!(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
pool->flags |= POOL_MANAGER_ACTIVE;
@@ -3555,7 +3555,7 @@ static void put_unbound_pool(struct work
while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
mutex_lock(&wq_pool_attach_mutex);
if (!list_empty(&pool->workers))
@@ -3711,7 +3711,7 @@ static void pwq_adjust_max_active(struct
return;
/* this function can be called during early boot w/ irq disabled */
- spin_lock_irqsave(&pwq->pool->lock, flags);
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
/*
* During [un]freezing, the caller is responsible for ensuring that
@@ -3734,7 +3734,7 @@ static void pwq_adjust_max_active(struct
pwq->max_active = 0;
}
- spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
}
/* initialize newly alloced @pwq which is associated with @wq and @pool */
@@ -4136,9 +4136,9 @@ static void wq_update_unbound_numa(struc
use_dfl_pwq:
mutex_lock(&wq->mutex);
- spin_lock_irq(&wq->dfl_pwq->pool->lock);
+ raw_spin_lock_irq(&wq->dfl_pwq->pool->lock);
get_pwq(wq->dfl_pwq);
- spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+ raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock);
old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
out_unlock:
mutex_unlock(&wq->mutex);
@@ -4351,9 +4351,9 @@ void destroy_workqueue(struct workqueue_
struct worker *rescuer = wq->rescuer;
/* this prevents new queueing */
- spin_lock_irq(&wq_mayday_lock);
+ raw_spin_lock_irq(&wq_mayday_lock);
wq->rescuer = NULL;
- spin_unlock_irq(&wq_mayday_lock);
+ raw_spin_unlock_irq(&wq_mayday_lock);
/* rescuer will empty maydays list before exiting */
kthread_stop(rescuer->task);
@@ -4549,10 +4549,10 @@ unsigned int work_busy(struct work_struc
rcu_read_lock();
pool = get_work_pool(work);
if (pool) {
- spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, flags);
if (find_worker_executing_work(pool, work))
ret |= WORK_BUSY_RUNNING;
- spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
}
rcu_read_unlock();
@@ -4759,10 +4759,10 @@ void show_workqueue_state(void)
pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
for_each_pwq(pwq, wq) {
- spin_lock_irqsave(&pwq->pool->lock, flags);
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
if (pwq->nr_active || !list_empty(&pwq->delayed_works))
show_pwq(pwq);
- spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_workqueue_state(). Avoid triggering
@@ -4776,7 +4776,7 @@ void show_workqueue_state(void)
struct worker *worker;
bool first = true;
- spin_lock_irqsave(&pool->lock, flags);
+ raw_spin_lock_irqsave(&pool->lock, flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
@@ -4795,7 +4795,7 @@ void show_workqueue_state(void)
}
pr_cont("\n");
next_pool:
- spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
* sysrq-t -> show_workqueue_state(). Avoid triggering
@@ -4825,7 +4825,7 @@ void wq_worker_comm(char *buf, size_t si
struct worker_pool *pool = worker->pool;
if (pool) {
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* ->desc tracks information (wq name or
* set_worker_desc()) for the latest execution. If
@@ -4839,7 +4839,7 @@ void wq_worker_comm(char *buf, size_t si
scnprintf(buf + off, size - off, "-%s",
worker->desc);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
}
@@ -4870,7 +4870,7 @@ static void unbind_workers(int cpu)
for_each_cpu_worker_pool(pool, cpu) {
mutex_lock(&wq_pool_attach_mutex);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
/*
* We've blocked all attach/detach operations. Make all workers
@@ -4884,7 +4884,7 @@ static void unbind_workers(int cpu)
pool->flags |= POOL_DISASSOCIATED;
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
mutex_unlock(&wq_pool_attach_mutex);
/*
@@ -4910,9 +4910,9 @@ static void unbind_workers(int cpu)
* worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items.
*/
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
wake_up_worker(pool);
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
}
@@ -4939,7 +4939,7 @@ static void rebind_workers(struct worker
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
- spin_lock_irq(&pool->lock);
+ raw_spin_lock_irq(&pool->lock);
pool->flags &= ~POOL_DISASSOCIATED;
@@ -4978,7 +4978,7 @@ static void rebind_workers(struct worker
WRITE_ONCE(worker->flags, worker_flags);
}
- spin_unlock_irq(&pool->lock);
+ raw_spin_unlock_irq(&pool->lock);
}
/**

View File

@ -0,0 +1,116 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 15 Aug 2019 18:14:16 +0200
Subject: [PATCH 1/4] cgroup: Remove ->css_rstat_flush()
I was looking at the lifetime of the ->css_rstat_flush() callback to see if
cgroup_rstat_cpu_lock should remain a raw_spinlock_t. I didn't find any
users; the callback has been unused since it was introduced in commit
8f53470bab042 ("cgroup: Add cgroup_subsys->css_rstat_flush()").
Remove the css_rstat_flush callback because it has no users.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/cgroup-defs.h | 5 -----
kernel/cgroup/cgroup.c | 12 ------------
kernel/cgroup/rstat.c | 10 +---------
3 files changed, 1 insertion(+), 26 deletions(-)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -144,9 +144,6 @@ struct cgroup_subsys_state {
struct list_head sibling;
struct list_head children;
- /* flush target list anchored at cgrp->rstat_css_list */
- struct list_head rstat_css_node;
-
/*
* PI: Subsys-unique ID. 0 is unused and root is always 1. The
* matching css can be looked up using css_from_id().
@@ -455,7 +452,6 @@ struct cgroup {
/* per-cpu recursive resource statistics */
struct cgroup_rstat_cpu __percpu *rstat_cpu;
- struct list_head rstat_css_list;
/* cgroup basic resource statistics */
struct cgroup_base_stat pending_bstat; /* pending from children */
@@ -633,7 +629,6 @@ struct cgroup_subsys {
void (*css_released)(struct cgroup_subsys_state *css);
void (*css_free)(struct cgroup_subsys_state *css);
void (*css_reset)(struct cgroup_subsys_state *css);
- void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu);
int (*css_extra_stat_show)(struct seq_file *seq,
struct cgroup_subsys_state *css);
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1957,7 +1957,6 @@ static void init_cgroup_housekeeping(str
cgrp->dom_cgrp = cgrp;
cgrp->max_descendants = INT_MAX;
cgrp->max_depth = INT_MAX;
- INIT_LIST_HEAD(&cgrp->rstat_css_list);
prev_cputime_init(&cgrp->prev_cputime);
for_each_subsys(ss, ssid)
@@ -5027,12 +5026,6 @@ static void css_release_work_fn(struct w
list_del_rcu(&css->sibling);
if (ss) {
- /* css release path */
- if (!list_empty(&css->rstat_css_node)) {
- cgroup_rstat_flush(cgrp);
- list_del_rcu(&css->rstat_css_node);
- }
-
cgroup_idr_replace(&ss->css_idr, NULL, css->id);
if (ss->css_released)
ss->css_released(css);
@@ -5094,7 +5087,6 @@ static void init_and_link_css(struct cgr
css->id = -1;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
- INIT_LIST_HEAD(&css->rstat_css_node);
css->serial_nr = css_serial_nr_next++;
atomic_set(&css->online_cnt, 0);
@@ -5103,9 +5095,6 @@ static void init_and_link_css(struct cgr
css_get(css->parent);
}
- if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
- list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
-
BUG_ON(cgroup_css(cgrp, ss));
}
@@ -5207,7 +5196,6 @@ static struct cgroup_subsys_state *css_c
err_list_del:
list_del_rcu(&css->sibling);
err_free_css:
- list_del_rcu(&css->rstat_css_node);
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
return ERR_PTR(err);
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -162,17 +162,9 @@ static void cgroup_rstat_flush_locked(st
struct cgroup *pos = NULL;
raw_spin_lock(cpu_lock);
- while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
- struct cgroup_subsys_state *css;
-
+ while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu)))
cgroup_base_stat_flush(pos, cpu);
- rcu_read_lock();
- list_for_each_entry_rcu(css, &pos->rstat_css_list,
- rstat_css_node)
- css->ss->css_rstat_flush(css, cpu);
- rcu_read_unlock();
- }
raw_spin_unlock(cpu_lock);
/* if @may_sleep, play nice and yield if necessary */

View File

@ -0,0 +1,68 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 16 Aug 2019 12:20:42 +0200
Subject: [PATCH 2/4] cgroup: Consolidate users of cgroup_rstat_lock.
cgroup_rstat_flush_irqsafe() has no users; remove it.
cgroup_rstat_flush_hold() and cgroup_rstat_flush_release() are only used within
this file. Make them static.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/cgroup.h | 3 ---
kernel/cgroup/rstat.c | 19 ++-----------------
2 files changed, 2 insertions(+), 20 deletions(-)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -751,9 +751,6 @@ static inline void cgroup_path_from_kern
*/
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp);
-void cgroup_rstat_flush_hold(struct cgroup *cgrp);
-void cgroup_rstat_flush_release(void);
/*
* Basic resource stats.
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -201,21 +201,6 @@ void cgroup_rstat_flush(struct cgroup *c
}
/**
- * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
- * @cgrp: target cgroup
- *
- * This function can be called from any context.
- */
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&cgroup_rstat_lock, flags);
- cgroup_rstat_flush_locked(cgrp, false);
- spin_unlock_irqrestore(&cgroup_rstat_lock, flags);
-}
-
-/**
* cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
@@ -224,7 +209,7 @@ void cgroup_rstat_flush_irqsafe(struct c
*
* This function may block.
*/
-void cgroup_rstat_flush_hold(struct cgroup *cgrp)
+static void cgroup_rstat_flush_hold(struct cgroup *cgrp)
__acquires(&cgroup_rstat_lock)
{
might_sleep();
@@ -235,7 +220,7 @@ void cgroup_rstat_flush_hold(struct cgro
/**
* cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
*/
-void cgroup_rstat_flush_release(void)
+static void cgroup_rstat_flush_release(void)
__releases(&cgroup_rstat_lock)
{
spin_unlock_irq(&cgroup_rstat_lock);

View File

@ -0,0 +1,55 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 16 Aug 2019 12:25:35 +0200
Subject: [PATCH 3/4] cgroup: Remove `may_sleep' from
cgroup_rstat_flush_locked()
cgroup_rstat_flush_locked() is always invoked with `may_sleep' set to
true, so this case can be made the default and the parameter removed.
Remove the `may_sleep' parameter.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
kernel/cgroup/rstat.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -149,7 +149,7 @@ static struct cgroup *cgroup_rstat_cpu_p
}
/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
+static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
{
int cpu;
@@ -167,9 +167,7 @@ static void cgroup_rstat_flush_locked(st
raw_spin_unlock(cpu_lock);
- /* if @may_sleep, play nice and yield if necessary */
- if (may_sleep && (need_resched() ||
- spin_needbreak(&cgroup_rstat_lock))) {
+ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
spin_unlock_irq(&cgroup_rstat_lock);
if (!cond_resched())
cpu_relax();
@@ -196,7 +194,7 @@ void cgroup_rstat_flush(struct cgroup *c
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
+ cgroup_rstat_flush_locked(cgrp);
spin_unlock_irq(&cgroup_rstat_lock);
}
@@ -214,7 +212,7 @@ static void cgroup_rstat_flush_hold(stru
{
might_sleep();
spin_lock_irq(&cgroup_rstat_lock);
- cgroup_rstat_flush_locked(cgrp, true);
+ cgroup_rstat_flush_locked(cgrp);
}
/**

View File

@ -0,0 +1,71 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 16 Aug 2019 12:49:36 +0200
Subject: [PATCH 4/4] cgroup: Acquire cgroup_rstat_lock with enabled interrupts
There is no need to disable interrupts while cgroup_rstat_lock is
acquired. The lock is never used in IRQ context, so a simple spin_lock()
is enough for synchronisation purposes.
Acquire cgroup_rstat_lock without disabling interrupts and ensure that
cgroup_rstat_cpu_lock is acquired with interrupts disabled (this one is
taken in IRQ context).
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
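The rule behind this change, shown as a hypothetical sketch (not taken from the patch): interrupts only need to be disabled around a lock that can also be taken from interrupt context. cgroup_rstat_lock never is, so spin_lock() suffices; the per-CPU cgroup_rstat_cpu_lock is, so it keeps the _irq variant.

/* Hypothetical example module illustrating the locking rule. */
#include <linux/module.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(flush_lock);		/* only taken from process context */
static DEFINE_RAW_SPINLOCK(percpu_lock);	/* also taken from interrupt context */

static unsigned long slow_stats, fast_stats;

static void flush_stats(void)
{
	unsigned long flags;

	spin_lock(&flush_lock);			/* no _irq: never used from IRQ context */
	slow_stats++;

	raw_spin_lock_irqsave(&percpu_lock, flags);	/* IRQs off: an interrupt may take this lock */
	fast_stats++;
	raw_spin_unlock_irqrestore(&percpu_lock, flags);

	spin_unlock(&flush_lock);
}

static int __init lockrule_demo_init(void)
{
	flush_stats();
	pr_info("stats: %lu %lu\n", slow_stats, fast_stats);
	return 0;
}

static void __exit lockrule_demo_exit(void)
{
}

module_init(lockrule_demo_init);
module_exit(lockrule_demo_exit);
MODULE_LICENSE("GPL");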
---
kernel/cgroup/rstat.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -161,17 +161,17 @@ static void cgroup_rstat_flush_locked(st
cpu);
struct cgroup *pos = NULL;
- raw_spin_lock(cpu_lock);
+ raw_spin_lock_irq(cpu_lock);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu)))
cgroup_base_stat_flush(pos, cpu);
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irq(cpu_lock);
if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
- spin_unlock_irq(&cgroup_rstat_lock);
+ spin_unlock(&cgroup_rstat_lock);
if (!cond_resched())
cpu_relax();
- spin_lock_irq(&cgroup_rstat_lock);
+ spin_lock(&cgroup_rstat_lock);
}
}
}
@@ -193,9 +193,9 @@ void cgroup_rstat_flush(struct cgroup *c
{
might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
+ spin_lock(&cgroup_rstat_lock);
cgroup_rstat_flush_locked(cgrp);
- spin_unlock_irq(&cgroup_rstat_lock);
+ spin_unlock(&cgroup_rstat_lock);
}
/**
@@ -211,7 +211,7 @@ static void cgroup_rstat_flush_hold(stru
__acquires(&cgroup_rstat_lock)
{
might_sleep();
- spin_lock_irq(&cgroup_rstat_lock);
+ spin_lock(&cgroup_rstat_lock);
cgroup_rstat_flush_locked(cgrp);
}
@@ -221,7 +221,7 @@ static void cgroup_rstat_flush_hold(stru
static void cgroup_rstat_flush_release(void)
__releases(&cgroup_rstat_lock)
{
- spin_unlock_irq(&cgroup_rstat_lock);
+ spin_unlock(&cgroup_rstat_lock);
}
int cgroup_rstat_init(struct cgroup *cgrp)

View File

@ -0,0 +1,41 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 11 Feb 2019 10:40:46 +0100
Subject: [PATCH] mm: workingset: replace IRQ-off check with a lockdep assert.
Commit
68d48e6a2df57 ("mm: workingset: add vmstat counter for shadow nodes")
introduced an IRQ-off check to ensure that a lock is held which also
disabled interrupts. This does not work the same way on -RT because none
of the locks that are held disable interrupts.
Replace this check with a lockdep assert which ensures that the lock is
held.
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
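As an illustration (hypothetical, not from mm/workingset.c): lockdep_assert_held() states the actual requirement, that the i_pages lock is held, and that holds on both !RT and RT, whereas an irqs_disabled() check is only incidentally true on !RT where the lock happens to disable interrupts.

/* Hypothetical example module; names are made up. */
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/lockdep.h>

static DEFINE_SPINLOCK(pages_lock);
static unsigned long nr_shadow_nodes;

static void update_shadow_stats(void)
{
	/*
	 * Document and check the locking requirement itself, not a side
	 * effect of it: valid on !RT and RT alike.
	 */
	lockdep_assert_held(&pages_lock);
	nr_shadow_nodes++;
}

static int __init lockdep_demo_init(void)
{
	spin_lock(&pages_lock);
	update_shadow_stats();
	spin_unlock(&pages_lock);

	pr_info("shadow nodes: %lu\n", nr_shadow_nodes);
	return 0;
}

static void __exit lockdep_demo_exit(void)
{
}

module_init(lockdep_demo_init);
module_exit(lockdep_demo_exit);
MODULE_LICENSE("GPL");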
---
mm/workingset.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -367,6 +367,8 @@ static struct list_lru shadow_nodes;
void workingset_update_node(struct xa_node *node)
{
+ struct address_space *mapping;
+
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
@@ -375,7 +377,8 @@ void workingset_update_node(struct xa_no
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+ mapping = container_of(node->array, struct address_space, i_pages);
+ lockdep_assert_held(&mapping->i_pages.xa_lock);
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {

View File

@ -0,0 +1,28 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 11 Feb 2019 11:33:11 +0100
Subject: [PATCH] tpm: remove tpm_dev_wq_lock
Added in commit
9e1b74a63f776 ("tpm: add support for nonblocking operation")
but it was never actually used.
Cc: Philip Tricca <philip.b.tricca@intel.com>
Cc: Tadeusz Struk <tadeusz.struk@intel.com>
Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
drivers/char/tpm/tpm-dev-common.c | 1 -
1 file changed, 1 deletion(-)
--- a/drivers/char/tpm/tpm-dev-common.c
+++ b/drivers/char/tpm/tpm-dev-common.c
@@ -20,7 +20,6 @@
#include "tpm-dev.h"
static struct workqueue_struct *tpm_dev_wq;
-static DEFINE_MUTEX(tpm_dev_wq_lock);
static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space,
u8 *buf, size_t bufsiz)

View File

@ -0,0 +1,314 @@
From: Rob Herring <robh@kernel.org>
Date: Wed, 11 Dec 2019 17:23:45 -0600
Subject: [PATCH] of: Rework and simplify phandle cache to use a fixed size
The phandle cache was added to speed up of_find_node_by_phandle() by
avoiding walking the whole DT to find a matching phandle. The
implementation has several shortcomings:
- The cache is designed to work on a linear set of phandle values.
This is true for dtc generated DTs, but not for other cases such as
Power.
- The cache isn't enabled until of_core_init() and a typical system
may see hundreds of calls to of_find_node_by_phandle() before that
point.
- The cache is freed and re-allocated when the number of phandles
changes.
- It takes a raw spinlock around a memory allocation which breaks on
RT.
Change the implementation to a fixed size and use hash_32() as the
cache index. This greatly simplifies the implementation. It avoids
the need for any re-alloc of the cache and taking a reference on nodes
in the cache. We only have a single source of removing cache entries
which is of_detach_node().
Using hash_32() removes any assumption on phandle values improving
the hit rate for non-linear phandle values. The effect on linear values
using hash_32() is about a 10% collision. The chances of thrashing on
colliding values seems to be low.
To compare performance, I used a RK3399 board which is a pretty typical
system. I found that just measuring boot time as done previously is
noisy and may be impacted by other things. Also bringing up secondary
cores causes some issues with measuring, so I booted with 'nr_cpus=1'.
With no caching, calls to of_find_node_by_phandle() take about 20124 us
for 1248 calls. There's an additional 288 calls before time keeping is
up. Using the average time per hit/miss with the cache, we can calculate
these calls to take 690 us (277 hit / 11 miss) with a 128 entry cache
and 13319 us with no cache or an uninitialized cache.
Comparing the 3 implementations the time spent in
of_find_node_by_phandle() is:
no cache: 20124 us (+ 13319 us)
128 entry cache: 5134 us (+ 690 us)
current cache: 819 us (+ 13319 us)
We could move the allocation of the cache earlier to improve the
current cache, but that just further complicates the situation as it
needs to be after slab is up, so we can't do it when unflattening (which
uses memblock).
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Segher Boessenkool <segher@kernel.crashing.org>
Cc: Frank Rowand <frowand.list@gmail.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lkml.kernel.org/r/20191211232345.24810-1-robh@kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
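To see the scheme in isolation, here is a hypothetical userspace sketch (not the kernel code): a fixed-size array indexed by a hash of the phandle value, populated on lookup misses and invalidated when a node is detached. The hash mirrors the kernel's hash_32() multiplicative constant, and 2^7 = 128 entries matches OF_PHANDLE_CACHE_BITS in the patch.

/* Hypothetical userspace sketch of a fixed-size, hash-indexed phandle cache. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define CACHE_BITS 7
#define CACHE_SZ   (1u << CACHE_BITS)

struct node {
	uint32_t phandle;
	const char *name;
};

static struct node *phandle_cache[CACHE_SZ];

/* Same multiplicative hash as the kernel's hash_32(). */
static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	return (val * 0x61C88647u) >> (32 - bits);
}

static struct node *find_node_by_phandle(struct node *all, size_t n, uint32_t handle)
{
	uint32_t idx;

	if (!handle)
		return NULL;

	idx = hash_32(handle, CACHE_BITS);
	if (phandle_cache[idx] && phandle_cache[idx]->phandle == handle)
		return phandle_cache[idx];		/* cache hit */

	for (size_t i = 0; i < n; i++) {		/* miss: fall back to a full scan */
		if (all[i].phandle == handle) {
			phandle_cache[idx] = &all[i];	/* populate; no refcount needed */
			return &all[i];
		}
	}
	return NULL;
}

static void invalidate_phandle(uint32_t handle)
{
	uint32_t idx = hash_32(handle, CACHE_BITS);

	if (handle && phandle_cache[idx] &&
	    phandle_cache[idx]->phandle == handle)
		phandle_cache[idx] = NULL;		/* node detached: drop the entry */
}

int main(void)
{
	struct node nodes[] = { { 5, "uart0" }, { 9, "i2c1" } };

	printf("%s\n", find_node_by_phandle(nodes, 2, 9)->name);	/* miss, then cached */
	printf("%s\n", find_node_by_phandle(nodes, 2, 9)->name);	/* hit */
	invalidate_phandle(9);
	return 0;
}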
---
drivers/of/base.c | 133 +++++++++---------------------------------------
drivers/of/dynamic.c | 2
drivers/of/of_private.h | 4 -
drivers/of/overlay.c | 10 ---
4 files changed, 28 insertions(+), 121 deletions(-)
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -123,115 +123,38 @@ int __weak of_node_to_nid(struct device_
}
#endif
-/*
- * Assumptions behind phandle_cache implementation:
- * - phandle property values are in a contiguous range of 1..n
- *
- * If the assumptions do not hold, then
- * - the phandle lookup overhead reduction provided by the cache
- * will likely be less
- */
+#define OF_PHANDLE_CACHE_BITS 7
+#define OF_PHANDLE_CACHE_SZ BIT(OF_PHANDLE_CACHE_BITS)
-static struct device_node **phandle_cache;
-static u32 phandle_cache_mask;
+static struct device_node *phandle_cache[OF_PHANDLE_CACHE_SZ];
-/*
- * Caller must hold devtree_lock.
- */
-static void __of_free_phandle_cache(void)
+static u32 of_phandle_cache_hash(phandle handle)
{
- u32 cache_entries = phandle_cache_mask + 1;
- u32 k;
-
- if (!phandle_cache)
- return;
-
- for (k = 0; k < cache_entries; k++)
- of_node_put(phandle_cache[k]);
-
- kfree(phandle_cache);
- phandle_cache = NULL;
+ return hash_32(handle, OF_PHANDLE_CACHE_BITS);
}
-int of_free_phandle_cache(void)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&devtree_lock, flags);
-
- __of_free_phandle_cache();
-
- raw_spin_unlock_irqrestore(&devtree_lock, flags);
-
- return 0;
-}
-#if !defined(CONFIG_MODULES)
-late_initcall_sync(of_free_phandle_cache);
-#endif
-
/*
* Caller must hold devtree_lock.
*/
-void __of_free_phandle_cache_entry(phandle handle)
+void __of_phandle_cache_inv_entry(phandle handle)
{
- phandle masked_handle;
+ u32 handle_hash;
struct device_node *np;
if (!handle)
return;
- masked_handle = handle & phandle_cache_mask;
-
- if (phandle_cache) {
- np = phandle_cache[masked_handle];
- if (np && handle == np->phandle) {
- of_node_put(np);
- phandle_cache[masked_handle] = NULL;
- }
- }
-}
-
-void of_populate_phandle_cache(void)
-{
- unsigned long flags;
- u32 cache_entries;
- struct device_node *np;
- u32 phandles = 0;
-
- raw_spin_lock_irqsave(&devtree_lock, flags);
-
- __of_free_phandle_cache();
+ handle_hash = of_phandle_cache_hash(handle);
- for_each_of_allnodes(np)
- if (np->phandle && np->phandle != OF_PHANDLE_ILLEGAL)
- phandles++;
-
- if (!phandles)
- goto out;
-
- cache_entries = roundup_pow_of_two(phandles);
- phandle_cache_mask = cache_entries - 1;
-
- phandle_cache = kcalloc(cache_entries, sizeof(*phandle_cache),
- GFP_ATOMIC);
- if (!phandle_cache)
- goto out;
-
- for_each_of_allnodes(np)
- if (np->phandle && np->phandle != OF_PHANDLE_ILLEGAL) {
- of_node_get(np);
- phandle_cache[np->phandle & phandle_cache_mask] = np;
- }
-
-out:
- raw_spin_unlock_irqrestore(&devtree_lock, flags);
+ np = phandle_cache[handle_hash];
+ if (np && handle == np->phandle)
+ phandle_cache[handle_hash] = NULL;
}
void __init of_core_init(void)
{
struct device_node *np;
- of_populate_phandle_cache();
/* Create the kset, and register existing nodes */
mutex_lock(&of_mutex);
@@ -241,8 +164,11 @@ void __init of_core_init(void)
pr_err("failed to register existing nodes\n");
return;
}
- for_each_of_allnodes(np)
+ for_each_of_allnodes(np) {
__of_attach_node_sysfs(np);
+ if (np->phandle && !phandle_cache[of_phandle_cache_hash(np->phandle)])
+ phandle_cache[of_phandle_cache_hash(np->phandle)] = np;
+ }
mutex_unlock(&of_mutex);
/* Symlink in /proc as required by userspace ABI */
@@ -1223,36 +1149,29 @@ struct device_node *of_find_node_by_phan
{
struct device_node *np = NULL;
unsigned long flags;
- phandle masked_handle;
+ u32 handle_hash;
if (!handle)
return NULL;
- raw_spin_lock_irqsave(&devtree_lock, flags);
+ handle_hash = of_phandle_cache_hash(handle);
- masked_handle = handle & phandle_cache_mask;
+ raw_spin_lock_irqsave(&devtree_lock, flags);
- if (phandle_cache) {
- if (phandle_cache[masked_handle] &&
- handle == phandle_cache[masked_handle]->phandle)
- np = phandle_cache[masked_handle];
- if (np && of_node_check_flag(np, OF_DETACHED)) {
- WARN_ON(1); /* did not uncache np on node removal */
- of_node_put(np);
- phandle_cache[masked_handle] = NULL;
- np = NULL;
- }
+ if (phandle_cache[handle_hash] &&
+ handle == phandle_cache[handle_hash]->phandle)
+ np = phandle_cache[handle_hash];
+ if (np && of_node_check_flag(np, OF_DETACHED)) {
+ WARN_ON(1); /* did not uncache np on node removal */
+ phandle_cache[handle_hash] = NULL;
+ np = NULL;
}
if (!np) {
for_each_of_allnodes(np)
if (np->phandle == handle &&
!of_node_check_flag(np, OF_DETACHED)) {
- if (phandle_cache) {
- /* will put when removed from cache */
- of_node_get(np);
- phandle_cache[masked_handle] = np;
- }
+ phandle_cache[handle_hash] = np;
break;
}
}
--- a/drivers/of/dynamic.c
+++ b/drivers/of/dynamic.c
@@ -276,7 +276,7 @@ void __of_detach_node(struct device_node
of_node_set_flag(np, OF_DETACHED);
/* race with of_find_node_by_phandle() prevented by devtree_lock */
- __of_free_phandle_cache_entry(np->phandle);
+ __of_phandle_cache_inv_entry(np->phandle);
}
/**
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -85,14 +85,12 @@ int of_resolve_phandles(struct device_no
#endif
#if defined(CONFIG_OF_DYNAMIC)
-void __of_free_phandle_cache_entry(phandle handle);
+void __of_phandle_cache_inv_entry(phandle handle);
#endif
#if defined(CONFIG_OF_OVERLAY)
void of_overlay_mutex_lock(void);
void of_overlay_mutex_unlock(void);
-int of_free_phandle_cache(void);
-void of_populate_phandle_cache(void);
#else
static inline void of_overlay_mutex_lock(void) {};
static inline void of_overlay_mutex_unlock(void) {};
--- a/drivers/of/overlay.c
+++ b/drivers/of/overlay.c
@@ -974,8 +974,6 @@ static int of_overlay_apply(const void *
goto err_free_overlay_changeset;
}
- of_populate_phandle_cache();
-
ret = __of_changeset_apply_notify(&ovcs->cset);
if (ret)
pr_err("overlay apply changeset entry notify error %d\n", ret);
@@ -1218,17 +1216,9 @@ int of_overlay_remove(int *ovcs_id)
list_del(&ovcs->ovcs_list);
- /*
- * Disable phandle cache. Avoids race condition that would arise
- * from removing cache entry when the associated node is deleted.
- */
- of_free_phandle_cache();
-
ret_apply = 0;
ret = __of_changeset_revert_entries(&ovcs->cset, &ret_apply);
- of_populate_phandle_cache();
-
if (ret) {
if (ret_apply)
devicetree_state_flags |= DTSF_REVERT_FAIL;

View File

@ -0,0 +1,43 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 6 Mar 2020 15:59:06 +0100
Subject: [PATCH] mm: Warn on memory allocation in non-preemptible context on
RT
The memory allocation via kmalloc(, GFP_ATOMIC) in atomic context
(disabled preemption or interrupts) is not allowed on RT because the
buddy allocator is using sleeping locks which can't be acquired in this
context.
Such an allocation may not trigger a warning in the buddy allocator
if it is always satisfied in the SLUB allocator.
Add a warning on RT if a memory allocation was attempted in a
non-preemptible region.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
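A hypothetical example of the kind of caller the new check is meant to flag (not from the tree): a GFP_ATOMIC allocation from a preempt-disabled region, which is tolerated on !RT but cannot be satisfied on RT if the request has to fall back to the buddy allocator.

/* Hypothetical example; on an RT kernel with CONFIG_DEBUG_ATOMIC_SLEEP
 * the kmalloc() below would now trip the added WARN_ON_ONCE(). */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/preempt.h>

static int __init atomicalloc_demo_init(void)
{
	void *p;

	preempt_disable();
	p = kmalloc(64, GFP_ATOMIC);	/* !preemptible(): flagged on PREEMPT_RT */
	preempt_enable();

	kfree(p);
	return 0;
}

static void __exit atomicalloc_demo_exit(void)
{
}

module_init(atomicalloc_demo_init);
module_exit(atomicalloc_demo_exit);
MODULE_LICENSE("GPL");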
---
mm/slub.c | 6 ++++++
1 file changed, 6 insertions(+)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2687,6 +2687,9 @@ static __always_inline void *slab_alloc_
struct page *page;
unsigned long tid;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP))
+ WARN_ON_ONCE(!preemptible() && system_state >= SYSTEM_SCHEDULING);
+
s = slab_pre_alloc_hook(s, gfpflags);
if (!s)
return NULL;
@@ -3148,6 +3151,9 @@ int kmem_cache_alloc_bulk(struct kmem_ca
struct kmem_cache_cpu *c;
int i;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP))
+ WARN_ON_ONCE(!preemptible() && system_state >= SYSTEM_SCHEDULING);
+
/* memcg and kmem_cache debug support */
s = slab_pre_alloc_hook(s, flags);
if (unlikely(!s))

View File

@ -0,0 +1,156 @@
Subject: timekeeping: Split jiffies seqlock
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 14 Feb 2013 22:36:59 +0100
Replace jiffies_lock seqlock with a simple seqcounter and a rawlock so
it can be taken in atomic context on RT.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
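Shown in one place for illustration (hypothetical module, not from the patch): writers serialize on the raw spinlock, which can still be taken in atomic context on RT, and bump the seqcount around the update; readers stay lockless and retry until they observe an unchanged, even sequence.

/* Hypothetical example module demonstrating the seqcount + raw lock split. */
#include <linux/module.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(sample_lock);	/* serializes writers, usable in atomic context on RT */
static seqcount_t sample_seq = SEQCNT_ZERO(sample_seq);
static u64 sample_a, sample_b;			/* data the reader must see consistently */

static void sample_update(u64 a, u64 b)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&sample_lock, flags);
	write_seqcount_begin(&sample_seq);
	sample_a = a;
	sample_b = b;
	write_seqcount_end(&sample_seq);
	raw_spin_unlock_irqrestore(&sample_lock, flags);
}

static u64 sample_sum(void)
{
	unsigned int seq;
	u64 sum;

	do {
		seq = read_seqcount_begin(&sample_seq);
		sum = sample_a + sample_b;
	} while (read_seqcount_retry(&sample_seq, seq));

	return sum;
}

static int __init seqsplit_demo_init(void)
{
	sample_update(2, 3);
	pr_info("sample sum=%llu\n", (unsigned long long)sample_sum());
	return 0;
}

static void __exit seqsplit_demo_exit(void)
{
}

module_init(seqsplit_demo_init);
module_exit(seqsplit_demo_exit);
MODULE_LICENSE("GPL");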
---
kernel/time/jiffies.c | 7 ++++---
kernel/time/tick-common.c | 10 ++++++----
kernel/time/tick-sched.c | 19 ++++++++++++-------
kernel/time/timekeeping.c | 6 ++++--
kernel/time/timekeeping.h | 3 ++-
5 files changed, 28 insertions(+), 17 deletions(-)
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -58,7 +58,8 @@ static struct clocksource clocksource_ji
.max_cycles = 10,
};
-__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);
+__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
+__cacheline_aligned_in_smp seqcount_t jiffies_seq;
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
@@ -67,9 +68,9 @@ u64 get_jiffies_64(void)
u64 ret;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
ret = jiffies_64;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
return ret;
}
EXPORT_SYMBOL(get_jiffies_64);
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -83,13 +83,15 @@ int tick_is_oneshot_available(void)
static void tick_periodic(int cpu)
{
if (tick_do_timer_cpu == cpu) {
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Keep track of the next tick event */
tick_next_period = ktime_add(tick_next_period, tick_period);
do_timer(1);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -161,9 +163,9 @@ void tick_setup_periodic(struct clock_ev
ktime_t next;
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
next = tick_next_period;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -65,7 +65,8 @@ static void tick_do_update_jiffies64(kti
return;
/* Reevaluate with jiffies_lock held */
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
delta = ktime_sub(now, last_jiffies_update);
if (delta >= tick_period) {
@@ -91,10 +92,12 @@ static void tick_do_update_jiffies64(kti
/* Keep the tick_next_period variable up to date */
tick_next_period = ktime_add(last_jiffies_update, tick_period);
} else {
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return;
}
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -105,12 +108,14 @@ static ktime_t tick_init_jiffy_update(vo
{
ktime_t period;
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
/* Did we start the jiffies update yet ? */
if (last_jiffies_update == 0)
last_jiffies_update = tick_next_period;
period = last_jiffies_update;
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
return period;
}
@@ -665,10 +670,10 @@ static ktime_t tick_nohz_next_event(stru
/* Read jiffies and the time when jiffies were updated last */
do {
- seq = read_seqbegin(&jiffies_lock);
+ seq = read_seqcount_begin(&jiffies_seq);
basemono = last_jiffies_update;
basejiff = jiffies;
- } while (read_seqretry(&jiffies_lock, seq));
+ } while (read_seqcount_retry(&jiffies_seq, seq));
ts->last_jiffies = basejiff;
ts->timer_expires_base = basemono;
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2397,8 +2397,10 @@ EXPORT_SYMBOL(hardpps);
*/
void xtime_update(unsigned long ticks)
{
- write_seqlock(&jiffies_lock);
+ raw_spin_lock(&jiffies_lock);
+ write_seqcount_begin(&jiffies_seq);
do_timer(ticks);
- write_sequnlock(&jiffies_lock);
+ write_seqcount_end(&jiffies_seq);
+ raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -25,7 +25,8 @@ static inline void sched_clock_resume(vo
extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
-extern seqlock_t jiffies_lock;
+extern raw_spinlock_t jiffies_lock;
+extern seqcount_t jiffies_seq;
#define CS_NAME_LEN 32

View File

@ -0,0 +1,32 @@
Subject: signal: Revert ptrace preempt magic
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 21 Sep 2011 19:57:12 +0200
Upstream commit '53da1d9456fe7f8 fix ptrace slowness' is nothing more
than a bandaid around the ptrace design trainwreck. It's not a
correctness issue, it's merely a cosmetic bandaid.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
kernel/signal.c | 8 --------
1 file changed, 8 deletions(-)
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2202,16 +2202,8 @@ static void ptrace_stop(int exit_code, i
if (gstop_done && ptrace_reparented(current))
do_notify_parent_cldstop(current, false, why);
- /*
- * Don't want to allow preemption here, because
- * sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
- */
- preempt_disable();
read_unlock(&tasklist_lock);
cgroup_enter_frozen();
- preempt_enable_no_resched();
freezable_schedule();
cgroup_leave_frozen(true);
} else {

View File

@ -0,0 +1,261 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 14 Aug 2019 16:38:43 +0200
Subject: [PATCH] dma-buf: Use seqlock_t instead of disabling preemption
"dma reservation" disables preemption while acquiring the write access
for "seqcount".
Replace the seqcount with a seqlock_t, which provides seqcount-like
semantics and a lock for the writer.
Link: https://lkml.kernel.org/r/f410b429-db86-f81c-7c67-f563fa808b62@free.fr
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
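For reference, a hypothetical sketch of the seqlock_t API the code is converted to (not part of the patch): writer serialization comes from the lock embedded in seqlock_t, so no preempt_disable() is needed around the update, and on RT that embedded lock can sleep and be priority-boosted.

/* Hypothetical example module using the seqlock_t API. */
#include <linux/module.h>
#include <linux/seqlock.h>

static DEFINE_SEQLOCK(obj_seq);
static u64 obj_fence_count;

static void obj_update(u64 count)
{
	write_seqlock(&obj_seq);	/* takes the internal lock; no preempt_disable() needed */
	obj_fence_count = count;
	write_sequnlock(&obj_seq);
}

static u64 obj_read(void)
{
	unsigned int seq;
	u64 count;

	do {
		seq = read_seqbegin(&obj_seq);
		count = obj_fence_count;
	} while (read_seqretry(&obj_seq, seq));

	return count;
}

static int __init seqlockapi_demo_init(void)
{
	obj_update(3);
	pr_info("fence count=%llu\n", (unsigned long long)obj_read());
	return 0;
}

static void __exit seqlockapi_demo_exit(void)
{
}

module_init(seqlockapi_demo_init);
module_exit(seqlockapi_demo_exit);
MODULE_LICENSE("GPL");

The reader side keeps the same begin/retry pattern, which is why the dma-buf and driver changes in the diff below are largely mechanical.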
---
drivers/dma-buf/dma-buf.c | 8 ++--
drivers/dma-buf/dma-resv.c | 45 ++++++++---------------
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 6 +--
drivers/gpu/drm/i915/gem/i915_gem_busy.c | 6 +--
include/linux/dma-resv.h | 4 +-
5 files changed, 27 insertions(+), 42 deletions(-)
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -215,7 +215,7 @@ static __poll_t dma_buf_poll(struct file
return 0;
retry:
- seq = read_seqcount_begin(&resv->seq);
+ seq = read_seqbegin(&resv->seq);
rcu_read_lock();
fobj = rcu_dereference(resv->fence);
@@ -224,7 +224,7 @@ static __poll_t dma_buf_poll(struct file
else
shared_count = 0;
fence_excl = rcu_dereference(resv->fence_excl);
- if (read_seqcount_retry(&resv->seq, seq)) {
+ if (read_seqretry(&resv->seq, seq)) {
rcu_read_unlock();
goto retry;
}
@@ -1190,12 +1190,12 @@ static int dma_buf_debug_show(struct seq
robj = buf_obj->resv;
while (true) {
- seq = read_seqcount_begin(&robj->seq);
+ seq = read_seqbegin(&robj->seq);
rcu_read_lock();
fobj = rcu_dereference(robj->fence);
shared_count = fobj ? fobj->shared_count : 0;
fence = rcu_dereference(robj->fence_excl);
- if (!read_seqcount_retry(&robj->seq, seq))
+ if (!read_seqretry(&robj->seq, seq))
break;
rcu_read_unlock();
}
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -49,12 +49,6 @@
DEFINE_WD_CLASS(reservation_ww_class);
EXPORT_SYMBOL(reservation_ww_class);
-struct lock_class_key reservation_seqcount_class;
-EXPORT_SYMBOL(reservation_seqcount_class);
-
-const char reservation_seqcount_string[] = "reservation_seqcount";
-EXPORT_SYMBOL(reservation_seqcount_string);
-
/**
* dma_resv_list_alloc - allocate fence list
* @shared_max: number of fences we need space for
@@ -103,8 +97,7 @@ void dma_resv_init(struct dma_resv *obj)
{
ww_mutex_init(&obj->lock, &reservation_ww_class);
- __seqcount_init(&obj->seq, reservation_seqcount_string,
- &reservation_seqcount_class);
+ seqlock_init(&obj->seq);
RCU_INIT_POINTER(obj->fence, NULL);
RCU_INIT_POINTER(obj->fence_excl, NULL);
}
@@ -234,8 +227,7 @@ void dma_resv_add_shared_fence(struct dm
fobj = dma_resv_get_list(obj);
count = fobj->shared_count;
- preempt_disable();
- write_seqcount_begin(&obj->seq);
+ write_seqlock(&obj->seq);
for (i = 0; i < count; ++i) {
@@ -255,8 +247,7 @@ void dma_resv_add_shared_fence(struct dm
/* pointer update must be visible before we extend the shared_count */
smp_store_mb(fobj->shared_count, count);
- write_seqcount_end(&obj->seq);
- preempt_enable();
+ write_sequnlock(&obj->seq);
dma_fence_put(old);
}
EXPORT_SYMBOL(dma_resv_add_shared_fence);
@@ -283,14 +274,12 @@ void dma_resv_add_excl_fence(struct dma_
if (fence)
dma_fence_get(fence);
- preempt_disable();
- write_seqcount_begin(&obj->seq);
- /* write_seqcount_begin provides the necessary memory barrier */
+ write_seqlock(&obj->seq);
+ /* write_seqlock provides the necessary memory barrier */
RCU_INIT_POINTER(obj->fence_excl, fence);
if (old)
old->shared_count = 0;
- write_seqcount_end(&obj->seq);
- preempt_enable();
+ write_sequnlock(&obj->seq);
/* inplace update, no shared fences */
while (i--)
@@ -368,13 +357,11 @@ int dma_resv_copy_fences(struct dma_resv
src_list = dma_resv_get_list(dst);
old = dma_resv_get_excl(dst);
- preempt_disable();
- write_seqcount_begin(&dst->seq);
- /* write_seqcount_begin provides the necessary memory barrier */
+ write_seqlock(&dst->seq);
+ /* write_seqlock provides the necessary memory barrier */
RCU_INIT_POINTER(dst->fence_excl, new);
RCU_INIT_POINTER(dst->fence, dst_list);
- write_seqcount_end(&dst->seq);
- preempt_enable();
+ write_sequnlock(&dst->seq);
dma_resv_list_free(src_list);
dma_fence_put(old);
@@ -414,7 +401,7 @@ int dma_resv_get_fences_rcu(struct dma_r
shared_count = i = 0;
rcu_read_lock();
- seq = read_seqcount_begin(&obj->seq);
+ seq = read_seqbegin(&obj->seq);
fence_excl = rcu_dereference(obj->fence_excl);
if (fence_excl && !dma_fence_get_rcu(fence_excl))
@@ -456,7 +443,7 @@ int dma_resv_get_fences_rcu(struct dma_r
}
}
- if (i != shared_count || read_seqcount_retry(&obj->seq, seq)) {
+ if (i != shared_count || read_seqretry(&obj->seq, seq)) {
while (i--)
dma_fence_put(shared[i]);
dma_fence_put(fence_excl);
@@ -507,7 +494,7 @@ long dma_resv_wait_timeout_rcu(struct dm
retry:
shared_count = 0;
- seq = read_seqcount_begin(&obj->seq);
+ seq = read_seqbegin(&obj->seq);
rcu_read_lock();
i = -1;
@@ -553,7 +540,7 @@ long dma_resv_wait_timeout_rcu(struct dm
rcu_read_unlock();
if (fence) {
- if (read_seqcount_retry(&obj->seq, seq)) {
+ if (read_seqretry(&obj->seq, seq)) {
dma_fence_put(fence);
goto retry;
}
@@ -607,7 +594,7 @@ bool dma_resv_test_signaled_rcu(struct d
retry:
ret = true;
shared_count = 0;
- seq = read_seqcount_begin(&obj->seq);
+ seq = read_seqbegin(&obj->seq);
if (test_all) {
unsigned i;
@@ -627,7 +614,7 @@ bool dma_resv_test_signaled_rcu(struct d
break;
}
- if (read_seqcount_retry(&obj->seq, seq))
+ if (read_seqretry(&obj->seq, seq))
goto retry;
}
@@ -639,7 +626,7 @@ bool dma_resv_test_signaled_rcu(struct d
if (ret < 0)
goto retry;
- if (read_seqcount_retry(&obj->seq, seq))
+ if (read_seqretry(&obj->seq, seq))
goto retry;
}
}
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -252,11 +252,9 @@ static int amdgpu_amdkfd_remove_eviction
new->shared_count = k;
/* Install the new fence list, seqcount provides the barriers */
- preempt_disable();
- write_seqcount_begin(&resv->seq);
+ write_seqlock(&resv->seq);
RCU_INIT_POINTER(resv->fence, new);
- write_seqcount_end(&resv->seq);
- preempt_enable();
+ write_sequnlock(&resv->seq);
/* Drop the references to the removed fences or move them to ef_list */
for (i = j, k = 0; i < old->shared_count; ++i) {
--- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c
@@ -75,7 +75,6 @@ busy_check_writer(const struct dma_fence
return __busy_set_if_active(fence, __busy_write_id);
}
-
int
i915_gem_busy_ioctl(struct drm_device *dev, void *data,
struct drm_file *file)
@@ -110,7 +109,8 @@ i915_gem_busy_ioctl(struct drm_device *d
*
*/
retry:
- seq = raw_read_seqcount(&obj->base.resv->seq);
+	/* XXX raw_read_seqcount() does not wait for the WRITE to finish */
+ seq = read_seqbegin(&obj->base.resv->seq);
/* Translate the exclusive fence to the READ *and* WRITE engine */
args->busy =
@@ -129,7 +129,7 @@ i915_gem_busy_ioctl(struct drm_device *d
}
}
- if (args->busy && read_seqcount_retry(&obj->base.resv->seq, seq))
+ if (args->busy && read_seqretry(&obj->base.resv->seq, seq))
goto retry;
err = 0;
--- a/include/linux/dma-resv.h
+++ b/include/linux/dma-resv.h
@@ -65,13 +65,13 @@ struct dma_resv_list {
/**
* struct dma_resv - a reservation object manages fences for a buffer
* @lock: update side lock
- * @seq: sequence count for managing RCU read-side synchronization
+ * @seq: sequence lock for managing RCU read-side synchronization
* @fence_excl: the exclusive fence, if there is one currently
* @fence: list of current shared fences
*/
struct dma_resv {
struct ww_mutex lock;
- seqcount_t seq;
+ seqlock_t seq;
struct dma_fence __rcu *fence_excl;
struct dma_resv_list __rcu *fence;

View File

@ -0,0 +1,188 @@
Subject: seqlock: Prevent rt starvation
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 22 Feb 2012 12:03:30 +0100
If a low prio writer gets preempted while holding the seqlock write
locked, a high prio reader spins forever on RT.
To prevent this let the reader grab the spinlock, so it blocks and
eventually boosts the writer. This way the writer can proceed and
endless spinning is prevented.
For seqcount writers we disable preemption over the update code
path. Thanks to Al Viro for disentangling some VFS code to make that
possible.
Nicholas Mc Guire:
- spin_lock+unlock => spin_unlock_wait
- __write_seqcount_begin => __raw_write_seqcount_begin
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
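A hypothetical userspace analogue of the RT reader (not the kernel code, which waits on the seqlock's internal lock via spin_unlock_wait()): when the reader sees an odd sequence it blocks on the writer's priority-inheritance mutex instead of spinning, so a preempted writer inherits the reader's priority, finishes its update, and the reader can make progress. Memory barriers are simplified here; the kernel primitives add the required smp_rmb()/smp_wmb(). Build with cc -pthread.

/* Hypothetical userspace sketch of a starvation-safe seqlock reader. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t write_lock;	/* PI mutex, set up in main() */
static atomic_uint sequence;		/* odd while a write is in progress */
static int shared_data;

static void writer_update(int v)
{
	pthread_mutex_lock(&write_lock);
	atomic_fetch_add(&sequence, 1);		/* odd: write in progress */
	shared_data = v;
	atomic_fetch_add(&sequence, 1);		/* even: write complete */
	pthread_mutex_unlock(&write_lock);
}

static unsigned int read_begin(void)
{
	unsigned int seq;

	for (;;) {
		seq = atomic_load(&sequence);
		if (!(seq & 1))
			return seq;
		/*
		 * Writer in progress: instead of spinning (which would
		 * starve a preempted low-priority writer), block on its
		 * PI mutex so the writer inherits our priority, then
		 * drop the lock and re-check.
		 */
		pthread_mutex_lock(&write_lock);
		pthread_mutex_unlock(&write_lock);
	}
}

static int read_retry(unsigned int seq)
{
	return atomic_load(&sequence) != seq;
}

int main(void)
{
	pthread_mutexattr_t attr;
	unsigned int seq;
	int v;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&write_lock, &attr);

	writer_update(42);

	do {
		seq = read_begin();
		v = shared_data;
	} while (read_retry(seq));

	printf("read %d\n", v);
	return 0;
}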
---
include/linux/seqlock.h | 57 +++++++++++++++++++++++++++++++++++++-----------
include/net/neighbour.h | 6 ++---
2 files changed, 48 insertions(+), 15 deletions(-)
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -221,20 +221,30 @@ static inline int read_seqcount_retry(co
return __read_seqcount_retry(s, start);
}
-
-
-static inline void raw_write_seqcount_begin(seqcount_t *s)
+static inline void __raw_write_seqcount_begin(seqcount_t *s)
{
s->sequence++;
smp_wmb();
}
-static inline void raw_write_seqcount_end(seqcount_t *s)
+static inline void raw_write_seqcount_begin(seqcount_t *s)
+{
+ preempt_disable_rt();
+ __raw_write_seqcount_begin(s);
+}
+
+static inline void __raw_write_seqcount_end(seqcount_t *s)
{
smp_wmb();
s->sequence++;
}
+static inline void raw_write_seqcount_end(seqcount_t *s)
+{
+ __raw_write_seqcount_end(s);
+ preempt_enable_rt();
+}
+
/**
* raw_write_seqcount_barrier - do a seq write barrier
* @s: pointer to seqcount_t
@@ -428,10 +438,33 @@ typedef struct {
/*
* Read side functions for starting and finalizing a read side section.
*/
+#ifndef CONFIG_PREEMPT_RT
static inline unsigned read_seqbegin(const seqlock_t *sl)
{
return read_seqcount_begin(&sl->seqcount);
}
+#else
+/*
+ * Starvation safe read side for RT
+ */
+static inline unsigned read_seqbegin(seqlock_t *sl)
+{
+ unsigned ret;
+
+repeat:
+ ret = READ_ONCE(sl->seqcount.sequence);
+ if (unlikely(ret & 1)) {
+ /*
+ * Take the lock and let the writer proceed (i.e. evtl
+ * boost it), otherwise we could loop here forever.
+ */
+ spin_unlock_wait(&sl->lock);
+ goto repeat;
+ }
+ smp_rmb();
+ return ret;
+}
+#endif
static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
{
@@ -446,36 +479,36 @@ static inline unsigned read_seqretry(con
static inline void write_seqlock(seqlock_t *sl)
{
spin_lock(&sl->lock);
- write_seqcount_begin(&sl->seqcount);
+ __raw_write_seqcount_begin(&sl->seqcount);
}
static inline void write_sequnlock(seqlock_t *sl)
{
- write_seqcount_end(&sl->seqcount);
+ __raw_write_seqcount_end(&sl->seqcount);
spin_unlock(&sl->lock);
}
static inline void write_seqlock_bh(seqlock_t *sl)
{
spin_lock_bh(&sl->lock);
- write_seqcount_begin(&sl->seqcount);
+ __raw_write_seqcount_begin(&sl->seqcount);
}
static inline void write_sequnlock_bh(seqlock_t *sl)
{
- write_seqcount_end(&sl->seqcount);
+ __raw_write_seqcount_end(&sl->seqcount);
spin_unlock_bh(&sl->lock);
}
static inline void write_seqlock_irq(seqlock_t *sl)
{
spin_lock_irq(&sl->lock);
- write_seqcount_begin(&sl->seqcount);
+ __raw_write_seqcount_begin(&sl->seqcount);
}
static inline void write_sequnlock_irq(seqlock_t *sl)
{
- write_seqcount_end(&sl->seqcount);
+ __raw_write_seqcount_end(&sl->seqcount);
spin_unlock_irq(&sl->lock);
}
@@ -484,7 +517,7 @@ static inline unsigned long __write_seql
unsigned long flags;
spin_lock_irqsave(&sl->lock, flags);
- write_seqcount_begin(&sl->seqcount);
+ __raw_write_seqcount_begin(&sl->seqcount);
return flags;
}
@@ -494,7 +527,7 @@ static inline unsigned long __write_seql
static inline void
write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
{
- write_seqcount_end(&sl->seqcount);
+ __raw_write_seqcount_end(&sl->seqcount);
spin_unlock_irqrestore(&sl->lock, flags);
}
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -459,7 +459,7 @@ static inline int neigh_hh_bridge(struct
}
#endif
-static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
+static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
{
unsigned int hh_alen = 0;
unsigned int seq;
@@ -502,7 +502,7 @@ static inline int neigh_hh_output(const
static inline int neigh_output(struct neighbour *n, struct sk_buff *skb,
bool skip_cache)
{
- const struct hh_cache *hh = &n->hh;
+ struct hh_cache *hh = &n->hh;
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache)
return neigh_hh_output(hh, skb);
@@ -543,7 +543,7 @@ struct neighbour_cb {
#define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb)
-static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
+static inline void neigh_ha_snapshot(char *dst, struct neighbour *n,
const struct net_device *dev)
{
unsigned int seq;

View File

@ -0,0 +1,128 @@
Date: Fri, 28 Oct 2016 23:05:11 +0200
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
To: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Anna Schumaker <anna.schumaker@netapp.com>,
linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
tglx@linutronix.de
Subject: NFSv4: replace seqcount_t with a seqlock_t
The raw_write_seqcount_begin() in nfs4_reclaim_open_state() causes a
preempt_disable() on -RT. The spin_lock()/spin_unlock() in that section then
does not work, because on -RT a spinlock_t is a sleeping lock and must not be
taken with preemption disabled.
The lockdep part was removed in commit
abbec2da13f0 ("NFS: Use raw_write_seqcount_begin/end int nfs4_reclaim_open_state")
because lockdep complained.
The whole seqcount thing was introduced in commit
c137afabe330 ("NFSv4: Allow the state manager to mark an open_owner as being recovered").
The recovery thread runs only once.
write_seqlock() does not work on !RT because it disables preemption and the
writer side is preemptible (it has to remain so despite the fact that it will
block readers).
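For illustration only (not taken from the patch): after the conversion, the
delegation-recall style readers sample the seqlock_t with
read_seqbegin()/read_seqretry() to detect concurrent recovery. The structure
and function names below are made up; the seqlock is assumed to have been
initialised with seqlock_init().

#include <linux/seqlock.h>
#include <linux/errno.h>

struct my_owner {
        seqlock_t reclaim_seqlock;      /* seqlock_init() in the allocator */
};

/* stand-in for the real recall operation */
static int my_do_recall(void)
{
        return 0;
}

static int my_recall(struct my_owner *owner)
{
        unsigned int seq;
        int err;

        seq = read_seqbegin(&owner->reclaim_seqlock);
        err = my_do_recall();
        /* state recovery ran while we were working: ask the caller to retry */
        if (!err && read_seqretry(&owner->reclaim_seqlock, seq))
                err = -EAGAIN;
        return err;
}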
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Link: https://lkml.kernel.org/r/20161021164727.24485-1-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/nfs/delegation.c | 4 ++--
fs/nfs/nfs4_fs.h | 2 +-
fs/nfs/nfs4proc.c | 4 ++--
fs/nfs/nfs4state.c | 22 ++++++++++++++++------
4 files changed, 21 insertions(+), 11 deletions(-)
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -162,11 +162,11 @@ static int nfs_delegation_claim_opens(st
sp = state->owner;
/* Block nfs4_proc_unlck */
mutex_lock(&sp->so_delegreturn_mutex);
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ seq = read_seqbegin(&sp->so_reclaim_seqlock);
err = nfs4_open_delegation_recall(ctx, state, stateid);
if (!err)
err = nfs_delegation_claim_locks(state, stateid);
- if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq))
err = -EAGAIN;
mutex_unlock(&sp->so_delegreturn_mutex);
put_nfs_open_context(ctx);
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -115,7 +115,7 @@ struct nfs4_state_owner {
unsigned long so_flags;
struct list_head so_states;
struct nfs_seqid_counter so_seqid;
- seqcount_t so_reclaim_seqcount;
+ seqlock_t so_reclaim_seqlock;
struct mutex so_delegreturn_mutex;
};
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2967,7 +2967,7 @@ static int _nfs4_open_and_get_state(stru
unsigned int seq;
int ret;
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
dir_verifier = nfs_save_change_attribute(dir);
ret = _nfs4_proc_open(opendata, ctx);
@@ -3021,7 +3021,7 @@ static int _nfs4_open_and_get_state(stru
if (d_inode(dentry) == state->inode) {
nfs_inode_attach_open_context(ctx);
- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
+ if (read_seqretry(&sp->so_reclaim_seqlock, seq))
nfs4_schedule_stateid_recovery(server, state);
}
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -508,7 +508,7 @@ nfs4_alloc_state_owner(struct nfs_server
nfs4_init_seqid_counter(&sp->so_seqid);
atomic_set(&sp->so_count, 1);
INIT_LIST_HEAD(&sp->so_lru);
- seqcount_init(&sp->so_reclaim_seqcount);
+ seqlock_init(&sp->so_reclaim_seqlock);
mutex_init(&sp->so_delegreturn_mutex);
return sp;
}
@@ -1616,8 +1616,12 @@ static int nfs4_reclaim_open_state(struc
* recovering after a network partition or a reboot from a
* server that doesn't support a grace period.
*/
+#ifdef CONFIG_PREEMPT_RT
+ write_seqlock(&sp->so_reclaim_seqlock);
+#else
+ write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount);
+#endif
spin_lock(&sp->so_lock);
- raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
restart:
list_for_each_entry(state, &sp->so_states, open_states) {
if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
@@ -1678,14 +1682,20 @@ static int nfs4_reclaim_open_state(struc
spin_lock(&sp->so_lock);
goto restart;
}
- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
+#ifdef CONFIG_PREEMPT_RT
+ write_sequnlock(&sp->so_reclaim_seqlock);
+#else
+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
+#endif
return 0;
out_err:
nfs4_put_open_state(state);
- spin_lock(&sp->so_lock);
- raw_write_seqcount_end(&sp->so_reclaim_seqcount);
- spin_unlock(&sp->so_lock);
+#ifdef CONFIG_PREEMPT_RT
+ write_sequnlock(&sp->so_reclaim_seqlock);
+#else
+ write_seqcount_end(&sp->so_reclaim_seqlock.seqcount);
+#endif
return status;
}

View File

@ -0,0 +1,298 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 14 Sep 2016 17:36:35 +0200
Subject: [PATCH] net/Qdisc: use a seqlock instead of a seqcount
The seqcount disables preemption on -RT while it is held, which we cannot
remove. Also we don't want the reader to spin for ages if the writer is
scheduled out. The seqlock on the other hand will serialize / sleep on
the lock while the writer is active.
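For illustration only (not part of the patch): with the net_seqlock_t
abstraction the same read loop builds for both configurations, using the
net_seq_begin()/net_seq_retry() helpers from the new <net/net_seq_lock.h>
header added by this patch. The function and counter names are made up.

#include <net/net_seq_lock.h>
#include <linux/types.h>

/* read a byte counter that is updated under the qdisc "running" sequence */
static u64 my_read_bytes(net_seqlock_t *running, const u64 *bytes)
{
        unsigned int seq;
        u64 val;

        do {
                seq = net_seq_begin(running);
                val = *bytes;
        } while (net_seq_retry(running, seq));

        return val;
}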
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/seqlock.h | 9 +++++++++
include/net/gen_stats.h | 11 ++++++-----
include/net/net_seq_lock.h | 15 +++++++++++++++
include/net/sch_generic.h | 19 +++++++++++++++++--
net/core/gen_estimator.c | 6 +++---
net/core/gen_stats.c | 12 ++++++------
net/sched/sch_api.c | 2 +-
net/sched/sch_generic.c | 13 +++++++++++++
8 files changed, 70 insertions(+), 17 deletions(-)
create mode 100644 include/net/net_seq_lock.h
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -482,6 +482,15 @@ static inline void write_seqlock(seqlock
__raw_write_seqcount_begin(&sl->seqcount);
}
+static inline int try_write_seqlock(seqlock_t *sl)
+{
+ if (spin_trylock(&sl->lock)) {
+ __raw_write_seqcount_begin(&sl->seqcount);
+ return 1;
+ }
+ return 0;
+}
+
static inline void write_sequnlock(seqlock_t *sl)
{
__raw_write_seqcount_end(&sl->seqcount);
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -6,6 +6,7 @@
#include <linux/socket.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_sched.h>
+#include <net/net_seq_lock.h>
struct gnet_stats_basic_cpu {
struct gnet_stats_basic_packed bstats;
@@ -36,15 +37,15 @@ int gnet_stats_start_copy_compat(struct
spinlock_t *lock, struct gnet_dump *d,
int padattr);
-int gnet_stats_copy_basic(const seqcount_t *running,
+int gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
-void __gnet_stats_copy_basic(const seqcount_t *running,
+void __gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
-int gnet_stats_copy_basic_hw(const seqcount_t *running,
+int gnet_stats_copy_basic_hw(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
@@ -64,13 +65,13 @@ int gen_new_estimator(struct gnet_stats_
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt);
+ net_seqlock_t *running, struct nlattr *opt);
void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **ptr,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt);
+ net_seqlock_t *running, struct nlattr *opt);
bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
struct gnet_stats_rate_est64 *sample);
--- /dev/null
+++ b/include/net/net_seq_lock.h
@@ -0,0 +1,15 @@
+#ifndef __NET_NET_SEQ_LOCK_H__
+#define __NET_NET_SEQ_LOCK_H__
+
+#ifdef CONFIG_PREEMPT_RT
+# define net_seqlock_t seqlock_t
+# define net_seq_begin(__r) read_seqbegin(__r)
+# define net_seq_retry(__r, __s) read_seqretry(__r, __s)
+
+#else
+# define net_seqlock_t seqcount_t
+# define net_seq_begin(__r) read_seqcount_begin(__r)
+# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s)
+#endif
+
+#endif
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -10,6 +10,7 @@
#include <linux/percpu.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
+#include <net/net_seq_lock.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
@@ -100,7 +101,7 @@ struct Qdisc {
struct sk_buff_head gso_skb ____cacheline_aligned_in_smp;
struct qdisc_skb_head q;
struct gnet_stats_basic_packed bstats;
- seqcount_t running;
+ net_seqlock_t running;
struct gnet_stats_queue qstats;
unsigned long state;
struct Qdisc *next_sched;
@@ -138,7 +139,11 @@ static inline bool qdisc_is_running(stru
{
if (qdisc->flags & TCQ_F_NOLOCK)
return spin_is_locked(&qdisc->seqlock);
+#ifdef CONFIG_PREEMPT_RT
+ return spin_is_locked(&qdisc->running.lock) ? true : false;
+#else
return (raw_read_seqcount(&qdisc->running) & 1) ? true : false;
+#endif
}
static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
@@ -162,17 +167,27 @@ static inline bool qdisc_run_begin(struc
} else if (qdisc_is_running(qdisc)) {
return false;
}
+#ifdef CONFIG_PREEMPT_RT
+ if (try_write_seqlock(&qdisc->running))
+ return true;
+ return false;
+#else
/* Variant of write_seqcount_begin() telling lockdep a trylock
* was attempted.
*/
raw_write_seqcount_begin(&qdisc->running);
seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_);
return true;
+#endif
}
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
+#ifdef CONFIG_PREEMPT_RT
+ write_sequnlock(&qdisc->running);
+#else
write_seqcount_end(&qdisc->running);
+#endif
if (qdisc->flags & TCQ_F_NOLOCK)
spin_unlock(&qdisc->seqlock);
}
@@ -541,7 +556,7 @@ static inline spinlock_t *qdisc_root_sle
return qdisc_lock(root);
}
-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
+static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
{
struct Qdisc *root = qdisc_root_sleeping(qdisc);
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -42,7 +42,7 @@
struct net_rate_estimator {
struct gnet_stats_basic_packed *bstats;
spinlock_t *stats_lock;
- seqcount_t *running;
+ net_seqlock_t *running;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
u8 ewma_log;
u8 intvl_log; /* period : (250ms << intvl_log) */
@@ -125,7 +125,7 @@ int gen_new_estimator(struct gnet_stats_
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running,
+ net_seqlock_t *running,
struct nlattr *opt)
{
struct gnet_estimator *parm = nla_data(opt);
@@ -223,7 +223,7 @@ int gen_replace_estimator(struct gnet_st
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt)
+ net_seqlock_t *running, struct nlattr *opt)
{
return gen_new_estimator(bstats, cpu_bstats, rate_est,
lock, running, opt);
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -138,7 +138,7 @@ static void
}
void
-__gnet_stats_copy_basic(const seqcount_t *running,
+__gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
@@ -151,15 +151,15 @@ void
}
do {
if (running)
- seq = read_seqcount_begin(running);
+ seq = net_seq_begin(running);
bstats->bytes = b->bytes;
bstats->packets = b->packets;
- } while (running && read_seqcount_retry(running, seq));
+ } while (running && net_seq_retry(running, seq));
}
EXPORT_SYMBOL(__gnet_stats_copy_basic);
static int
-___gnet_stats_copy_basic(const seqcount_t *running,
+___gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b,
@@ -200,7 +200,7 @@ static int
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(const seqcount_t *running,
+gnet_stats_copy_basic(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
@@ -224,7 +224,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic_hw(const seqcount_t *running,
+gnet_stats_copy_basic_hw(net_seqlock_t *running,
struct gnet_dump *d,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b)
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1248,7 +1248,7 @@ static struct Qdisc *qdisc_create(struct
rcu_assign_pointer(sch->stab, stab);
}
if (tca[TCA_RATE]) {
- seqcount_t *running;
+ net_seqlock_t *running;
err = -EOPNOTSUPP;
if (sch->flags & TCQ_F_MQROOT) {
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -557,7 +557,11 @@ struct Qdisc noop_qdisc = {
.ops = &noop_qdisc_ops,
.q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
.dev_queue = &noop_netdev_queue,
+#ifdef CONFIG_PREEMPT_RT
+ .running = __SEQLOCK_UNLOCKED(noop_qdisc.running),
+#else
.running = SEQCNT_ZERO(noop_qdisc.running),
+#endif
.busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
.gso_skb = {
.next = (struct sk_buff *)&noop_qdisc.gso_skb,
@@ -853,7 +857,11 @@ struct Qdisc *qdisc_alloc(struct netdev_
spin_lock_init(&sch->busylock);
/* seqlock has the same scope of busylock, for NOLOCK qdisc */
spin_lock_init(&sch->seqlock);
+#ifdef CONFIG_PREEMPT_RT
+ seqlock_init(&sch->running);
+#else
seqcount_init(&sch->running);
+#endif
sch->ops = ops;
sch->flags = ops->static_flags;
@@ -867,7 +875,12 @@ struct Qdisc *qdisc_alloc(struct netdev_
if (sch != &noop_qdisc) {
lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key);
lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key);
+#ifdef CONFIG_PREEMPT_RT
+ lockdep_set_class(&sch->running.seqcount, &dev->qdisc_running_key);
+ lockdep_set_class(&sch->running.lock, &dev->qdisc_running_key);
+#else
lockdep_set_class(&sch->running, &dev->qdisc_running_key);
+#endif
}
return sch;

View File

@ -0,0 +1,106 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 20 Mar 2013 18:06:20 +0100
Subject: net: Add a mutex around devnet_rename_seq
On RT write_seqcount_begin() disables preemption and device_rename()
allocates memory with GFP_KERNEL and later grabs the sysfs_mutex.
Serialize with a mutex and use the non-preemption-disabling
__raw_write_seqcount_begin().
To avoid writer starvation, let the reader grab the mutex and release
it when it detects a writer in progress. This keeps the normal case
(no writer in flight) fast.
[ tglx: Instead of replacing the seqcount by a mutex, add the mutex ]
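For illustration only (not part of the patch): the general pattern is a
seqcount whose write side is serialized by a mutex and uses the __raw_*
helpers added earlier in this series, while the reader, instead of spinning,
briefly takes and drops the mutex on retry so a preempted writer can run (and
be PI-boosted on RT). All names below are made up; buf is assumed to hold at
least sizeof(shared_name) bytes.

#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/string.h>

static seqcount_t name_seq = SEQCNT_ZERO(name_seq);
static DEFINE_MUTEX(name_mutex);
static char shared_name[16];

static void name_read(char *buf)
{
        unsigned int seq;

retry:
        seq = raw_seqcount_begin(&name_seq);
        strscpy(buf, shared_name, sizeof(shared_name));
        if (read_seqcount_retry(&name_seq, seq)) {
                /* writer in progress: wait on the mutex instead of spinning */
                mutex_lock(&name_mutex);
                mutex_unlock(&name_mutex);
                goto retry;
        }
}

static void name_write(const char *newname)
{
        mutex_lock(&name_mutex);
        __raw_write_seqcount_begin(&name_seq);
        strscpy(shared_name, newname, sizeof(shared_name));
        __raw_write_seqcount_end(&name_seq);
        mutex_unlock(&name_mutex);
}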
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
net/core/dev.c | 34 ++++++++++++++++++++--------------
1 file changed, 20 insertions(+), 14 deletions(-)
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -195,6 +195,7 @@ static unsigned int napi_gen_id = NR_CPU
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
static seqcount_t devnet_rename_seq;
+static DEFINE_MUTEX(devnet_rename_mutex);
static inline void dev_base_seq_inc(struct net *net)
{
@@ -838,7 +839,8 @@ int netdev_get_name(struct net *net, cha
strcpy(name, dev->name);
rcu_read_unlock();
if (read_seqcount_retry(&devnet_rename_seq, seq)) {
- cond_resched();
+ mutex_lock(&devnet_rename_mutex);
+ mutex_unlock(&devnet_rename_mutex);
goto retry;
}
@@ -1115,20 +1117,17 @@ int dev_change_name(struct net_device *d
likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
return -EBUSY;
- write_seqcount_begin(&devnet_rename_seq);
+ mutex_lock(&devnet_rename_mutex);
+ __raw_write_seqcount_begin(&devnet_rename_seq);
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- write_seqcount_end(&devnet_rename_seq);
- return 0;
- }
+ if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
+ goto outunlock;
memcpy(oldname, dev->name, IFNAMSIZ);
err = dev_get_valid_name(net, dev, newname);
- if (err < 0) {
- write_seqcount_end(&devnet_rename_seq);
- return err;
- }
+ if (err < 0)
+ goto outunlock;
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s\n", oldname);
@@ -1141,11 +1140,12 @@ int dev_change_name(struct net_device *d
if (ret) {
memcpy(dev->name, oldname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
- write_seqcount_end(&devnet_rename_seq);
- return ret;
+ err = ret;
+ goto outunlock;
}
- write_seqcount_end(&devnet_rename_seq);
+ __raw_write_seqcount_end(&devnet_rename_seq);
+ mutex_unlock(&devnet_rename_mutex);
netdev_adjacent_rename_links(dev, oldname);
@@ -1166,7 +1166,8 @@ int dev_change_name(struct net_device *d
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- write_seqcount_begin(&devnet_rename_seq);
+ mutex_lock(&devnet_rename_mutex);
+ __raw_write_seqcount_begin(&devnet_rename_seq);
memcpy(dev->name, oldname, IFNAMSIZ);
memcpy(oldname, newname, IFNAMSIZ);
dev->name_assign_type = old_assign_type;
@@ -1179,6 +1180,11 @@ int dev_change_name(struct net_device *d
}
return err;
+
+outunlock:
+ __raw_write_seqcount_end(&devnet_rename_seq);
+ mutex_unlock(&devnet_rename_mutex);
+ return err;
}
/**

View File

@ -0,0 +1,70 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 18 Dec 2019 12:25:09 +0100
Subject: [PATCH] userfaultfd: Use a seqlock instead of seqcount
On RT write_seqcount_begin() disables preemption, which leads to a warning
in add_wait_queue() when the spinlock_t is acquired.
The waitqueue can't be converted to swait_queue because
userfaultfd_wake_function() is used as a custom wake function.
Use a seqlock instead of a seqcount to avoid the preempt_disable() section
during add_wait_queue().
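For illustration only (not part of the patch): the shape of the fix is that
the refile of a waiter between two waitqueues now runs inside a seqlock_t
write section, which on RT no longer disables preemption, so acquiring the
waitqueue head's spinlock_t inside it is fine. All names below are made up.

#include <linux/seqlock.h>
#include <linux/wait.h>

static DEFINE_SEQLOCK(refile_seq);
static DECLARE_WAIT_QUEUE_HEAD(pending_wqh);
static DECLARE_WAIT_QUEUE_HEAD(done_wqh);

/* move a waiter from the pending queue to the done queue; readers that
 * sample both queues detect the move via read_seqbegin()/read_seqretry() */
static void my_refile(wait_queue_entry_t *wq)
{
        write_seqlock(&refile_seq);
        remove_wait_queue(&pending_wqh, wq);    /* takes the wq head spinlock_t */
        add_wait_queue(&done_wqh, wq);
        write_sequnlock(&refile_seq);
}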
Cc: stable-rt@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/userfaultfd.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -61,7 +61,7 @@ struct userfaultfd_ctx {
/* waitqueue head for events */
wait_queue_head_t event_wqh;
/* a refile sequence protected by fault_pending_wqh lock */
- struct seqcount refile_seq;
+ seqlock_t refile_seq;
/* pseudo fd refcounting */
refcount_t refcount;
/* userfaultfd syscall flags */
@@ -1063,7 +1063,7 @@ static ssize_t userfaultfd_ctx_read(stru
* waitqueue could become empty if this is the
* only userfault.
*/
- write_seqcount_begin(&ctx->refile_seq);
+ write_seqlock(&ctx->refile_seq);
/*
* The fault_pending_wqh.lock prevents the uwq
@@ -1089,7 +1089,7 @@ static ssize_t userfaultfd_ctx_read(stru
list_del(&uwq->wq.entry);
add_wait_queue(&ctx->fault_wqh, &uwq->wq);
- write_seqcount_end(&ctx->refile_seq);
+ write_sequnlock(&ctx->refile_seq);
/* careful to always initialize msg if ret == 0 */
*msg = uwq->msg;
@@ -1262,11 +1262,11 @@ static __always_inline void wake_userfau
* sure we've userfaults to wake.
*/
do {
- seq = read_seqcount_begin(&ctx->refile_seq);
+ seq = read_seqbegin(&ctx->refile_seq);
need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
waitqueue_active(&ctx->fault_wqh);
cond_resched();
- } while (read_seqcount_retry(&ctx->refile_seq, seq));
+ } while (read_seqretry(&ctx->refile_seq, seq));
if (need_wakeup)
__wake_userfault(ctx, range);
}
@@ -1939,7 +1939,7 @@ static void init_once_userfaultfd_ctx(vo
init_waitqueue_head(&ctx->fault_wqh);
init_waitqueue_head(&ctx->event_wqh);
init_waitqueue_head(&ctx->fd_wqh);
- seqcount_init(&ctx->refile_seq);
+ seqlock_init(&ctx->refile_seq);
}
SYSCALL_DEFINE1(userfaultfd, int, flags)

View File

@ -0,0 +1,138 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 15 Sep 2016 10:51:27 +0200
Subject: [PATCH] fs/nfs: turn rmdir_sem into a semaphore
The RW semaphore had a reader side which used the _non_owner version
because it most likely took the reader lock in one thread and released it
in another, which would cause lockdep to complain if the "regular"
version were used.
On -RT we need the owner because the rw lock is turned into a rtmutex.
Semaphores on the other hand are "plain simple" and should work as
expected. We can't have multiple readers, but on -RT we don't allow
multiple readers anyway so that is not a loss.
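For illustration only (not part of the patch): a semaphore tracks no owner, so
it can be taken in the task that queues an asynchronous operation and released
from the completion path, which may run in a different task. Names are made
up.

#include <linux/semaphore.h>

static struct semaphore anon_sem;

static void my_setup(void)
{
        sema_init(&anon_sem, 1);        /* used as a binary, owner-less lock */
}

/* taken in the task that starts the async operation ... */
static void my_op_start(void)
{
        down(&anon_sem);
}

/* ... and released from the completion callback, possibly in another task */
static void my_op_done(void)
{
        up(&anon_sem);
}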
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/nfs/dir.c | 8 ++++++++
fs/nfs/inode.c | 4 ++++
fs/nfs/unlink.c | 31 +++++++++++++++++++++++++++----
include/linux/nfs_fs.h | 4 ++++
4 files changed, 43 insertions(+), 4 deletions(-)
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1846,7 +1846,11 @@ int nfs_rmdir(struct inode *dir, struct
trace_nfs_rmdir_enter(dir, dentry);
if (d_really_is_positive(dentry)) {
+#ifdef CONFIG_PREEMPT_RT
+ down(&NFS_I(d_inode(dentry))->rmdir_sem);
+#else
down_write(&NFS_I(d_inode(dentry))->rmdir_sem);
+#endif
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
/* Ensure the VFS deletes this inode */
switch (error) {
@@ -1856,7 +1860,11 @@ int nfs_rmdir(struct inode *dir, struct
case -ENOENT:
nfs_dentry_handle_enoent(dentry);
}
+#ifdef CONFIG_PREEMPT_RT
+ up(&NFS_I(d_inode(dentry))->rmdir_sem);
+#else
up_write(&NFS_I(d_inode(dentry))->rmdir_sem);
+#endif
} else
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
trace_nfs_rmdir_exit(dir, dentry, error);
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2105,7 +2105,11 @@ static void init_once(void *foo)
atomic_long_set(&nfsi->nrequests, 0);
atomic_long_set(&nfsi->commit_info.ncommit, 0);
atomic_set(&nfsi->commit_info.rpcs_out, 0);
+#ifdef CONFIG_PREEMPT_RT
+ sema_init(&nfsi->rmdir_sem, 1);
+#else
init_rwsem(&nfsi->rmdir_sem);
+#endif
mutex_init(&nfsi->commit_mutex);
nfs4_init_once(nfsi);
}
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -53,6 +53,29 @@ static void nfs_async_unlink_done(struct
rpc_restart_call_prepare(task);
}
+#ifdef CONFIG_PREEMPT_RT
+static void nfs_down_anon(struct semaphore *sema)
+{
+ down(sema);
+}
+
+static void nfs_up_anon(struct semaphore *sema)
+{
+ up(sema);
+}
+
+#else
+static void nfs_down_anon(struct rw_semaphore *rwsem)
+{
+ down_read_non_owner(rwsem);
+}
+
+static void nfs_up_anon(struct rw_semaphore *rwsem)
+{
+ up_read_non_owner(rwsem);
+}
+#endif
+
/**
* nfs_async_unlink_release - Release the sillydelete data.
* @calldata: struct nfs_unlinkdata to release
@@ -66,7 +89,7 @@ static void nfs_async_unlink_release(voi
struct dentry *dentry = data->dentry;
struct super_block *sb = dentry->d_sb;
- up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
+ nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem);
d_lookup_done(dentry);
nfs_free_unlinkdata(data);
dput(dentry);
@@ -119,10 +142,10 @@ static int nfs_call_unlink(struct dentry
struct inode *dir = d_inode(dentry->d_parent);
struct dentry *alias;
- down_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ nfs_down_anon(&NFS_I(dir)->rmdir_sem);
alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq);
if (IS_ERR(alias)) {
- up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ nfs_up_anon(&NFS_I(dir)->rmdir_sem);
return 0;
}
if (!d_in_lookup(alias)) {
@@ -144,7 +167,7 @@ static int nfs_call_unlink(struct dentry
ret = 0;
spin_unlock(&alias->d_lock);
dput(alias);
- up_read_non_owner(&NFS_I(dir)->rmdir_sem);
+ nfs_up_anon(&NFS_I(dir)->rmdir_sem);
/*
* If we'd displaced old cached devname, free it. At that
* point dentry is definitely not a root, so we won't need
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -165,7 +165,11 @@ struct nfs_inode {
/* Readers: in-flight sillydelete RPC calls */
/* Writers: rmdir */
+#ifdef CONFIG_PREEMPT_RT
+ struct semaphore rmdir_sem;
+#else
struct rw_semaphore rmdir_sem;
+#endif
struct mutex commit_mutex;
#if IS_ENABLED(CONFIG_NFS_V4)

View File

@ -0,0 +1,87 @@
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 20 Oct 2017 11:29:53 +0200
Subject: [PATCH] fs/dcache: disable preemption on i_dir_seq's write side
i_dir_seq is an open-coded seqcount. Based on the code it looks like we
could have two writers in parallel despite the fact that the d_lock is
held. The problem is that on RT preemption stays enabled during the write,
and if the writer is interrupted by a reader with RT priority then we lock
up.
To avoid that lockup I am disabling preemption during the update.
The rename of i_dir_seq is here to ensure we catch new write sides in the
future.
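For illustration only (not part of the patch): an open-coded seqcount write
side of this kind looks roughly as below; the patch wraps it in
preempt_disable_rt()/preempt_enable_rt(), which only take effect on RT, while
this sketch uses the plain preempt_disable()/preempt_enable() for simplicity.
The counter name is made up.

#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/processor.h>

static unsigned int obj_seq;

static unsigned int obj_update_begin(void)
{
        preempt_disable();
        for (;;) {
                unsigned int n = obj_seq;

                /* only one writer may move the counter to an odd value */
                if (!(n & 1) && cmpxchg(&obj_seq, n, n + 1) == n)
                        return n;
                cpu_relax();
        }
}

static void obj_update_end(unsigned int n)
{
        smp_store_release(&obj_seq, n + 2);
        preempt_enable();
}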
Cc: stable-rt@vger.kernel.org
Reported-by: Oleg.Karfich@wago.com
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/dcache.c | 12 +++++++-----
fs/inode.c | 2 +-
include/linux/fs.h | 2 +-
3 files changed, 9 insertions(+), 7 deletions(-)
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2482,9 +2482,10 @@ EXPORT_SYMBOL(d_rehash);
static inline unsigned start_dir_add(struct inode *dir)
{
+ preempt_disable_rt();
for (;;) {
- unsigned n = dir->i_dir_seq;
- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+ unsigned n = dir->__i_dir_seq;
+ if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n)
return n;
cpu_relax();
}
@@ -2492,7 +2493,8 @@ static inline unsigned start_dir_add(str
static inline void end_dir_add(struct inode *dir, unsigned n)
{
- smp_store_release(&dir->i_dir_seq, n + 2);
+ smp_store_release(&dir->__i_dir_seq, n + 2);
+ preempt_enable_rt();
}
static void d_wait_lookup(struct dentry *dentry)
@@ -2525,7 +2527,7 @@ struct dentry *d_alloc_parallel(struct d
retry:
rcu_read_lock();
- seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
+ seq = smp_load_acquire(&parent->d_inode->__i_dir_seq);
r_seq = read_seqbegin(&rename_lock);
dentry = __d_lookup_rcu(parent, name, &d_seq);
if (unlikely(dentry)) {
@@ -2553,7 +2555,7 @@ struct dentry *d_alloc_parallel(struct d
}
hlist_bl_lock(b);
- if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
+ if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) {
hlist_bl_unlock(b);
rcu_read_unlock();
goto retry;
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -157,7 +157,7 @@ int inode_init_always(struct super_block
inode->i_bdev = NULL;
inode->i_cdev = NULL;
inode->i_link = NULL;
- inode->i_dir_seq = 0;
+ inode->__i_dir_seq = 0;
inode->i_rdev = 0;
inode->dirtied_when = 0;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -717,7 +717,7 @@ struct inode {
struct block_device *i_bdev;
struct cdev *i_cdev;
char *i_link;
- unsigned i_dir_seq;
+ unsigned __i_dir_seq;
};
__u32 i_generation;

View File

@ -0,0 +1,118 @@
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Fri, 21 Jun 2013 15:07:25 -0400
Subject: list_bl: Make list head locking RT safe
As per changes in include/linux/jbd_common.h for avoiding the
bit_spin_locks on RT ("fs: jbd/jbd2: Make state lock and journal
head lock rt safe") we do the same thing here.
We use the non-atomic __set_bit and __clear_bit inside the scope of
the lock to preserve the ability of the existing LIST_DEBUG code to
use the zeroth bit in its sanity checks.
As a bit spinlock, the lock gave us no lockdep visibility into the usage
of the list head locking. Now, if we were to implement it as a
standard non-raw spinlock, we would see:
BUG: sleeping function called from invalid context at kernel/rtmutex.c:658
in_atomic(): 1, irqs_disabled(): 0, pid: 122, name: udevd
5 locks held by udevd/122:
#0: (&sb->s_type->i_mutex_key#7/1){+.+.+.}, at: [<ffffffff811967e8>] lock_rename+0xe8/0xf0
#1: (rename_lock){+.+...}, at: [<ffffffff811a277c>] d_move+0x2c/0x60
#2: (&dentry->d_lock){+.+...}, at: [<ffffffff811a0763>] dentry_lock_for_move+0xf3/0x130
#3: (&dentry->d_lock/2){+.+...}, at: [<ffffffff811a0734>] dentry_lock_for_move+0xc4/0x130
#4: (&dentry->d_lock/3){+.+...}, at: [<ffffffff811a0747>] dentry_lock_for_move+0xd7/0x130
Pid: 122, comm: udevd Not tainted 3.4.47-rt62 #7
Call Trace:
[<ffffffff810b9624>] __might_sleep+0x134/0x1f0
[<ffffffff817a24d4>] rt_spin_lock+0x24/0x60
[<ffffffff811a0c4c>] __d_shrink+0x5c/0xa0
[<ffffffff811a1b2d>] __d_drop+0x1d/0x40
[<ffffffff811a24be>] __d_move+0x8e/0x320
[<ffffffff811a278e>] d_move+0x3e/0x60
[<ffffffff81199598>] vfs_rename+0x198/0x4c0
[<ffffffff8119b093>] sys_renameat+0x213/0x240
[<ffffffff817a2de5>] ? _raw_spin_unlock+0x35/0x60
[<ffffffff8107781c>] ? do_page_fault+0x1ec/0x4b0
[<ffffffff817a32ca>] ? retint_swapgs+0xe/0x13
[<ffffffff813eb0e6>] ? trace_hardirqs_on_thunk+0x3a/0x3f
[<ffffffff8119b0db>] sys_rename+0x1b/0x20
[<ffffffff817a3b96>] system_call_fastpath+0x1a/0x1f
Since we are only taking the lock during short-lived list operations,
let's assume for now that it being raw won't be a significant latency
concern.
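For illustration only (not part of the patch): callers of the list_bl API are
unaffected by the change; they keep using hlist_bl_lock()/hlist_bl_unlock()
around list operations, and on RT those helpers now take the embedded raw
spinlock instead of the bit spinlock. Names are made up.

#include <linux/list_bl.h>

static struct hlist_bl_head my_bucket;

static void my_bucket_setup(void)
{
        INIT_HLIST_BL_HEAD(&my_bucket); /* on RT this also inits the raw spinlock */
}

static void my_bucket_add(struct hlist_bl_node *n)
{
        hlist_bl_lock(&my_bucket);
        hlist_bl_add_head(n, &my_bucket);
        hlist_bl_unlock(&my_bucket);
}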
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
[julia@ni.com: Use #define instead static inline to avoid false positive from
lockdep]
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
include/linux/list_bl.h | 30 ++++++++++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -3,6 +3,7 @@
#define _LINUX_LIST_BL_H
#include <linux/list.h>
+#include <linux/spinlock.h>
#include <linux/bit_spinlock.h>
/*
@@ -33,13 +34,24 @@
struct hlist_bl_head {
struct hlist_bl_node *first;
+#ifdef CONFIG_PREEMPT_RT
+ raw_spinlock_t lock;
+#endif
};
struct hlist_bl_node {
struct hlist_bl_node *next, **pprev;
};
-#define INIT_HLIST_BL_HEAD(ptr) \
- ((ptr)->first = NULL)
+
+#ifdef CONFIG_PREEMPT_RT
+#define INIT_HLIST_BL_HEAD(h) \
+do { \
+ (h)->first = NULL; \
+ raw_spin_lock_init(&(h)->lock); \
+} while (0)
+#else
+#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL
+#endif
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
@@ -145,12 +157,26 @@ static inline void hlist_bl_del_init(str
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
+#ifndef CONFIG_PREEMPT_RT
bit_spin_lock(0, (unsigned long *)b);
+#else
+ raw_spin_lock(&b->lock);
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+ __set_bit(0, (unsigned long *)b);
+#endif
+#endif
}
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
+#ifndef CONFIG_PREEMPT_RT
__bit_spin_unlock(0, (unsigned long *)b);
+#else
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+ __clear_bit(0, (unsigned long *)b);
+#endif
+ raw_spin_unlock(&b->lock);
+#endif
}
static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)

View File

@ -0,0 +1,53 @@
From: Clark Williams <williams@redhat.com>
Date: Tue, 3 Jul 2018 13:34:30 -0500
Subject: [PATCH] fscache: initialize cookie hash table raw spinlocks
The fscache cookie mechanism uses a hash table of hlist_bl_head structures. The
PREEMPT_RT patchset adds a raw spinlock to this structure and so on PREEMPT_RT
the structures get used uninitialized, causing warnings about bad magic numbers
when spinlock debugging is turned on.
Use the init function for fscache cookies.
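For illustration only (not part of the patch): the fix boils down to running
INIT_HLIST_BL_HEAD() over every bucket of a static hash table at init time, so
that the raw spinlock PREEMPT_RT embeds in hlist_bl_head is set up before
first use. The sizes and names below are made up.

#include <linux/init.h>
#include <linux/list_bl.h>

#define MY_HASH_SHIFT 4
static struct hlist_bl_head my_hash[1 << MY_HASH_SHIFT];

static void __init my_hash_init(void)
{
        int i;

        for (i = 0; i < (1 << MY_HASH_SHIFT); i++)
                INIT_HLIST_BL_HEAD(&my_hash[i]);
}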
Signed-off-by: Clark Williams <williams@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
fs/fscache/cookie.c | 8 ++++++++
fs/fscache/main.c | 1 +
include/linux/fscache.h | 1 +
3 files changed, 10 insertions(+)
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -958,3 +958,11 @@ int __fscache_check_consistency(struct f
return -ESTALE;
}
EXPORT_SYMBOL(__fscache_check_consistency);
+
+void __init fscache_cookie_init(void)
+{
+ int i;
+
+ for (i = 0; i < (1 << fscache_cookie_hash_shift) - 1; i++)
+ INIT_HLIST_BL_HEAD(&fscache_cookie_hash[i]);
+}
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -145,6 +145,7 @@ static int __init fscache_init(void)
ret = -ENOMEM;
goto error_cookie_jar;
}
+ fscache_cookie_init();
fscache_root = kobject_create_and_add("fscache", kernel_kobj);
if (!fscache_root)
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -226,6 +226,7 @@ extern void __fscache_readpages_cancel(s
extern void __fscache_disable_cookie(struct fscache_cookie *, const void *, bool);
extern void __fscache_enable_cookie(struct fscache_cookie *, const void *, loff_t,
bool (*)(void *), void *);
+extern void fscache_cookie_init(void);
/**
* fscache_register_netfs - Register a filesystem as desiring caching services

Some files were not shown because too many files have changed in this diff.