diff --git a/kernel/Makefile b/kernel/Makefile index 491621d23..2f351c6e6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -253,6 +253,7 @@ endef # ifeq ($(ARCH),x86_64) $(eval $(call kernel,5.4.28,5.4.x,$(EXTRA),$(DEBUG))) +$(eval $(call kernel,5.4.28,5.4.x,-rt,)) $(eval $(call kernel,4.19.113,4.19.x,$(EXTRA),$(DEBUG))) $(eval $(call kernel,4.19.113,4.19.x,,-dbg)) $(eval $(call kernel,4.19.106,4.19.x,-rt,)) @@ -260,6 +261,7 @@ $(eval $(call kernel,4.14.174,4.14.x,$(EXTRA),$(DEBUG))) else ifeq ($(ARCH),aarch64) $(eval $(call kernel,5.4.28,5.4.x,$(EXTRA),$(DEBUG))) +$(eval $(call kernel,5.4.28,5.4.x,-rt,)) $(eval $(call kernel,4.19.106,4.19.x,-rt,)) else ifeq ($(ARCH),s390x) diff --git a/kernel/config-5.4.x-aarch64-rt b/kernel/config-5.4.x-aarch64-rt new file mode 100644 index 000000000..59ef068ac --- /dev/null +++ b/kernel/config-5.4.x-aarch64-rt @@ -0,0 +1,20 @@ +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +CONFIG_SLUB=y +# CONFIG_SLAB_FREELIST_HARDENED is not set +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_PREEMPT=y +CONFIG_PREEMPT_RT_BASE=y +CONFIG_HAVE_PREEMPT_LAZY=y +CONFIG_PREEMPT_LAZY=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT__LL is not set +# CONFIG_PREEMPT_RTB is not set +CONFIG_PREEMPT_RT_FULL=y +CONFIG_PREEMPT_COUNT=y +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_DEBUG_PREEMPT=y +# CONFIG_PREEMPT_TRACER is not set +CONFIG_HZ_1000=y +CONFIG_HZ=1000 diff --git a/kernel/config-5.4.x-x86_64-rt b/kernel/config-5.4.x-x86_64-rt new file mode 100644 index 000000000..3c833eb4b --- /dev/null +++ b/kernel/config-5.4.x-x86_64-rt @@ -0,0 +1,22 @@ +CONFIG_RWSEM_GENERIC_SPINLOCK=y +# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set +CONFIG_PREEMPT_RCU=y +CONFIG_TASKS_RCU=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +CONFIG_SLUB=y +# CONFIG_SLAB_FREELIST_HARDENED is not set +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_PREEMPT=y +CONFIG_PREEMPT_RT_BASE=y +CONFIG_HAVE_PREEMPT_LAZY=y +CONFIG_PREEMPT_LAZY=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT__LL is not set +# CONFIG_PREEMPT_RTB is not set +CONFIG_PREEMPT_RT_FULL=y +CONFIG_PREEMPT_COUNT=y +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_DEBUG_PREEMPT=y +# CONFIG_PREEMPT_TRACER is not set diff --git a/kernel/patches-5.4.x-rt/0001-lib-smp_processor_id-Don-t-use-cpumask_equal.patch b/kernel/patches-5.4.x-rt/0001-lib-smp_processor_id-Don-t-use-cpumask_equal.patch new file mode 100644 index 000000000..e0bbebc1c --- /dev/null +++ b/kernel/patches-5.4.x-rt/0001-lib-smp_processor_id-Don-t-use-cpumask_equal.patch @@ -0,0 +1,35 @@ +From: Waiman Long +Date: Thu, 3 Oct 2019 16:36:08 -0400 +Subject: [PATCH] lib/smp_processor_id: Don't use cpumask_equal() + +The check_preemption_disabled() function uses cpumask_equal() to see +if the task is bounded to the current CPU only. cpumask_equal() calls +memcmp() to do the comparison. As x86 doesn't have __HAVE_ARCH_MEMCMP, +the slow memcmp() function in lib/string.c is used. + +On a RT kernel that call check_preemption_disabled() very frequently, +below is the perf-record output of a certain microbenchmark: + + 42.75% 2.45% testpmd [kernel.kallsyms] [k] check_preemption_disabled + 40.01% 39.97% testpmd [kernel.kallsyms] [k] memcmp + +We should avoid calling memcmp() in performance critical path. So the +cpumask_equal() call is now replaced with an equivalent simpler check. 
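For context, a minimal sketch of the check before and after this change (illustrative only; the authoritative two-line hunk is below). A task that may run on exactly one CPU has nr_cpus_allowed == 1, and since it is currently executing on this_cpu, that single allowed CPU must be this_cpu, so the integer compare carries the same information without the memcmp() hidden inside cpumask_equal():

    /* old: compares the whole cpumask via memcmp() */
    if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
            goto out;

    /* new: a single load and integer compare */
    if (current->nr_cpus_allowed == 1)
            goto out;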
+ +Signed-off-by: Waiman Long +Signed-off-by: Sebastian Andrzej Siewior +--- + lib/smp_processor_id.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/lib/smp_processor_id.c ++++ b/lib/smp_processor_id.c +@@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(c + * Kernel threads bound to a single CPU can safely use + * smp_processor_id(): + */ +- if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu))) ++ if (current->nr_cpus_allowed == 1) + goto out; + + /* diff --git a/kernel/patches-5.4.x-rt/0002-0001-jbd2-Simplify-journal_unmap_buffer.patch b/kernel/patches-5.4.x-rt/0002-0001-jbd2-Simplify-journal_unmap_buffer.patch new file mode 100644 index 000000000..b4c4a38bb --- /dev/null +++ b/kernel/patches-5.4.x-rt/0002-0001-jbd2-Simplify-journal_unmap_buffer.patch @@ -0,0 +1,57 @@ +From: Thomas Gleixner +Date: Fri, 9 Aug 2019 14:42:27 +0200 +Subject: [PATCH 1/7] jbd2: Simplify journal_unmap_buffer() + +journal_unmap_buffer() checks first whether the buffer head is a journal. +If so it takes locks and then invokes jbd2_journal_grab_journal_head() +followed by another check whether this is journal head buffer. + +The double checking is pointless. + +Replace the initial check with jbd2_journal_grab_journal_head() which +alredy checks whether the buffer head is actually a journal. + +Allows also early access to the journal head pointer for the upcoming +conversion of state lock to a regular spinlock. + +Signed-off-by: Thomas Gleixner +Reviewed-by: Jan Kara +Cc: linux-ext4@vger.kernel.org +Cc: "Theodore Ts'o" +Signed-off-by: Jan Kara +Signed-off-by: Sebastian Andrzej Siewior +--- + fs/jbd2/transaction.c | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +--- a/fs/jbd2/transaction.c ++++ b/fs/jbd2/transaction.c +@@ -2203,7 +2203,8 @@ static int journal_unmap_buffer(journal_ + * holding the page lock. --sct + */ + +- if (!buffer_jbd(bh)) ++ jh = jbd2_journal_grab_journal_head(bh); ++ if (!jh) + goto zap_buffer_unlocked; + + /* OK, we have data buffer in journaled mode */ +@@ -2211,10 +2212,6 @@ static int journal_unmap_buffer(journal_ + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + +- jh = jbd2_journal_grab_journal_head(bh); +- if (!jh) +- goto zap_buffer_no_jh; +- + /* + * We cannot remove the buffer from checkpoint lists until the + * transaction adding inode to orphan list (let's call it T) +@@ -2338,7 +2335,6 @@ static int journal_unmap_buffer(journal_ + */ + jh->b_modified = 0; + jbd2_journal_put_journal_head(jh); +-zap_buffer_no_jh: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + write_unlock(&journal->j_state_lock); diff --git a/kernel/patches-5.4.x-rt/0003-0002-jbd2-Remove-jbd_trylock_bh_state.patch b/kernel/patches-5.4.x-rt/0003-0002-jbd2-Remove-jbd_trylock_bh_state.patch new file mode 100644 index 000000000..bce5d83a7 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0003-0002-jbd2-Remove-jbd_trylock_bh_state.patch @@ -0,0 +1,30 @@ +From: Thomas Gleixner +Date: Fri, 9 Aug 2019 14:42:28 +0200 +Subject: [PATCH 2/7] jbd2: Remove jbd_trylock_bh_state() + +No users. 
+ +Signed-off-by: Thomas Gleixner +Reviewed-by: Jan Kara +Cc: linux-ext4@vger.kernel.org +Cc: "Theodore Ts'o" +Signed-off-by: Jan Kara +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/jbd2.h | 5 ----- + 1 file changed, 5 deletions(-) + +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -347,11 +347,6 @@ static inline void jbd_lock_bh_state(str + bit_spin_lock(BH_State, &bh->b_state); + } + +-static inline int jbd_trylock_bh_state(struct buffer_head *bh) +-{ +- return bit_spin_trylock(BH_State, &bh->b_state); +-} +- + static inline int jbd_is_locked_bh_state(struct buffer_head *bh) + { + return bit_spin_is_locked(BH_State, &bh->b_state); diff --git a/kernel/patches-5.4.x-rt/0004-0003-jbd2-Move-dropping-of-jh-reference-out-of-un-re-fili.patch b/kernel/patches-5.4.x-rt/0004-0003-jbd2-Move-dropping-of-jh-reference-out-of-un-re-fili.patch new file mode 100644 index 000000000..6464bbb11 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0004-0003-jbd2-Move-dropping-of-jh-reference-out-of-un-re-fili.patch @@ -0,0 +1,150 @@ +From: Jan Kara +Date: Fri, 9 Aug 2019 14:42:29 +0200 +Subject: [PATCH 3/7] jbd2: Move dropping of jh reference out of un/re-filing + functions + +__jbd2_journal_unfile_buffer() and __jbd2_journal_refile_buffer() drop +transaction's jh reference when they remove jh from a transaction. This +will be however inconvenient once we move state lock into journal_head +itself as we still need to unlock it and we'd need to grab jh reference +just for that. Move dropping of jh reference out of these functions into +the few callers. + +Signed-off-by: Jan Kara +Signed-off-by: Sebastian Andrzej Siewior +--- + fs/jbd2/commit.c | 5 ++++- + fs/jbd2/transaction.c | 23 +++++++++++++++-------- + include/linux/jbd2.h | 2 +- + 3 files changed, 20 insertions(+), 10 deletions(-) + +--- a/fs/jbd2/commit.c ++++ b/fs/jbd2/commit.c +@@ -920,6 +920,7 @@ void jbd2_journal_commit_transaction(jou + transaction_t *cp_transaction; + struct buffer_head *bh; + int try_to_free = 0; ++ bool drop_ref; + + jh = commit_transaction->t_forget; + spin_unlock(&journal->j_list_lock); +@@ -1028,8 +1029,10 @@ void jbd2_journal_commit_transaction(jou + try_to_free = 1; + } + JBUFFER_TRACE(jh, "refile or unfile buffer"); +- __jbd2_journal_refile_buffer(jh); ++ drop_ref = __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); ++ if (drop_ref) ++ jbd2_journal_put_journal_head(jh); + if (try_to_free) + release_buffer_page(bh); /* Drops bh reference */ + else +--- a/fs/jbd2/transaction.c ++++ b/fs/jbd2/transaction.c +@@ -1602,6 +1602,7 @@ int jbd2_journal_forget (handle_t *handl + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + } else { + __jbd2_journal_unfile_buffer(jh); ++ jbd2_journal_put_journal_head(jh); + if (!buffer_jbd(bh)) { + spin_unlock(&journal->j_list_lock); + goto not_jbd; +@@ -1975,17 +1976,15 @@ static void __jbd2_journal_temp_unlink_b + } + + /* +- * Remove buffer from all transactions. ++ * Remove buffer from all transactions. The caller is responsible for dropping ++ * the jh reference that belonged to the transaction. + * + * Called with bh_state lock and j_list_lock +- * +- * jh and bh may be already freed when this function returns. 
+ */ + static void __jbd2_journal_unfile_buffer(struct journal_head *jh) + { + __jbd2_journal_temp_unlink_buffer(jh); + jh->b_transaction = NULL; +- jbd2_journal_put_journal_head(jh); + } + + void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) +@@ -1999,6 +1998,7 @@ void jbd2_journal_unfile_buffer(journal_ + __jbd2_journal_unfile_buffer(jh); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); ++ jbd2_journal_put_journal_head(jh); + __brelse(bh); + } + +@@ -2137,6 +2137,7 @@ static int __dispose_buffer(struct journ + } else { + JBUFFER_TRACE(jh, "on running transaction"); + __jbd2_journal_unfile_buffer(jh); ++ jbd2_journal_put_journal_head(jh); + } + return may_free; + } +@@ -2502,9 +2503,11 @@ void jbd2_journal_file_buffer(struct jou + * Called under j_list_lock + * Called under jbd_lock_bh_state(jh2bh(jh)) + * +- * jh and bh may be already free when this function returns ++ * When this function returns true, there's no next transaction to refile to ++ * and the caller has to drop jh reference through ++ * jbd2_journal_put_journal_head(). + */ +-void __jbd2_journal_refile_buffer(struct journal_head *jh) ++bool __jbd2_journal_refile_buffer(struct journal_head *jh) + { + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); +@@ -2516,7 +2519,7 @@ void __jbd2_journal_refile_buffer(struct + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __jbd2_journal_unfile_buffer(jh); +- return; ++ return true; + } + + /* +@@ -2544,6 +2547,7 @@ void __jbd2_journal_refile_buffer(struct + + if (was_dirty) + set_buffer_jbddirty(bh); ++ return false; + } + + /* +@@ -2555,15 +2559,18 @@ void __jbd2_journal_refile_buffer(struct + void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) + { + struct buffer_head *bh = jh2bh(jh); ++ bool drop; + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); +- __jbd2_journal_refile_buffer(jh); ++ drop = __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_list_lock); + __brelse(bh); ++ if (drop) ++ jbd2_journal_put_journal_head(jh); + } + + /* +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -1252,7 +1252,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM + + /* Filing buffers */ + extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); +-extern void __jbd2_journal_refile_buffer(struct journal_head *); ++extern bool __jbd2_journal_refile_buffer(struct journal_head *); + extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); + extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); + extern void __journal_free_buffer(struct journal_head *bh); diff --git a/kernel/patches-5.4.x-rt/0005-0004-jbd2-Drop-unnecessary-branch-from-jbd2_journal_forge.patch b/kernel/patches-5.4.x-rt/0005-0004-jbd2-Drop-unnecessary-branch-from-jbd2_journal_forge.patch new file mode 100644 index 000000000..07e46014b --- /dev/null +++ b/kernel/patches-5.4.x-rt/0005-0004-jbd2-Drop-unnecessary-branch-from-jbd2_journal_forge.patch @@ -0,0 +1,27 @@ +From: Jan Kara +Date: Fri, 9 Aug 2019 14:42:30 +0200 +Subject: [PATCH 4/7] jbd2: Drop unnecessary branch from jbd2_journal_forget() + +We have cleared both dirty & jbddirty bits from the bh. So there's no +difference between bforget() and brelse(). Thus there's no point jumping +to no_jbd branch. 
+ +Signed-off-by: Jan Kara +Signed-off-by: Sebastian Andrzej Siewior +--- + fs/jbd2/transaction.c | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/fs/jbd2/transaction.c ++++ b/fs/jbd2/transaction.c +@@ -1603,10 +1603,6 @@ int jbd2_journal_forget (handle_t *handl + } else { + __jbd2_journal_unfile_buffer(jh); + jbd2_journal_put_journal_head(jh); +- if (!buffer_jbd(bh)) { +- spin_unlock(&journal->j_list_lock); +- goto not_jbd; +- } + } + spin_unlock(&journal->j_list_lock); + } else if (jh->b_transaction) { diff --git a/kernel/patches-5.4.x-rt/0006-0005-jbd2-Don-t-call-__bforget-unnecessarily.patch b/kernel/patches-5.4.x-rt/0006-0005-jbd2-Don-t-call-__bforget-unnecessarily.patch new file mode 100644 index 000000000..13b61b7f7 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0006-0005-jbd2-Don-t-call-__bforget-unnecessarily.patch @@ -0,0 +1,58 @@ +From: Jan Kara +Date: Fri, 9 Aug 2019 14:42:31 +0200 +Subject: [PATCH 5/7] jbd2: Don't call __bforget() unnecessarily + +jbd2_journal_forget() jumps to 'not_jbd' branch which calls __bforget() +in cases where the buffer is clean which is pointless. In case of failed +assertion, it can be even argued that it is safer not to touch buffer's +dirty bits. Also logically it makes more sense to just jump to 'drop' +and that will make logic also simpler when we switch bh_state_lock to a +spinlock. + +Signed-off-by: Jan Kara +Signed-off-by: Sebastian Andrzej Siewior +--- + fs/jbd2/transaction.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +--- a/fs/jbd2/transaction.c ++++ b/fs/jbd2/transaction.c +@@ -1554,7 +1554,7 @@ int jbd2_journal_forget (handle_t *handl + if (!J_EXPECT_JH(jh, !jh->b_committed_data, + "inconsistent data on disk")) { + err = -EIO; +- goto not_jbd; ++ goto drop; + } + + /* keep track of whether or not this transaction modified us */ +@@ -1644,7 +1644,7 @@ int jbd2_journal_forget (handle_t *handl + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "belongs to none transaction"); + spin_unlock(&journal->j_list_lock); +- goto not_jbd; ++ goto drop; + } + + /* +@@ -1654,7 +1654,7 @@ int jbd2_journal_forget (handle_t *handl + if (!buffer_dirty(bh)) { + __jbd2_journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); +- goto not_jbd; ++ goto drop; + } + + /* +@@ -1667,10 +1667,9 @@ int jbd2_journal_forget (handle_t *handl + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + spin_unlock(&journal->j_list_lock); + } +- ++drop: + jbd_unlock_bh_state(bh); + __brelse(bh); +-drop: + if (drop_reserve) { + /* no need to reserve log space for this block -bzzz */ + handle->h_buffer_credits++; diff --git a/kernel/patches-5.4.x-rt/0007-0006-jbd2-Make-state-lock-a-spinlock.patch b/kernel/patches-5.4.x-rt/0007-0006-jbd2-Make-state-lock-a-spinlock.patch new file mode 100644 index 000000000..41415b373 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0007-0006-jbd2-Make-state-lock-a-spinlock.patch @@ -0,0 +1,675 @@ +From: Thomas Gleixner +Date: Fri, 9 Aug 2019 14:42:32 +0200 +Subject: [PATCH 6/7] jbd2: Make state lock a spinlock + +Bit-spinlocks are problematic on PREEMPT_RT if functions which might sleep +on RT, e.g. spin_lock(), alloc/free(), are invoked inside the lock held +region because bit spinlocks disable preemption even on RT. + +A first attempt was to replace state lock with a spinlock placed in struct +buffer_head and make the locking conditional on PREEMPT_RT and +DEBUG_BIT_SPINLOCKS. 
+ +Jan pointed out that there is a 4 byte hole in struct journal_head where a +regular spinlock fits in and he would not object to convert the state lock +to a spinlock unconditionally. + +Aside of solving the RT problem, this also gains lockdep coverage for the +journal head state lock (bit-spinlocks are not covered by lockdep as it's +hard to fit a lockdep map into a single bit). + +The trivial change would have been to convert the jbd_*lock_bh_state() +inlines, but that comes with the downside that these functions take a +buffer head pointer which needs to be converted to a journal head pointer +which adds another level of indirection. + +As almost all functions which use this lock have a journal head pointer +readily available, it makes more sense to remove the lock helper inlines +and write out spin_*lock() at all call sites. + +Fixup all locking comments as well. + +Suggested-by: Jan Kara +Signed-off-by: Thomas Gleixner +Signed-off-by: Jan Kara +Cc: "Theodore Ts'o" +Cc: Mark Fasheh +Cc: Joseph Qi +Cc: Joel Becker +Cc: Jan Kara +Cc: linux-ext4@vger.kernel.org +Signed-off-by: Sebastian Andrzej Siewior +--- + fs/jbd2/commit.c | 8 +-- + fs/jbd2/journal.c | 10 ++-- + fs/jbd2/transaction.c | 100 ++++++++++++++++++++----------------------- + fs/ocfs2/suballoc.c | 19 ++++---- + include/linux/jbd2.h | 20 -------- + include/linux/journal-head.h | 21 ++++++--- + 6 files changed, 84 insertions(+), 94 deletions(-) + +--- a/fs/jbd2/commit.c ++++ b/fs/jbd2/commit.c +@@ -482,10 +482,10 @@ void jbd2_journal_commit_transaction(jou + if (jh->b_committed_data) { + struct buffer_head *bh = jh2bh(jh); + +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + jbd2_free(jh->b_committed_data, bh->b_size); + jh->b_committed_data = NULL; +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + } + jbd2_journal_refile_buffer(journal, jh); + } +@@ -930,7 +930,7 @@ void jbd2_journal_commit_transaction(jou + * done with it. 
+ */ + get_bh(bh); +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); + + /* +@@ -1030,7 +1030,7 @@ void jbd2_journal_commit_transaction(jou + } + JBUFFER_TRACE(jh, "refile or unfile buffer"); + drop_ref = __jbd2_journal_refile_buffer(jh); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + if (drop_ref) + jbd2_journal_put_journal_head(jh); + if (try_to_free) +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -363,7 +363,7 @@ int jbd2_journal_write_metadata_buffer(t + /* keep subsequent assertions sane */ + atomic_set(&new_bh->b_count, 1); + +- jbd_lock_bh_state(bh_in); ++ spin_lock(&jh_in->b_state_lock); + repeat: + /* + * If a new transaction has already done a buffer copy-out, then +@@ -405,13 +405,13 @@ int jbd2_journal_write_metadata_buffer(t + if (need_copy_out && !done_copy_out) { + char *tmp; + +- jbd_unlock_bh_state(bh_in); ++ spin_unlock(&jh_in->b_state_lock); + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); + if (!tmp) { + brelse(new_bh); + return -ENOMEM; + } +- jbd_lock_bh_state(bh_in); ++ spin_lock(&jh_in->b_state_lock); + if (jh_in->b_frozen_data) { + jbd2_free(tmp, bh_in->b_size); + goto repeat; +@@ -464,7 +464,7 @@ int jbd2_journal_write_metadata_buffer(t + __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + set_buffer_shadow(bh_in); +- jbd_unlock_bh_state(bh_in); ++ spin_unlock(&jh_in->b_state_lock); + + return do_escape | (done_copy_out << 1); + } +@@ -2407,6 +2407,8 @@ static struct journal_head *journal_allo + ret = kmem_cache_zalloc(jbd2_journal_head_cache, + GFP_NOFS | __GFP_NOFAIL); + } ++ if (ret) ++ spin_lock_init(&ret->b_state_lock); + return ret; + } + +--- a/fs/jbd2/transaction.c ++++ b/fs/jbd2/transaction.c +@@ -877,7 +877,7 @@ do_get_write_access(handle_t *handle, st + + start_lock = jiffies; + lock_buffer(bh); +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + + /* If it takes too long to lock the buffer, trace it */ + time_lock = jbd2_time_diff(start_lock, jiffies); +@@ -927,7 +927,7 @@ do_get_write_access(handle_t *handle, st + + error = -EROFS; + if (is_handle_aborted(handle)) { +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + goto out; + } + error = 0; +@@ -991,7 +991,7 @@ do_get_write_access(handle_t *handle, st + */ + if (buffer_shadow(bh)) { + JBUFFER_TRACE(jh, "on shadow: sleep"); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); + goto repeat; + } +@@ -1012,7 +1012,7 @@ do_get_write_access(handle_t *handle, st + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS | __GFP_NOFAIL); + goto repeat; +@@ -1031,7 +1031,7 @@ do_get_write_access(handle_t *handle, st + jh->b_next_transaction = transaction; + + done: +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + + /* + * If we are about to journal a buffer, then any revoke pending on it is +@@ -1173,7 +1173,7 @@ int jbd2_journal_get_create_access(handl + * that case: the transaction must have deleted the buffer for it to be + * reused here. 
+ */ +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + J_ASSERT_JH(jh, (jh->b_transaction == transaction || + jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && +@@ -1208,7 +1208,7 @@ int jbd2_journal_get_create_access(handl + jh->b_next_transaction = transaction; + spin_unlock(&journal->j_list_lock); + } +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + + /* + * akpm: I added this. ext3_alloc_branch can pick up new indirect +@@ -1279,13 +1279,13 @@ int jbd2_journal_get_undo_access(handle_ + committed_data = jbd2_alloc(jh2bh(jh)->b_size, + GFP_NOFS|__GFP_NOFAIL); + +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. */ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + goto repeat; + } + +@@ -1293,7 +1293,7 @@ int jbd2_journal_get_undo_access(handle_ + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + out: + jbd2_journal_put_journal_head(jh); + if (unlikely(committed_data)) +@@ -1394,16 +1394,16 @@ int jbd2_journal_dirty_metadata(handle_t + */ + if (jh->b_transaction != transaction && + jh->b_next_transaction != transaction) { +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_next_transaction == transaction); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + } + if (jh->b_modified == 1) { + /* If it's in our transaction it must be in BJ_Metadata list. */ + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) { +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + if (jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata) + pr_err("JBD2: assertion failure: h_type=%u " +@@ -1413,13 +1413,13 @@ int jbd2_journal_dirty_metadata(handle_t + jh->b_jlist); + J_ASSERT_JH(jh, jh->b_transaction != transaction || + jh->b_jlist == BJ_Metadata); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + } + goto out; + } + + journal = transaction->t_journal; +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + + if (jh->b_modified == 0) { + /* +@@ -1505,7 +1505,7 @@ int jbd2_journal_dirty_metadata(handle_t + __jbd2_journal_file_buffer(jh, transaction, BJ_Metadata); + spin_unlock(&journal->j_list_lock); + out_unlock_bh: +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + out: + JBUFFER_TRACE(jh, "exit"); + return ret; +@@ -1543,11 +1543,13 @@ int jbd2_journal_forget (handle_t *handl + + BUFFER_TRACE(bh, "entry"); + +- jbd_lock_bh_state(bh); ++ jh = jbd2_journal_grab_journal_head(bh); ++ if (!jh) { ++ __bforget(bh); ++ return 0; ++ } + +- if (!buffer_jbd(bh)) +- goto not_jbd; +- jh = bh2jh(bh); ++ spin_lock(&jh->b_state_lock); + + /* Critical error: attempting to delete a bitmap buffer, maybe? + * Don't do any jbd operations, and return an error. 
*/ +@@ -1668,18 +1670,14 @@ int jbd2_journal_forget (handle_t *handl + spin_unlock(&journal->j_list_lock); + } + drop: +- jbd_unlock_bh_state(bh); + __brelse(bh); ++ spin_unlock(&jh->b_state_lock); ++ jbd2_journal_put_journal_head(jh); + if (drop_reserve) { + /* no need to reserve log space for this block -bzzz */ + handle->h_buffer_credits++; + } + return err; +- +-not_jbd: +- jbd_unlock_bh_state(bh); +- __bforget(bh); +- goto drop; + } + + /** +@@ -1878,7 +1876,7 @@ int jbd2_journal_stop(handle_t *handle) + * + * j_list_lock is held. + * +- * jbd_lock_bh_state(jh2bh(jh)) is held. ++ * jh->b_state_lock is held. + */ + + static inline void +@@ -1902,7 +1900,7 @@ static inline void + * + * Called with j_list_lock held, and the journal may not be locked. + * +- * jbd_lock_bh_state(jh2bh(jh)) is held. ++ * jh->b_state_lock is held. + */ + + static inline void +@@ -1934,7 +1932,7 @@ static void __jbd2_journal_temp_unlink_b + transaction_t *transaction; + struct buffer_head *bh = jh2bh(jh); + +- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); ++ lockdep_assert_held(&jh->b_state_lock); + transaction = jh->b_transaction; + if (transaction) + assert_spin_locked(&transaction->t_journal->j_list_lock); +@@ -1988,11 +1986,11 @@ void jbd2_journal_unfile_buffer(journal_ + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + spin_lock(&journal->j_list_lock); + __jbd2_journal_unfile_buffer(jh); + spin_unlock(&journal->j_list_lock); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); + __brelse(bh); + } +@@ -2000,7 +1998,7 @@ void jbd2_journal_unfile_buffer(journal_ + /* + * Called from jbd2_journal_try_to_free_buffers(). + * +- * Called under jbd_lock_bh_state(bh) ++ * Called under jh->b_state_lock + */ + static void + __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) +@@ -2087,10 +2085,10 @@ int jbd2_journal_try_to_free_buffers(jou + if (!jh) + continue; + +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + __journal_try_to_free_buffer(journal, bh); ++ spin_unlock(&jh->b_state_lock); + jbd2_journal_put_journal_head(jh); +- jbd_unlock_bh_state(bh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); +@@ -2111,7 +2109,7 @@ int jbd2_journal_try_to_free_buffers(jou + * + * Called under j_list_lock. + * +- * Called under jbd_lock_bh_state(bh). ++ * Called under jh->b_state_lock. + */ + static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) + { +@@ -2205,7 +2203,7 @@ static int journal_unmap_buffer(journal_ + + /* OK, we have data buffer in journaled mode */ + write_lock(&journal->j_state_lock); +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + spin_lock(&journal->j_list_lock); + + /* +@@ -2286,10 +2284,10 @@ static int journal_unmap_buffer(journal_ + * for commit and try again. 
+ */ + if (partial_page) { +- jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + write_unlock(&journal->j_state_lock); ++ jbd2_journal_put_journal_head(jh); + return -EBUSY; + } + /* +@@ -2303,10 +2301,10 @@ static int journal_unmap_buffer(journal_ + if (journal->j_running_transaction && buffer_jbddirty(bh)) + jh->b_next_transaction = journal->j_running_transaction; + jh->b_modified = 0; +- jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + write_unlock(&journal->j_state_lock); ++ jbd2_journal_put_journal_head(jh); + return 0; + } else { + /* Good, the buffer belongs to the running transaction. +@@ -2330,10 +2328,10 @@ static int journal_unmap_buffer(journal_ + * here. + */ + jh->b_modified = 0; +- jbd2_journal_put_journal_head(jh); + spin_unlock(&journal->j_list_lock); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + write_unlock(&journal->j_state_lock); ++ jbd2_journal_put_journal_head(jh); + zap_buffer_unlocked: + clear_buffer_dirty(bh); + J_ASSERT_BH(bh, !buffer_jbddirty(bh)); +@@ -2420,7 +2418,7 @@ void __jbd2_journal_file_buffer(struct j + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + +- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); ++ lockdep_assert_held(&jh->b_state_lock); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); +@@ -2482,11 +2480,11 @@ void __jbd2_journal_file_buffer(struct j + void jbd2_journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) + { +- jbd_lock_bh_state(jh2bh(jh)); ++ spin_lock(&jh->b_state_lock); + spin_lock(&transaction->t_journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); +- jbd_unlock_bh_state(jh2bh(jh)); ++ spin_unlock(&jh->b_state_lock); + } + + /* +@@ -2496,7 +2494,7 @@ void jbd2_journal_file_buffer(struct jou + * buffer on that transaction's metadata list. 
+ * + * Called under j_list_lock +- * Called under jbd_lock_bh_state(jh2bh(jh)) ++ * Called under jh->b_state_lock + * + * When this function returns true, there's no next transaction to refile to + * and the caller has to drop jh reference through +@@ -2507,7 +2505,7 @@ bool __jbd2_journal_refile_buffer(struct + int was_dirty, jlist; + struct buffer_head *bh = jh2bh(jh); + +- J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); ++ lockdep_assert_held(&jh->b_state_lock); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + +@@ -2553,17 +2551,13 @@ bool __jbd2_journal_refile_buffer(struct + */ + void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) + { +- struct buffer_head *bh = jh2bh(jh); + bool drop; + +- /* Get reference so that buffer cannot be freed before we unlock it */ +- get_bh(bh); +- jbd_lock_bh_state(bh); ++ spin_lock(&jh->b_state_lock); + spin_lock(&journal->j_list_lock); + drop = __jbd2_journal_refile_buffer(jh); +- jbd_unlock_bh_state(bh); ++ spin_unlock(&jh->b_state_lock); + spin_unlock(&journal->j_list_lock); +- __brelse(bh); + if (drop) + jbd2_journal_put_journal_head(jh); + } +--- a/fs/ocfs2/suballoc.c ++++ b/fs/ocfs2/suballoc.c +@@ -1252,6 +1252,7 @@ static int ocfs2_test_bg_bit_allocatable + int nr) + { + struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; ++ struct journal_head *jh; + int ret; + + if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) +@@ -1260,13 +1261,14 @@ static int ocfs2_test_bg_bit_allocatable + if (!buffer_jbd(bg_bh)) + return 1; + +- jbd_lock_bh_state(bg_bh); +- bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data; ++ jh = bh2jh(bg_bh); ++ spin_lock(&jh->b_state_lock); ++ bg = (struct ocfs2_group_desc *) jh->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; +- jbd_unlock_bh_state(bg_bh); ++ spin_unlock(&jh->b_state_lock); + + return ret; + } +@@ -2387,6 +2389,7 @@ static int ocfs2_block_group_clear_bits( + int status; + unsigned int tmp; + struct ocfs2_group_desc *undo_bg = NULL; ++ struct journal_head *jh; + + /* The caller got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ +@@ -2405,10 +2408,10 @@ static int ocfs2_block_group_clear_bits( + goto bail; + } + ++ jh = bh2jh(group_bh); + if (undo_fn) { +- jbd_lock_bh_state(group_bh); +- undo_bg = (struct ocfs2_group_desc *) +- bh2jh(group_bh)->b_committed_data; ++ spin_lock(&jh->b_state_lock); ++ undo_bg = (struct ocfs2_group_desc *) jh->b_committed_data; + BUG_ON(!undo_bg); + } + +@@ -2423,7 +2426,7 @@ static int ocfs2_block_group_clear_bits( + le16_add_cpu(&bg->bg_free_bits_count, num_bits); + if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + if (undo_fn) +- jbd_unlock_bh_state(group_bh); ++ spin_unlock(&jh->b_state_lock); + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. 
num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), +@@ -2432,7 +2435,7 @@ static int ocfs2_block_group_clear_bits( + } + + if (undo_fn) +- jbd_unlock_bh_state(group_bh); ++ spin_unlock(&jh->b_state_lock); + + ocfs2_journal_dirty(handle, group_bh); + bail: +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -313,7 +313,6 @@ enum jbd_state_bits { + BH_Revoked, /* Has been revoked from the log */ + BH_RevokeValid, /* Revoked flag is valid */ + BH_JBDDirty, /* Is dirty but journaled */ +- BH_State, /* Pins most journal_head state */ + BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ + BH_Shadow, /* IO on shadow buffer is running */ + BH_Verified, /* Metadata block has been verified ok */ +@@ -342,21 +341,6 @@ static inline struct journal_head *bh2jh + return bh->b_private; + } + +-static inline void jbd_lock_bh_state(struct buffer_head *bh) +-{ +- bit_spin_lock(BH_State, &bh->b_state); +-} +- +-static inline int jbd_is_locked_bh_state(struct buffer_head *bh) +-{ +- return bit_spin_is_locked(BH_State, &bh->b_state); +-} +- +-static inline void jbd_unlock_bh_state(struct buffer_head *bh) +-{ +- bit_spin_unlock(BH_State, &bh->b_state); +-} +- + static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) + { + bit_spin_lock(BH_JournalHead, &bh->b_state); +@@ -551,9 +535,9 @@ struct transaction_chp_stats_s { + * ->jbd_lock_bh_journal_head() (This is "innermost") + * + * j_state_lock +- * ->jbd_lock_bh_state() ++ * ->b_state_lock + * +- * jbd_lock_bh_state() ++ * b_state_lock + * ->j_list_lock + * + * j_state_lock +--- a/include/linux/journal-head.h ++++ b/include/linux/journal-head.h +@@ -11,6 +11,8 @@ + #ifndef JOURNAL_HEAD_H_INCLUDED + #define JOURNAL_HEAD_H_INCLUDED + ++#include ++ + typedef unsigned int tid_t; /* Unique transaction ID */ + typedef struct transaction_s transaction_t; /* Compound transaction type */ + +@@ -24,13 +26,18 @@ struct journal_head { + struct buffer_head *b_bh; + + /* ++ * Protect the buffer head state ++ */ ++ spinlock_t b_state_lock; ++ ++ /* + * Reference count - see description in journal.c + * [jbd_lock_bh_journal_head()] + */ + int b_jcount; + + /* +- * Journalling list for this buffer [jbd_lock_bh_state()] ++ * Journalling list for this buffer [b_state_lock] + * NOTE: We *cannot* combine this with b_modified into a bitfield + * as gcc would then (which the C standard allows but which is + * very unuseful) make 64-bit accesses to the bitfield and clobber +@@ -41,20 +48,20 @@ struct journal_head { + /* + * This flag signals the buffer has been modified by + * the currently running transaction +- * [jbd_lock_bh_state()] ++ * [b_state_lock] + */ + unsigned b_modified; + + /* + * Copy of the buffer data frozen for writing to the log. +- * [jbd_lock_bh_state()] ++ * [b_state_lock] + */ + char *b_frozen_data; + + /* + * Pointer to a saved copy of the buffer containing no uncommitted + * deallocation references, so that allocations can avoid overwriting +- * uncommitted deletes. [jbd_lock_bh_state()] ++ * uncommitted deletes. [b_state_lock] + */ + char *b_committed_data; + +@@ -63,7 +70,7 @@ struct journal_head { + * metadata: either the running transaction or the committing + * transaction (if there is one). Only applies to buffers on a + * transaction's data or metadata journaling list. +- * [j_list_lock] [jbd_lock_bh_state()] ++ * [j_list_lock] [b_state_lock] + * Either of these locks is enough for reading, both are needed for + * changes. 
+ */ +@@ -73,13 +80,13 @@ struct journal_head { + * Pointer to the running compound transaction which is currently + * modifying the buffer's metadata, if there was already a transaction + * committing it when the new transaction touched it. +- * [t_list_lock] [jbd_lock_bh_state()] ++ * [t_list_lock] [b_state_lock] + */ + transaction_t *b_next_transaction; + + /* + * Doubly-linked list of buffers on a transaction's data, metadata or +- * forget queue. [t_list_lock] [jbd_lock_bh_state()] ++ * forget queue. [t_list_lock] [b_state_lock] + */ + struct journal_head *b_tnext, *b_tprev; + diff --git a/kernel/patches-5.4.x-rt/0008-0007-jbd2-Free-journal-head-outside-of-locked-region.patch b/kernel/patches-5.4.x-rt/0008-0007-jbd2-Free-journal-head-outside-of-locked-region.patch new file mode 100644 index 000000000..e58eaf8c6 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0008-0007-jbd2-Free-journal-head-outside-of-locked-region.patch @@ -0,0 +1,88 @@ +From: Thomas Gleixner +Date: Fri, 9 Aug 2019 14:42:33 +0200 +Subject: [PATCH 7/7] jbd2: Free journal head outside of locked region + +On PREEMPT_RT bit-spinlocks have the same semantics as on PREEMPT_RT=n, +i.e. they disable preemption. That means functions which are not safe to be +called in preempt disabled context on RT trigger a might_sleep() assert. + +The journal head bit spinlock is mostly held for short code sequences with +trivial RT safe functionality, except for one place: + +jbd2_journal_put_journal_head() invokes __journal_remove_journal_head() +with the journal head bit spinlock held. __journal_remove_journal_head() +invokes kmem_cache_free() which must not be called with preemption disabled +on RT. + +Jan suggested to rework the removal function so the actual free happens +outside the bit-spinlocked region. + +Split it into two parts: + + - Do the sanity checks and the buffer head detach under the lock + + - Do the actual free after dropping the lock + +There is error case handling in the free part which needs to dereference +the b_size field of the now detached buffer head. Due to paranoia (caused +by ignorance) the size is retrieved in the detach function and handed into +the free function. Might be over-engineered, but better safe than sorry. + +This makes the journal head bit-spinlock usage RT compliant and also avoids +nested locking which is not covered by lockdep. 
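Condensed from the hunk below, the resulting flow of jbd2_journal_put_journal_head() looks roughly like this (a readability sketch, not additional code): only the unlink work remains under the bit spinlock, and the freeing, which may end up in kmem_cache_free(), happens after the lock is dropped:

    jbd_lock_bh_journal_head(bh);
    --jh->b_jcount;
    if (!jh->b_jcount) {
            __journal_remove_journal_head(bh);            /* detach bh <-> jh under the lock */
            jbd_unlock_bh_journal_head(bh);
            journal_release_journal_head(jh, bh->b_size); /* free outside the locked region */
            __brelse(bh);
    } else {
            jbd_unlock_bh_journal_head(bh);
    }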
+ +Suggested-by: Jan Kara +Signed-off-by: Thomas Gleixner +Cc: linux-ext4@vger.kernel.org +Cc: "Theodore Ts'o" +Cc: Jan Kara +Signed-off-by: Jan Kara +Signed-off-by: Sebastian Andrzej Siewior +--- + fs/jbd2/journal.c | 20 ++++++++++++++------ + 1 file changed, 14 insertions(+), 6 deletions(-) + +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -2528,17 +2528,23 @@ static void __journal_remove_journal_hea + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); ++ ++ /* Unlink before dropping the lock */ ++ bh->b_private = NULL; ++ jh->b_bh = NULL; /* debug, really */ ++ clear_buffer_jbd(bh); ++} ++ ++static void journal_release_journal_head(struct journal_head *jh, size_t b_size) ++{ + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); +- jbd2_free(jh->b_frozen_data, bh->b_size); ++ jbd2_free(jh->b_frozen_data, b_size); + } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); +- jbd2_free(jh->b_committed_data, bh->b_size); ++ jbd2_free(jh->b_committed_data, b_size); + } +- bh->b_private = NULL; +- jh->b_bh = NULL; /* debug, really */ +- clear_buffer_jbd(bh); + journal_free_journal_head(jh); + } + +@@ -2556,9 +2562,11 @@ void jbd2_journal_put_journal_head(struc + if (!jh->b_jcount) { + __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); ++ journal_release_journal_head(jh, bh->b_size); + __brelse(bh); +- } else ++ } else { + jbd_unlock_bh_journal_head(bh); ++ } + } + + /* diff --git a/kernel/patches-5.4.x-rt/0009-x86-ioapic-Rename-misnamed-functions.patch b/kernel/patches-5.4.x-rt/0009-x86-ioapic-Rename-misnamed-functions.patch new file mode 100644 index 000000000..547e5a889 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0009-x86-ioapic-Rename-misnamed-functions.patch @@ -0,0 +1,86 @@ +From: Thomas Gleixner +Date: Thu, 17 Oct 2019 12:19:02 +0200 +Subject: [PATCH] x86/ioapic: Rename misnamed functions + +ioapic_irqd_[un]mask() are misnomers as both functions do way more than +masking and unmasking the interrupt line. Both deal with the moving the +affinity of the interrupt within interrupt context. The mask/unmask is just +a tiny part of the functionality. + +Rename them to ioapic_prepare/finish_move(), fixup the call sites and +rename the related variables in the code to reflect what this is about. + +No functional change. + +Signed-off-by: Thomas Gleixner +Cc: Andy Shevchenko +Cc: Linus Torvalds +Cc: Peter Zijlstra +Cc: Sebastian Siewior +Link: https://lkml.kernel.org/r/20191017101938.412489856@linutronix.de +Signed-off-by: Ingo Molnar +Signed-off-by: Sebastian Andrzej Siewior +--- + arch/x86/kernel/apic/io_apic.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/arch/x86/kernel/apic/io_apic.c ++++ b/arch/x86/kernel/apic/io_apic.c +@@ -1725,7 +1725,7 @@ static bool io_apic_level_ack_pending(st + return false; + } + +-static inline bool ioapic_irqd_mask(struct irq_data *data) ++static inline bool ioapic_prepare_move(struct irq_data *data) + { + /* If we are moving the IRQ we need to mask it */ + if (unlikely(irqd_is_setaffinity_pending(data))) { +@@ -1736,9 +1736,9 @@ static inline bool ioapic_irqd_mask(stru + return false; + } + +-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) ++static inline void ioapic_finish_move(struct irq_data *data, bool moveit) + { +- if (unlikely(masked)) { ++ if (unlikely(moveit)) { + /* Only migrate the irq if the ack has been received. 
+ * + * On rare occasions the broadcast level triggered ack gets +@@ -1773,11 +1773,11 @@ static inline void ioapic_irqd_unmask(st + } + } + #else +-static inline bool ioapic_irqd_mask(struct irq_data *data) ++static inline bool ioapic_prepare_move(struct irq_data *data) + { + return false; + } +-static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) ++static inline void ioapic_finish_move(struct irq_data *data, bool moveit) + { + } + #endif +@@ -1786,11 +1786,11 @@ static void ioapic_ack_level(struct irq_ + { + struct irq_cfg *cfg = irqd_cfg(irq_data); + unsigned long v; +- bool masked; ++ bool moveit; + int i; + + irq_complete_move(cfg); +- masked = ioapic_irqd_mask(irq_data); ++ moveit = ioapic_prepare_move(irq_data); + + /* + * It appears there is an erratum which affects at least version 0x11 +@@ -1845,7 +1845,7 @@ static void ioapic_ack_level(struct irq_ + eoi_ioapic_pin(cfg->vector, irq_data->chip_data); + } + +- ioapic_irqd_unmask(irq_data, masked); ++ ioapic_finish_move(irq_data, moveit); + } + + static void ioapic_ir_ack_level(struct irq_data *irq_data) diff --git a/kernel/patches-5.4.x-rt/0010-percpu-refcount-use-normal-instead-of-RCU-sched.patch b/kernel/patches-5.4.x-rt/0010-percpu-refcount-use-normal-instead-of-RCU-sched.patch new file mode 100644 index 000000000..03f68eae8 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0010-percpu-refcount-use-normal-instead-of-RCU-sched.patch @@ -0,0 +1,100 @@ +From: Sebastian Andrzej Siewior +Date: Wed, 4 Sep 2019 17:59:36 +0200 +Subject: [PATCH] percpu-refcount: use normal instead of RCU-sched" +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is a revert of commit + a4244454df129 ("percpu-refcount: use RCU-sched insted of normal RCU") + +which claims the only reason for using RCU-sched is + "rcu_read_[un]lock() … are slightly more expensive than preempt_disable/enable()" + +and + "As the RCU critical sections are extremely short, using sched-RCU + shouldn't have any latency implications." + +The problem with RCU-sched is that it disables preemption and the +callback must not acquire any sleeping locks like spinlock_t on +PREEMPT_RT which is the case. + +Convert back to normal RCU. 
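To illustrate the point (a sketch condensed from percpu_ref_put_many() below, not part of the patch): rcu_read_lock_sched() implies preempt_disable(), so a release callback that takes a spinlock_t (a sleeping lock on PREEMPT_RT) is not allowed inside it, whereas the plain RCU read side stays preemptible:

    rcu_read_lock_sched();                  /* preemption disabled */
    if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
            ref->release(ref);              /* release() taking spinlock_t: invalid on RT */
    rcu_read_unlock_sched();

    rcu_read_lock();                        /* read side stays preemptible on RT */
    if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
            ref->release(ref);              /* sleeping locks are fine here */
    rcu_read_unlock();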
+ +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/percpu-refcount.h | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/include/linux/percpu-refcount.h ++++ b/include/linux/percpu-refcount.h +@@ -186,14 +186,14 @@ static inline void percpu_ref_get_many(s + { + unsigned long __percpu *percpu_count; + +- rcu_read_lock_sched(); ++ rcu_read_lock(); + + if (__ref_is_percpu(ref, &percpu_count)) + this_cpu_add(*percpu_count, nr); + else + atomic_long_add(nr, &ref->count); + +- rcu_read_unlock_sched(); ++ rcu_read_unlock(); + } + + /** +@@ -223,7 +223,7 @@ static inline bool percpu_ref_tryget(str + unsigned long __percpu *percpu_count; + bool ret; + +- rcu_read_lock_sched(); ++ rcu_read_lock(); + + if (__ref_is_percpu(ref, &percpu_count)) { + this_cpu_inc(*percpu_count); +@@ -232,7 +232,7 @@ static inline bool percpu_ref_tryget(str + ret = atomic_long_inc_not_zero(&ref->count); + } + +- rcu_read_unlock_sched(); ++ rcu_read_unlock(); + + return ret; + } +@@ -257,7 +257,7 @@ static inline bool percpu_ref_tryget_liv + unsigned long __percpu *percpu_count; + bool ret = false; + +- rcu_read_lock_sched(); ++ rcu_read_lock(); + + if (__ref_is_percpu(ref, &percpu_count)) { + this_cpu_inc(*percpu_count); +@@ -266,7 +266,7 @@ static inline bool percpu_ref_tryget_liv + ret = atomic_long_inc_not_zero(&ref->count); + } + +- rcu_read_unlock_sched(); ++ rcu_read_unlock(); + + return ret; + } +@@ -285,14 +285,14 @@ static inline void percpu_ref_put_many(s + { + unsigned long __percpu *percpu_count; + +- rcu_read_lock_sched(); ++ rcu_read_lock(); + + if (__ref_is_percpu(ref, &percpu_count)) + this_cpu_sub(*percpu_count, nr); + else if (unlikely(atomic_long_sub_and_test(nr, &ref->count))) + ref->release(ref); + +- rcu_read_unlock_sched(); ++ rcu_read_unlock(); + } + + /** diff --git a/kernel/patches-5.4.x-rt/0011-drm-i915-Don-t-disable-interrupts-independently-of-t.patch b/kernel/patches-5.4.x-rt/0011-drm-i915-Don-t-disable-interrupts-independently-of-t.patch new file mode 100644 index 000000000..3831669a5 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0011-drm-i915-Don-t-disable-interrupts-independently-of-t.patch @@ -0,0 +1,70 @@ +From: Sebastian Andrzej Siewior +Date: Wed, 10 Apr 2019 11:01:37 +0200 +Subject: [PATCH] drm/i915: Don't disable interrupts independently of the + lock + +The locks (active.lock and rq->lock) need to be taken with disabled +interrupts. This is done in i915_request_retire() by disabling the +interrupts independently of the locks itself. +While local_irq_disable()+spin_lock() equals spin_lock_irq() on vanilla +it does not on PREEMPT_RT. +Chris Wilson confirmed that local_irq_disable() was just introduced as +an optimisation to avoid enabling/disabling interrupts during +lock/unlock combo. + +Enable/disable interrupts as part of the locking instruction. + +Cc: Chris Wilson +Signed-off-by: Sebastian Andrzej Siewior +--- + drivers/gpu/drm/i915/i915_request.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/drivers/gpu/drm/i915/i915_request.c ++++ b/drivers/gpu/drm/i915/i915_request.c +@@ -205,14 +205,14 @@ static void remove_from_engine(struct i9 + * check that the rq still belongs to the newly locked engine. 
+ */ + locked = READ_ONCE(rq->engine); +- spin_lock(&locked->active.lock); ++ spin_lock_irq(&locked->active.lock); + while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) { + spin_unlock(&locked->active.lock); + spin_lock(&engine->active.lock); + locked = engine; + } + list_del(&rq->sched.link); +- spin_unlock(&locked->active.lock); ++ spin_unlock_irq(&locked->active.lock); + } + + static bool i915_request_retire(struct i915_request *rq) +@@ -272,8 +272,6 @@ static bool i915_request_retire(struct i + active->retire(active, rq); + } + +- local_irq_disable(); +- + /* + * We only loosely track inflight requests across preemption, + * and so we may find ourselves attempting to retire a _completed_ +@@ -282,7 +280,7 @@ static bool i915_request_retire(struct i + */ + remove_from_engine(rq); + +- spin_lock(&rq->lock); ++ spin_lock_irq(&rq->lock); + i915_request_mark_complete(rq); + if (!i915_request_signaled(rq)) + dma_fence_signal_locked(&rq->fence); +@@ -297,9 +295,7 @@ static bool i915_request_retire(struct i + __notify_execute_cb(rq); + } + GEM_BUG_ON(!list_empty(&rq->execute_cb)); +- spin_unlock(&rq->lock); +- +- local_irq_enable(); ++ spin_unlock_irq(&rq->lock); + + remove_from_client(rq); + list_del(&rq->link); diff --git a/kernel/patches-5.4.x-rt/0012-block-Don-t-disable-interrupts-in-trigger_softirq.patch b/kernel/patches-5.4.x-rt/0012-block-Don-t-disable-interrupts-in-trigger_softirq.patch new file mode 100644 index 000000000..6be945444 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0012-block-Don-t-disable-interrupts-in-trigger_softirq.patch @@ -0,0 +1,35 @@ +From: Sebastian Andrzej Siewior +Date: Fri, 15 Nov 2019 21:37:22 +0100 +Subject: [PATCH] block: Don't disable interrupts in trigger_softirq() + +trigger_softirq() is always invoked as a SMP-function call which is +always invoked with disables interrupts. + +Don't disable interrupt in trigger_softirq() because interrupts are +already disabled. + +Signed-off-by: Sebastian Andrzej Siewior +--- + block/blk-softirq.c | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/block/blk-softirq.c ++++ b/block/blk-softirq.c +@@ -42,17 +42,13 @@ static __latent_entropy void blk_done_so + static void trigger_softirq(void *data) + { + struct request *rq = data; +- unsigned long flags; + struct list_head *list; + +- local_irq_save(flags); + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&rq->ipi_list, list); + + if (list->next == &rq->ipi_list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); +- +- local_irq_restore(flags); + } + + /* diff --git a/kernel/patches-5.4.x-rt/0013-arm64-KVM-compute_layout-before-altenates-are-applie.patch b/kernel/patches-5.4.x-rt/0013-arm64-KVM-compute_layout-before-altenates-are-applie.patch new file mode 100644 index 000000000..3e0b433de --- /dev/null +++ b/kernel/patches-5.4.x-rt/0013-arm64-KVM-compute_layout-before-altenates-are-applie.patch @@ -0,0 +1,89 @@ +From: Sebastian Andrzej Siewior +Date: Thu, 26 Jul 2018 09:13:42 +0200 +Subject: [PATCH] arm64: KVM: Invoke compute_layout() before alternatives are + applied + +compute_layout() is invoked as part of an alternative fixup under +stop_machine(). This function invokes get_random_long() which acquires a +sleeping lock on -RT which can not be acquired in this context. + +Rename compute_layout() to kvm_compute_layout() and invoke it before +stop_machine() applies the alternatives. Add a __init prefix to +kvm_compute_layout() because the caller has it, too (and so the code can be +discarded after boot). 
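The resulting ordering, assuming the mainline 5.4 smp_cpus_done() call sequence, is roughly as follows (illustration only, not part of the patch):

    /* arch/arm64/kernel/smp.c, smp_cpus_done(), condensed */
    hyp_mode_check();            /* preemptible: kvm_compute_layout() may call
                                  * get_random_long() and take sleeping locks */
    apply_alternatives_all();    /* stop_machine() based patching: sleeping
                                  * locks are not allowed from here on */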
+ +Signed-off-by: Sebastian Andrzej Siewior +--- + arch/arm64/include/asm/kvm_mmu.h | 1 + + arch/arm64/kernel/smp.c | 4 ++++ + arch/arm64/kvm/va_layout.c | 8 +------- + 3 files changed, 6 insertions(+), 7 deletions(-) + +--- a/arch/arm64/include/asm/kvm_mmu.h ++++ b/arch/arm64/include/asm/kvm_mmu.h +@@ -91,6 +91,7 @@ alternative_cb_end + + void kvm_update_va_mask(struct alt_instr *alt, + __le32 *origptr, __le32 *updptr, int nr_inst); ++void kvm_compute_layout(void); + + static inline unsigned long __kern_hyp_va(unsigned long v) + { +--- a/arch/arm64/kernel/smp.c ++++ b/arch/arm64/kernel/smp.c +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -39,6 +40,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -408,6 +410,8 @@ static void __init hyp_mode_check(void) + "CPU: CPUs started in inconsistent modes"); + else + pr_info("CPU: All CPU(s) started at EL1\n"); ++ if (IS_ENABLED(CONFIG_KVM_ARM_HOST)) ++ kvm_compute_layout(); + } + + void __init smp_cpus_done(unsigned int max_cpus) +--- a/arch/arm64/kvm/va_layout.c ++++ b/arch/arm64/kvm/va_layout.c +@@ -22,7 +22,7 @@ static u8 tag_lsb; + static u64 tag_val; + static u64 va_mask; + +-static void compute_layout(void) ++__init void kvm_compute_layout(void) + { + phys_addr_t idmap_addr = __pa_symbol(__hyp_idmap_text_start); + u64 hyp_va_msb; +@@ -110,9 +110,6 @@ void __init kvm_update_va_mask(struct al + + BUG_ON(nr_inst != 5); + +- if (!has_vhe() && !va_mask) +- compute_layout(); +- + for (i = 0; i < nr_inst; i++) { + u32 rd, rn, insn, oinsn; + +@@ -156,9 +153,6 @@ void kvm_patch_vector_branch(struct alt_ + return; + } + +- if (!va_mask) +- compute_layout(); +- + /* + * Compute HYP VA by using the same computation as kern_hyp_va() + */ diff --git a/kernel/patches-5.4.x-rt/0014-net-sched-dev_deactivate_many-use-msleep-1-instead-o.patch b/kernel/patches-5.4.x-rt/0014-net-sched-dev_deactivate_many-use-msleep-1-instead-o.patch new file mode 100644 index 000000000..93e5ea78d --- /dev/null +++ b/kernel/patches-5.4.x-rt/0014-net-sched-dev_deactivate_many-use-msleep-1-instead-o.patch @@ -0,0 +1,57 @@ +From: Marc Kleine-Budde +Date: Wed, 5 Mar 2014 00:49:47 +0100 +Subject: net: sched: Use msleep() instead of yield() + +On PREEMPT_RT enabled systems the interrupt handler run as threads at prio 50 +(by default). If a high priority userspace process tries to shut down a busy +network interface it might spin in a yield loop waiting for the device to +become idle. With the interrupt thread having a lower priority than the +looping process it might never be scheduled and so result in a deadlock on UP +systems. 
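In code terms the livelock is simply the busy-wait in dev_deactivate_many(), condensed here for illustration (the one-line fix is in the hunk at the end of this patch):

    while (some_qdisc_is_busy(dev))
            yield();        /* a SCHED_FIFO caller above the irq thread's priority
                             * keeps running, so the irq thread never gets the CPU */

    while (some_qdisc_is_busy(dev))
            msleep(1);      /* sleeps, so the threaded interrupt handler can finish */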
+ +With Magic SysRq the following backtrace can be produced: + +> test_app R running 0 174 168 0x00000000 +> [] (__schedule+0x220/0x3fc) from [] (preempt_schedule_irq+0x48/0x80) +> [] (preempt_schedule_irq+0x48/0x80) from [] (svc_preempt+0x8/0x20) +> [] (svc_preempt+0x8/0x20) from [] (local_bh_enable+0x18/0x88) +> [] (local_bh_enable+0x18/0x88) from [] (dev_deactivate_many+0x220/0x264) +> [] (dev_deactivate_many+0x220/0x264) from [] (__dev_close_many+0x64/0xd4) +> [] (__dev_close_many+0x64/0xd4) from [] (__dev_close+0x28/0x3c) +> [] (__dev_close+0x28/0x3c) from [] (__dev_change_flags+0x88/0x130) +> [] (__dev_change_flags+0x88/0x130) from [] (dev_change_flags+0x10/0x48) +> [] (dev_change_flags+0x10/0x48) from [] (do_setlink+0x370/0x7ec) +> [] (do_setlink+0x370/0x7ec) from [] (rtnl_newlink+0x2b4/0x450) +> [] (rtnl_newlink+0x2b4/0x450) from [] (rtnetlink_rcv_msg+0x158/0x1f4) +> [] (rtnetlink_rcv_msg+0x158/0x1f4) from [] (netlink_rcv_skb+0xac/0xc0) +> [] (netlink_rcv_skb+0xac/0xc0) from [] (rtnetlink_rcv+0x18/0x24) +> [] (rtnetlink_rcv+0x18/0x24) from [] (netlink_unicast+0x13c/0x198) +> [] (netlink_unicast+0x13c/0x198) from [] (netlink_sendmsg+0x264/0x2e0) +> [] (netlink_sendmsg+0x264/0x2e0) from [] (sock_sendmsg+0x78/0x98) +> [] (sock_sendmsg+0x78/0x98) from [] (___sys_sendmsg.part.25+0x268/0x278) +> [] (___sys_sendmsg.part.25+0x268/0x278) from [] (__sys_sendmsg+0x48/0x78) +> [] (__sys_sendmsg+0x48/0x78) from [] (ret_fast_syscall+0x0/0x2c) + +This patch works around the problem by replacing yield() by msleep(1), giving +the interrupt thread time to finish, similar to other changes contained in the +rt patch set. Using wait_for_completion() instead would probably be a better +solution. + + +Signed-off-by: Marc Kleine-Budde +Signed-off-by: Sebastian Andrzej Siewior +--- + net/sched/sch_generic.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/sched/sch_generic.c ++++ b/net/sched/sch_generic.c +@@ -1215,7 +1215,7 @@ void dev_deactivate_many(struct list_hea + /* Wait for outstanding qdisc_run calls. */ + list_for_each_entry(dev, head, close_list) { + while (some_qdisc_is_busy(dev)) +- yield(); ++ msleep(1); + /* The new qdisc is assigned at this point so we can safely + * unwind stale skb lists and qdisc statistics + */ diff --git a/kernel/patches-5.4.x-rt/0015-mm-vmalloc-remove-preempt_disable-enable-when-doing-.patch b/kernel/patches-5.4.x-rt/0015-mm-vmalloc-remove-preempt_disable-enable-when-doing-.patch new file mode 100644 index 000000000..53d97653c --- /dev/null +++ b/kernel/patches-5.4.x-rt/0015-mm-vmalloc-remove-preempt_disable-enable-when-doing-.patch @@ -0,0 +1,105 @@ +From: "Uladzislau Rezki (Sony)" +Date: Sat, 30 Nov 2019 17:54:33 -0800 +Subject: [PATCH] mm/vmalloc: remove preempt_disable/enable when doing + preloading + +Some background. The preemption was disabled before to guarantee that a +preloaded object is available for a CPU, it was stored for. That was +achieved by combining the disabling the preemption and taking the spin +lock while the ne_fit_preload_node is checked. + +The aim was to not allocate in atomic context when spinlock is taken +later, for regular vmap allocations. But that approach conflicts with +CONFIG_PREEMPT_RT philosophy. It means that calling spin_lock() with +disabled preemption is forbidden in the CONFIG_PREEMPT_RT kernel. + +Therefore, get rid of preempt_disable() and preempt_enable() when the +preload is done for splitting purpose. 
As a result we do not guarantee +now that a CPU is preloaded, instead we minimize the case when it is +not, with this change, by populating the per cpu preload pointer under +the vmap_area_lock. + +This implies that at least each caller that has done the preallocation +will not fallback to an atomic allocation later. It is possible that +the preallocation would be pointless or that no preallocation is done +because of the race but the data shows that this is really rare. + +For example i run the special test case that follows the preload pattern +and path. 20 "unbind" threads run it and each does 1000000 allocations. +Only 3.5 times among 1000000 a CPU was not preloaded. So it can happen +but the number is negligible. + +[mhocko@suse.com: changelog additions] +Link: http://lkml.kernel.org/r/20191016095438.12391-1-urezki@gmail.com +Fixes: 82dd23e84be3 ("mm/vmalloc.c: preload a CPU with one object for split purpose") +Signed-off-by: Uladzislau Rezki (Sony) +Reviewed-by: Steven Rostedt (VMware) +Acked-by: Sebastian Andrzej Siewior +Acked-by: Daniel Wagner +Acked-by: Michal Hocko +Cc: Hillf Danton +Cc: Matthew Wilcox +Cc: Oleksiy Avramchenko +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sebastian Andrzej Siewior +--- + mm/vmalloc.c | 37 ++++++++++++++++++++----------------- + 1 file changed, 20 insertions(+), 17 deletions(-) + +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -1077,31 +1077,34 @@ static struct vmap_area *alloc_vmap_area + + retry: + /* +- * Preload this CPU with one extra vmap_area object to ensure +- * that we have it available when fit type of free area is +- * NE_FIT_TYPE. ++ * Preload this CPU with one extra vmap_area object. It is used ++ * when fit type of free area is NE_FIT_TYPE. Please note, it ++ * does not guarantee that an allocation occurs on a CPU that ++ * is preloaded, instead we minimize the case when it is not. ++ * It can happen because of cpu migration, because there is a ++ * race until the below spinlock is taken. + * + * The preload is done in non-atomic context, thus it allows us + * to use more permissive allocation masks to be more stable under +- * low memory condition and high memory pressure. ++ * low memory condition and high memory pressure. In rare case, ++ * if not preloaded, GFP_NOWAIT is used. + * +- * Even if it fails we do not really care about that. Just proceed +- * as it is. "overflow" path will refill the cache we allocate from. ++ * Set "pva" to NULL here, because of "retry" path. + */ +- preempt_disable(); +- if (!__this_cpu_read(ne_fit_preload_node)) { +- preempt_enable(); +- pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); +- preempt_disable(); ++ pva = NULL; + +- if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) { +- if (pva) +- kmem_cache_free(vmap_area_cachep, pva); +- } +- } ++ if (!this_cpu_read(ne_fit_preload_node)) ++ /* ++ * Even if it fails we do not really care about that. ++ * Just proceed as it is. If needed "overflow" path ++ * will refill the cache we allocate from. 
++ */ ++ pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); + + spin_lock(&vmap_area_lock); +- preempt_enable(); ++ ++ if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) ++ kmem_cache_free(vmap_area_cachep, pva); + + /* + * If an allocation fails, the "vend" address is diff --git a/kernel/patches-5.4.x-rt/0016-KVM-arm-arm64-Let-the-timer-expire-in-hardirq-contex.patch b/kernel/patches-5.4.x-rt/0016-KVM-arm-arm64-Let-the-timer-expire-in-hardirq-contex.patch new file mode 100644 index 000000000..bc84f1c3d --- /dev/null +++ b/kernel/patches-5.4.x-rt/0016-KVM-arm-arm64-Let-the-timer-expire-in-hardirq-contex.patch @@ -0,0 +1,46 @@ +From: Thomas Gleixner +Date: Tue, 13 Aug 2019 14:29:41 +0200 +Subject: [PATCH] KVM: arm/arm64: Let the timer expire in hardirq context + on RT + +The timers are canceled from an preempt-notifier which is invoked with +disabled preemption which is not allowed on PREEMPT_RT. +The timer callback is short so in could be invoked in hard-IRQ context +on -RT. + +Let the timer expire on hard-IRQ context even on -RT. + +Signed-off-by: Thomas Gleixner +Acked-by: Marc Zyngier +Tested-by: Julien Grall +Signed-off-by: Sebastian Andrzej Siewior +--- + virt/kvm/arm/arch_timer.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/virt/kvm/arm/arch_timer.c ++++ b/virt/kvm/arm/arch_timer.c +@@ -80,7 +80,7 @@ static inline bool userspace_irqchip(str + static void soft_timer_start(struct hrtimer *hrt, u64 ns) + { + hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), +- HRTIMER_MODE_ABS); ++ HRTIMER_MODE_ABS_HARD); + } + + static void soft_timer_cancel(struct hrtimer *hrt) +@@ -697,11 +697,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu + update_vtimer_cntvoff(vcpu, kvm_phys_timer_read()); + ptimer->cntvoff = 0; + +- hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ++ hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + timer->bg_timer.function = kvm_bg_timer_expire; + +- hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); +- hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ++ hrtimer_init(&vtimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); ++ hrtimer_init(&ptimer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + vtimer->hrtimer.function = kvm_hrtimer_expire; + ptimer->hrtimer.function = kvm_hrtimer_expire; + diff --git a/kernel/patches-5.4.x-rt/0017-time-sched_clock-Expire-timer-in-hardirq-context.patch b/kernel/patches-5.4.x-rt/0017-time-sched_clock-Expire-timer-in-hardirq-context.patch new file mode 100644 index 000000000..75d291a1b --- /dev/null +++ b/kernel/patches-5.4.x-rt/0017-time-sched_clock-Expire-timer-in-hardirq-context.patch @@ -0,0 +1,55 @@ +From: "Ahmed S. Darwish" +Date: Mon, 9 Mar 2020 18:15:29 +0000 +Subject: [PATCH] time/sched_clock: Expire timer in hardirq context + +To minimize latency, PREEMPT_RT kernels expires hrtimers in preemptible +softirq context by default. This can be overriden by marking the timer's +expiry with HRTIMER_MODE_HARD. + +sched_clock_timer is missing this annotation: if its callback is preempted +and the duration of the preemption exceeds the wrap around time of the +underlying clocksource, sched clock will get out of sync. + +Mark the sched_clock_timer for expiry in hard interrupt context. + +Signed-off-by: Ahmed S. 
Darwish +Signed-off-by: Thomas Gleixner +Link: https://lkml.kernel.org/r/20200309181529.26558-1-a.darwish@linutronix.de +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/time/sched_clock.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/kernel/time/sched_clock.c ++++ b/kernel/time/sched_clock.c +@@ -207,7 +207,8 @@ sched_clock_register(u64 (*read)(void), + + if (sched_clock_timer.function != NULL) { + /* update timeout for clock wrap */ +- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); ++ hrtimer_start(&sched_clock_timer, cd.wrap_kt, ++ HRTIMER_MODE_REL_HARD); + } + + r = rate; +@@ -251,9 +252,9 @@ void __init generic_sched_clock_init(voi + * Start the timer to keep sched_clock() properly updated and + * sets the initial epoch. + */ +- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); + sched_clock_timer.function = sched_clock_poll; +- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); ++ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); + } + + /* +@@ -290,7 +291,7 @@ void sched_clock_resume(void) + struct clock_read_data *rd = &cd.read_data[0]; + + rd->epoch_cyc = cd.actual_read_sched_clock(); +- hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); ++ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); + rd->read_sched_clock = cd.actual_read_sched_clock; + } + diff --git a/kernel/patches-5.4.x-rt/0018-0001-printk-rb-add-printk-ring-buffer-documentation.patch b/kernel/patches-5.4.x-rt/0018-0001-printk-rb-add-printk-ring-buffer-documentation.patch new file mode 100644 index 000000000..c50c2e4ef --- /dev/null +++ b/kernel/patches-5.4.x-rt/0018-0001-printk-rb-add-printk-ring-buffer-documentation.patch @@ -0,0 +1,393 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:39 +0100 +Subject: [PATCH 01/25] printk-rb: add printk ring buffer documentation + +The full documentation file for the printk ring buffer. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + Documentation/printk-ringbuffer.txt | 377 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 377 insertions(+) + create mode 100644 Documentation/printk-ringbuffer.txt + +--- /dev/null ++++ b/Documentation/printk-ringbuffer.txt +@@ -0,0 +1,377 @@ ++struct printk_ringbuffer ++------------------------ ++John Ogness ++ ++Overview ++~~~~~~~~ ++As the name suggests, this ring buffer was implemented specifically to serve ++the needs of the printk() infrastructure. The ring buffer itself is not ++specific to printk and could be used for other purposes. _However_, the ++requirements and semantics of printk are rather unique. If you intend to use ++this ring buffer for anything other than printk, you need to be very clear on ++its features, behavior, and pitfalls. ++ ++Features ++^^^^^^^^ ++The printk ring buffer has the following features: ++ ++- single global buffer ++- resides in initialized data section (available at early boot) ++- lockless readers ++- supports multiple writers ++- supports multiple non-consuming readers ++- safe from any context (including NMI) ++- groups bytes into variable length blocks (referenced by entries) ++- entries tagged with sequence numbers ++ ++Behavior ++^^^^^^^^ ++Since the printk ring buffer readers are lockless, there exists no ++synchronization between readers and writers. 
Basically writers are the tasks ++in control and may overwrite any and all committed data at any time and from ++any context. For this reason readers can miss entries if they are overwritten ++before the reader was able to access the data. The reader API implementation ++is such that reader access to entries is atomic, so there is no risk of ++readers having to deal with partial or corrupt data. Also, entries are ++tagged with sequence numbers so readers can recognize if entries were missed. ++ ++Writing to the ring buffer consists of 2 steps. First a writer must reserve ++an entry of desired size. After this step the writer has exclusive access ++to the memory region. Once the data has been written to memory, it needs to ++be committed to the ring buffer. After this step the entry has been inserted ++into the ring buffer and assigned an appropriate sequence number. ++ ++Once committed, a writer must no longer access the data directly. This is ++because the data may have been overwritten and no longer exists. If a ++writer must access the data, it should either keep a private copy before ++committing the entry or use the reader API to gain access to the data. ++ ++Because of how the data backend is implemented, entries that have been ++reserved but not yet committed act as barriers, preventing future writers ++from filling the ring buffer beyond the location of the reserved but not ++yet committed entry region. For this reason it is *important* that writers ++perform both reserve and commit as quickly as possible. Also, be aware that ++preemption and local interrupts are disabled and writing to the ring buffer ++is processor-reentrant locked during the reserve/commit window. Writers in ++NMI contexts can still preempt any other writers, but as long as these ++writers do not write a large amount of data with respect to the ring buffer ++size, this should not become an issue. ++ ++API ++~~~ ++ ++Declaration ++^^^^^^^^^^^ ++The printk ring buffer can be instantiated as a static structure: ++ ++ /* declare a static struct printk_ringbuffer */ ++ #define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) ++ ++The value of szbits specifies the size of the ring buffer in bits. The ++cpulockptr field is a pointer to a prb_cpulock struct that is used to ++perform processor-reentrant spin locking for the writers. It is specified ++externally because it may be used for multiple ring buffers (or other ++code) to synchronize writers without risk of deadlock. ++ ++Here is an example of a declaration of a printk ring buffer specifying a ++32KB (2^15) ring buffer: ++ ++.... ++DECLARE_STATIC_PRINTKRB_CPULOCK(rb_cpulock); ++DECLARE_STATIC_PRINTKRB(rb, 15, &rb_cpulock); ++.... ++ ++If writers will be using multiple ring buffers and the ordering of that usage ++is not clear, the same prb_cpulock should be used for both ring buffers. ++ ++Writer API ++^^^^^^^^^^ ++The writer API consists of 2 functions. The first is to reserve an entry in ++the ring buffer, the second is to commit that data to the ring buffer. The ++reserved entry information is stored within a provided `struct prb_handle`. ++ ++ /* reserve an entry */ ++ char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb, ++ unsigned int size); ++ ++ /* commit a reserved entry to the ring buffer */ ++ void prb_commit(struct prb_handle *h); ++ ++Here is an example of a function to write data to a ring buffer: ++ ++.... 
++int write_data(struct printk_ringbuffer *rb, char *data, int size) ++{ ++ struct prb_handle h; ++ char *buf; ++ ++ buf = prb_reserve(&h, rb, size); ++ if (!buf) ++ return -1; ++ memcpy(buf, data, size); ++ prb_commit(&h); ++ ++ return 0; ++} ++.... ++ ++Pitfalls ++++++++++ ++Be aware that prb_reserve() can fail. A retry might be successful, but it ++depends entirely on whether or not the next part of the ring buffer to ++overwrite belongs to reserved but not yet committed entries of other writers. ++Writers can use the prb_inc_lost() function to allow readers to notice that a ++message was lost. ++ ++Reader API ++^^^^^^^^^^ ++The reader API utilizes a `struct prb_iterator` to track the reader's ++position in the ring buffer. ++ ++ /* declare a pre-initialized static iterator for a ring buffer */ ++ #define DECLARE_STATIC_PRINTKRB_ITER(name, rbaddr) ++ ++ /* initialize iterator for a ring buffer (if static macro NOT used) */ ++ void prb_iter_init(struct prb_iterator *iter, ++ struct printk_ringbuffer *rb, u64 *seq); ++ ++ /* make a deep copy of an iterator */ ++ void prb_iter_copy(struct prb_iterator *dest, ++ struct prb_iterator *src); ++ ++ /* non-blocking, advance to next entry (and read the data) */ ++ int prb_iter_next(struct prb_iterator *iter, char *buf, ++ int size, u64 *seq); ++ ++ /* blocking, advance to next entry (and read the data) */ ++ int prb_iter_wait_next(struct prb_iterator *iter, char *buf, ++ int size, u64 *seq); ++ ++ /* position iterator at the entry seq */ ++ int prb_iter_seek(struct prb_iterator *iter, u64 seq); ++ ++ /* read data at current position */ ++ int prb_iter_data(struct prb_iterator *iter, char *buf, ++ int size, u64 *seq); ++ ++Typically prb_iter_data() is not needed because the data can be retrieved ++directly with prb_iter_next(). ++ ++Here is an example of a non-blocking function that will read all the data in ++a ring buffer: ++ ++.... ++void read_all_data(struct printk_ringbuffer *rb, char *buf, int size) ++{ ++ struct prb_iterator iter; ++ u64 prev_seq = 0; ++ u64 seq; ++ int ret; ++ ++ prb_iter_init(&iter, rb, NULL); ++ ++ for (;;) { ++ ret = prb_iter_next(&iter, buf, size, &seq); ++ if (ret > 0) { ++ if (seq != ++prev_seq) { ++ /* "seq - prev_seq" entries missed */ ++ prev_seq = seq; ++ } ++ /* process buf here */ ++ } else if (ret == 0) { ++ /* hit the end, done */ ++ break; ++ } else if (ret < 0) { ++ /* ++ * iterator is invalid, a writer overtook us, reset the ++ * iterator and keep going, entries were missed ++ */ ++ prb_iter_init(&iter, rb, NULL); ++ } ++ } ++} ++.... ++ ++Pitfalls ++++++++++ ++The reader's iterator can become invalid at any time because the reader was ++overtaken by a writer. Typically the reader should reset the iterator back ++to the current oldest entry (which will be newer than the entry the reader ++was at) and continue, noting the number of entries that were missed. ++ ++Utility API ++^^^^^^^^^^^ ++Several functions are available as convenience for external code. 
++ ++ /* query the size of the data buffer */ ++ int prb_buffer_size(struct printk_ringbuffer *rb); ++ ++ /* skip a seq number to signify a lost record */ ++ void prb_inc_lost(struct printk_ringbuffer *rb); ++ ++ /* processor-reentrant spin lock */ ++ void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); ++ ++ /* processor-reentrant spin unlock */ ++ void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); ++ ++Pitfalls ++++++++++ ++Although the value returned by prb_buffer_size() does represent an absolute ++upper bound, the amount of data that can be stored within the ring buffer ++is actually less because of the additional storage space of a header for each ++entry. ++ ++The prb_lock() and prb_unlock() functions can be used to synchronize between ++ring buffer writers and other external activities. The function of a ++processor-reentrant spin lock is to disable preemption and local interrupts ++and synchronize against other processors. It does *not* protect against ++multiple contexts of a single processor, i.e NMI. ++ ++Implementation ++~~~~~~~~~~~~~~ ++This section describes several of the implementation concepts and details to ++help developers better understand the code. ++ ++Entries ++^^^^^^^ ++All ring buffer data is stored within a single static byte array. The reason ++for this is to ensure that any pointers to the data (past and present) will ++always point to valid memory. This is important because the lockless readers ++may be preempted for long periods of time and when they resume may be working ++with expired pointers. ++ ++Entries are identified by start index and size. (The start index plus size ++is the start index of the next entry.) The start index is not simply an ++offset into the byte array, but rather a logical position (lpos) that maps ++directly to byte array offsets. ++ ++For example, for a byte array of 1000, an entry may have have a start index ++of 100. Another entry may have a start index of 1100. And yet another 2100. ++All of these entry are pointing to the same memory region, but only the most ++recent entry is valid. The other entries are pointing to valid memory, but ++represent entries that have been overwritten. ++ ++Note that due to overflowing, the most recent entry is not necessarily the one ++with the highest lpos value. Indeed, the printk ring buffer initializes its ++data such that an overflow happens relatively quickly in order to validate the ++handling of this situation. The implementation assumes that an lpos (unsigned ++long) will never completely wrap while a reader is preempted. If this were to ++become an issue, the seq number (which never wraps) could be used to increase ++the robustness of handling this situation. ++ ++Buffer Wrapping ++^^^^^^^^^^^^^^^ ++If an entry starts near the end of the byte array but would extend beyond it, ++a special terminating entry (size = -1) is inserted into the byte array and ++the real entry is placed at the beginning of the byte array. This can waste ++space at the end of the byte array, but simplifies the implementation by ++allowing writers to always work with contiguous buffers. ++ ++Note that the size field is the first 4 bytes of the entry header. Also note ++that calc_next() always ensures that there are at least 4 bytes left at the ++end of the byte array to allow room for a terminating entry. 
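++
++As a small worked example of the lpos arithmetic described in the Entries
++section above (the helper names below are ad hoc, but the mask-and-shift
++mirrors the PRB_INDEX()/PRB_WRAPS() macros used by the implementation), a
++power-of-two buffer makes both the byte offset and the wrap count cheap to
++derive:
++
++....
++/* size_bits = 4, i.e. a 16 byte buffer */
++static unsigned long lpos_to_offset(unsigned long lpos)
++{
++	return lpos & ((1UL << 4) - 1);	/* 3, 19 and 35 all map to offset 3 */
++}
++
++static unsigned long lpos_wrap_count(unsigned long lpos)
++{
++	return lpos >> 4;	/* ...after 0, 1 and 2 wraps respectively */
++}
++....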
++ ++Ring Buffer Pointers ++^^^^^^^^^^^^^^^^^^^^ ++Three pointers (lpos values) are used to manage the ring buffer: ++ ++ - _tail_: points to the oldest entry ++ - _head_: points to where the next new committed entry will be ++ - _reserve_: points to where the next new reserved entry will be ++ ++These pointers always maintain a logical ordering: ++ ++ tail <= head <= reserve ++ ++The reserve pointer moves forward when a writer reserves a new entry. The ++head pointer moves forward when a writer commits a new entry. ++ ++The reserve pointer cannot overwrite the tail pointer in a wrap situation. In ++such a situation, the tail pointer must be "pushed forward", thus ++invalidating that oldest entry. Readers identify if they are accessing a ++valid entry by ensuring their entry pointer is `>= tail && < head`. ++ ++If the tail pointer is equal to the head pointer, it cannot be pushed and any ++reserve operation will fail. The only resolution is for writers to commit ++their reserved entries. ++ ++Processor-Reentrant Locking ++^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++The purpose of the processor-reentrant locking is to limit the interruption ++scenarios of writers to 2 contexts. This allows for a simplified ++implementation where: ++ ++- The reserve/commit window only exists on 1 processor at a time. A reserve ++ can never fail due to uncommitted entries of other processors. ++ ++- When committing entries, it is trivial to handle the situation when ++ subsequent entries have already been committed, i.e. managing the head ++ pointer. ++ ++Performance ++~~~~~~~~~~~ ++Some basic tests were performed on a quad Intel(R) Xeon(R) CPU E5-2697 v4 at ++2.30GHz (36 cores / 72 threads). All tests involved writing a total of ++32,000,000 records at an average of 33 bytes each. Each writer was pinned to ++its own CPU and would write as fast as it could until a total of 32,000,000 ++records were written. All tests involved 2 readers that were both pinned ++together to another CPU. Each reader would read as fast as it could and track ++how many of the 32,000,000 records it could read. All tests used a ring buffer ++of 16KB in size, which holds around 350 records (header + data for each ++entry). ++ ++The only difference between the tests is the number of writers (and thus also ++the number of records per writer). As more writers are added, the time to ++write a record increases. This is because data pointers, modified via cmpxchg, ++and global data access in general become more contended. ++ ++1 writer ++^^^^^^^^ ++ runtime: 0m 18s ++ reader1: 16219900/32000000 (50%) records ++ reader2: 16141582/32000000 (50%) records ++ ++2 writers ++^^^^^^^^^ ++ runtime: 0m 32s ++ reader1: 16327957/32000000 (51%) records ++ reader2: 16313988/32000000 (50%) records ++ ++4 writers ++^^^^^^^^^ ++ runtime: 0m 42s ++ reader1: 16421642/32000000 (51%) records ++ reader2: 16417224/32000000 (51%) records ++ ++8 writers ++^^^^^^^^^ ++ runtime: 0m 43s ++ reader1: 16418300/32000000 (51%) records ++ reader2: 16432222/32000000 (51%) records ++ ++16 writers ++^^^^^^^^^^ ++ runtime: 0m 54s ++ reader1: 16539189/32000000 (51%) records ++ reader2: 16542711/32000000 (51%) records ++ ++32 writers ++^^^^^^^^^^ ++ runtime: 1m 13s ++ reader1: 16731808/32000000 (52%) records ++ reader2: 16735119/32000000 (52%) records ++ ++Comments ++^^^^^^^^ ++It is particularly interesting to compare/contrast the 1-writer and 32-writer ++tests. 
Despite the writing of the 32,000,000 records taking over 4 times ++longer, the readers (which perform no cmpxchg) were still unable to keep up. ++This shows that the memory contention between the increasing number of CPUs ++also has a dramatic effect on readers. ++ ++It should also be noted that in all cases each reader was able to read >=50% ++of the records. This means that a single reader would have been able to keep ++up with the writer(s) in all cases, becoming slightly easier as more writers ++are added. This was the purpose of pinning 2 readers to 1 CPU: to observe how ++maximum reader performance changes. diff --git a/kernel/patches-5.4.x-rt/0019-0002-printk-rb-add-prb-locking-functions.patch b/kernel/patches-5.4.x-rt/0019-0002-printk-rb-add-prb-locking-functions.patch new file mode 100644 index 000000000..29e2b17ff --- /dev/null +++ b/kernel/patches-5.4.x-rt/0019-0002-printk-rb-add-prb-locking-functions.patch @@ -0,0 +1,158 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:40 +0100 +Subject: [PATCH 02/25] printk-rb: add prb locking functions + +Add processor-reentrant spin locking functions. These allow +restricting the number of possible contexts to 2, which can simplify +implementing code that also supports NMI interruptions. + + prb_lock(); + + /* + * This code is synchronized with all contexts + * except an NMI on the same processor. + */ + + prb_unlock(); + +In order to support printk's emergency messages, a +processor-reentrant spin lock will be used to control raw access to +the emergency console. However, it must be the same +processor-reentrant spin lock as the one used by the ring buffer, +otherwise a deadlock can occur: + + CPU1: printk lock -> emergency -> serial lock + CPU2: serial lock -> printk lock + +By making the processor-reentrant implemtation available externally, +printk can use the same atomic_t for the ring buffer as for the +emergency console and thus avoid the above deadlock. 
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/printk_ringbuffer.h | 24 +++++++++++ + lib/Makefile | 2 + lib/printk_ringbuffer.c | 77 ++++++++++++++++++++++++++++++++++++++ + 3 files changed, 102 insertions(+), 1 deletion(-) + create mode 100644 include/linux/printk_ringbuffer.h + create mode 100644 lib/printk_ringbuffer.c + +--- /dev/null ++++ b/include/linux/printk_ringbuffer.h +@@ -0,0 +1,24 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_PRINTK_RINGBUFFER_H ++#define _LINUX_PRINTK_RINGBUFFER_H ++ ++#include ++#include ++ ++struct prb_cpulock { ++ atomic_t owner; ++ unsigned long __percpu *irqflags; ++}; ++ ++#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \ ++static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \ ++static struct prb_cpulock name = { \ ++ .owner = ATOMIC_INIT(-1), \ ++ .irqflags = &_##name##_percpu_irqflags, \ ++} ++ ++/* utility functions */ ++void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); ++void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store); ++ ++#endif /*_LINUX_PRINTK_RINGBUFFER_H */ +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -26,7 +26,7 @@ endif + + lib-y := ctype.o string.o vsprintf.o cmdline.o \ + rbtree.o radix-tree.o timerqueue.o xarray.o \ +- idr.o extable.o \ ++ idr.o extable.o printk_ringbuffer.o \ + sha1.o chacha.o irq_regs.o argv_split.o \ + flex_proportions.o ratelimit.o show_mem.o \ + is_single_threaded.o plist.o decompress.o kobject_uevent.o \ +--- /dev/null ++++ b/lib/printk_ringbuffer.c +@@ -0,0 +1,77 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include ++#include ++ ++static bool __prb_trylock(struct prb_cpulock *cpu_lock, ++ unsigned int *cpu_store) ++{ ++ unsigned long *flags; ++ unsigned int cpu; ++ ++ cpu = get_cpu(); ++ ++ *cpu_store = atomic_read(&cpu_lock->owner); ++ /* memory barrier to ensure the current lock owner is visible */ ++ smp_rmb(); ++ if (*cpu_store == -1) { ++ flags = per_cpu_ptr(cpu_lock->irqflags, cpu); ++ local_irq_save(*flags); ++ if (atomic_try_cmpxchg_acquire(&cpu_lock->owner, ++ cpu_store, cpu)) { ++ return true; ++ } ++ local_irq_restore(*flags); ++ } else if (*cpu_store == cpu) { ++ return true; ++ } ++ ++ put_cpu(); ++ return false; ++} ++ ++/* ++ * prb_lock: Perform a processor-reentrant spin lock. ++ * @cpu_lock: A pointer to the lock object. ++ * @cpu_store: A "flags" pointer to store lock status information. ++ * ++ * If no processor has the lock, the calling processor takes the lock and ++ * becomes the owner. If the calling processor is already the owner of the ++ * lock, this function succeeds immediately. If lock is locked by another ++ * processor, this function spins until the calling processor becomes the ++ * owner. ++ * ++ * It is safe to call this function from any context and state. ++ */ ++void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store) ++{ ++ for (;;) { ++ if (__prb_trylock(cpu_lock, cpu_store)) ++ break; ++ cpu_relax(); ++ } ++} ++ ++/* ++ * prb_unlock: Perform a processor-reentrant spin unlock. ++ * @cpu_lock: A pointer to the lock object. ++ * @cpu_store: A "flags" object storing lock status information. ++ * ++ * Release the lock. The calling processor must be the owner of the lock. ++ * ++ * It is safe to call this function from any context and state. 
++ */ ++void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store) ++{ ++ unsigned long *flags; ++ unsigned int cpu; ++ ++ cpu = atomic_read(&cpu_lock->owner); ++ atomic_set_release(&cpu_lock->owner, cpu_store); ++ ++ if (cpu_store == -1) { ++ flags = per_cpu_ptr(cpu_lock->irqflags, cpu); ++ local_irq_restore(*flags); ++ } ++ ++ put_cpu(); ++} diff --git a/kernel/patches-5.4.x-rt/0020-0003-printk-rb-define-ring-buffer-struct-and-initializer.patch b/kernel/patches-5.4.x-rt/0020-0003-printk-rb-define-ring-buffer-struct-and-initializer.patch new file mode 100644 index 000000000..9080713b1 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0020-0003-printk-rb-define-ring-buffer-struct-and-initializer.patch @@ -0,0 +1,57 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:41 +0100 +Subject: [PATCH 03/25] printk-rb: define ring buffer struct and initializer + +See Documentation/printk-ringbuffer.txt for details about the +initializer arguments. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/printk_ringbuffer.h | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +--- a/include/linux/printk_ringbuffer.h ++++ b/include/linux/printk_ringbuffer.h +@@ -10,6 +10,20 @@ struct prb_cpulock { + unsigned long __percpu *irqflags; + }; + ++struct printk_ringbuffer { ++ void *buffer; ++ unsigned int size_bits; ++ ++ u64 seq; ++ ++ atomic_long_t tail; ++ atomic_long_t head; ++ atomic_long_t reserve; ++ ++ struct prb_cpulock *cpulock; ++ atomic_t ctx; ++}; ++ + #define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \ + static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \ + static struct prb_cpulock name = { \ +@@ -17,6 +31,20 @@ static struct prb_cpulock name = { \ + .irqflags = &_##name##_percpu_irqflags, \ + } + ++#define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) \ ++static char _##name##_buffer[1 << (szbits)] \ ++ __aligned(__alignof__(long)); \ ++static struct printk_ringbuffer name = { \ ++ .buffer = &_##name##_buffer[0], \ ++ .size_bits = szbits, \ ++ .seq = 0, \ ++ .tail = ATOMIC_LONG_INIT(-111 * sizeof(long)), \ ++ .head = ATOMIC_LONG_INIT(-111 * sizeof(long)), \ ++ .reserve = ATOMIC_LONG_INIT(-111 * sizeof(long)), \ ++ .cpulock = cpulockptr, \ ++ .ctx = ATOMIC_INIT(0), \ ++} ++ + /* utility functions */ + void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); + void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store); diff --git a/kernel/patches-5.4.x-rt/0021-0004-printk-rb-add-writer-interface.patch b/kernel/patches-5.4.x-rt/0021-0004-printk-rb-add-writer-interface.patch new file mode 100644 index 000000000..e5f29a10e --- /dev/null +++ b/kernel/patches-5.4.x-rt/0021-0004-printk-rb-add-writer-interface.patch @@ -0,0 +1,233 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:42 +0100 +Subject: [PATCH 04/25] printk-rb: add writer interface + +Add the writer functions prb_reserve() and prb_commit(). These make +use of processor-reentrant spin locks to limit the number of possible +interruption scenarios for the writers. 
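+
+A minimal usage sketch (mirroring the write_data() example in the ring
+buffer documentation; "rb", "data" and "size" are caller-provided):
+
+    struct prb_handle h;
+    char *buf;
+
+    buf = prb_reserve(&h, rb, size);
+    if (!buf)
+        return -1;          /* could not reserve, record is dropped */
+    memcpy(buf, data, size);
+    prb_commit(&h);         /* the data may be overwritten from here on */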
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/printk_ringbuffer.h | 17 +++ + lib/printk_ringbuffer.c | 172 ++++++++++++++++++++++++++++++++++++++ + 2 files changed, 189 insertions(+) + +--- a/include/linux/printk_ringbuffer.h ++++ b/include/linux/printk_ringbuffer.h +@@ -24,6 +24,18 @@ struct printk_ringbuffer { + atomic_t ctx; + }; + ++struct prb_entry { ++ unsigned int size; ++ u64 seq; ++ char data[0]; ++}; ++ ++struct prb_handle { ++ struct printk_ringbuffer *rb; ++ unsigned int cpu; ++ struct prb_entry *entry; ++}; ++ + #define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \ + static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \ + static struct prb_cpulock name = { \ +@@ -45,6 +57,11 @@ static struct printk_ringbuffer name = { + .ctx = ATOMIC_INIT(0), \ + } + ++/* writer interface */ ++char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb, ++ unsigned int size); ++void prb_commit(struct prb_handle *h); ++ + /* utility functions */ + void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); + void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store); +--- a/lib/printk_ringbuffer.c ++++ b/lib/printk_ringbuffer.c +@@ -2,6 +2,14 @@ + #include + #include + ++#define PRB_SIZE(rb) (1 << rb->size_bits) ++#define PRB_SIZE_BITMASK(rb) (PRB_SIZE(rb) - 1) ++#define PRB_INDEX(rb, lpos) (lpos & PRB_SIZE_BITMASK(rb)) ++#define PRB_WRAPS(rb, lpos) (lpos >> rb->size_bits) ++#define PRB_WRAP_LPOS(rb, lpos, xtra) \ ++ ((PRB_WRAPS(rb, lpos) + xtra) << rb->size_bits) ++#define PRB_DATA_ALIGN sizeof(long) ++ + static bool __prb_trylock(struct prb_cpulock *cpu_lock, + unsigned int *cpu_store) + { +@@ -75,3 +83,167 @@ void prb_unlock(struct prb_cpulock *cpu_ + + put_cpu(); + } ++ ++static struct prb_entry *to_entry(struct printk_ringbuffer *rb, ++ unsigned long lpos) ++{ ++ char *buffer = rb->buffer; ++ buffer += PRB_INDEX(rb, lpos); ++ return (struct prb_entry *)buffer; ++} ++ ++static int calc_next(struct printk_ringbuffer *rb, unsigned long tail, ++ unsigned long lpos, int size, unsigned long *calced_next) ++{ ++ unsigned long next_lpos; ++ int ret = 0; ++again: ++ next_lpos = lpos + size; ++ if (next_lpos - tail > PRB_SIZE(rb)) ++ return -1; ++ ++ if (PRB_WRAPS(rb, lpos) != PRB_WRAPS(rb, next_lpos)) { ++ lpos = PRB_WRAP_LPOS(rb, next_lpos, 0); ++ ret |= 1; ++ goto again; ++ } ++ ++ *calced_next = next_lpos; ++ return ret; ++} ++ ++static bool push_tail(struct printk_ringbuffer *rb, unsigned long tail) ++{ ++ unsigned long new_tail; ++ struct prb_entry *e; ++ unsigned long head; ++ ++ if (tail != atomic_long_read(&rb->tail)) ++ return true; ++ ++ e = to_entry(rb, tail); ++ if (e->size != -1) ++ new_tail = tail + e->size; ++ else ++ new_tail = PRB_WRAP_LPOS(rb, tail, 1); ++ ++ /* make sure the new tail does not overtake the head */ ++ head = atomic_long_read(&rb->head); ++ if (head - new_tail > PRB_SIZE(rb)) ++ return false; ++ ++ atomic_long_cmpxchg(&rb->tail, tail, new_tail); ++ return true; ++} ++ ++/* ++ * prb_commit: Commit a reserved entry to the ring buffer. ++ * @h: An entry handle referencing the data entry to commit. ++ * ++ * Commit data that has been reserved using prb_reserve(). Once the data ++ * block has been committed, it can be invalidated at any time. If a writer ++ * is interested in using the data after committing, the writer should make ++ * its own copy first or use the prb_iter_ reader functions to access the ++ * data in the ring buffer. 
++ * ++ * It is safe to call this function from any context and state. ++ */ ++void prb_commit(struct prb_handle *h) ++{ ++ struct printk_ringbuffer *rb = h->rb; ++ struct prb_entry *e; ++ unsigned long head; ++ unsigned long res; ++ ++ for (;;) { ++ if (atomic_read(&rb->ctx) != 1) { ++ /* the interrupted context will fixup head */ ++ atomic_dec(&rb->ctx); ++ break; ++ } ++ /* assign sequence numbers before moving head */ ++ head = atomic_long_read(&rb->head); ++ res = atomic_long_read(&rb->reserve); ++ while (head != res) { ++ e = to_entry(rb, head); ++ if (e->size == -1) { ++ head = PRB_WRAP_LPOS(rb, head, 1); ++ continue; ++ } ++ e->seq = ++rb->seq; ++ head += e->size; ++ } ++ atomic_long_set_release(&rb->head, res); ++ atomic_dec(&rb->ctx); ++ ++ if (atomic_long_read(&rb->reserve) == res) ++ break; ++ atomic_inc(&rb->ctx); ++ } ++ ++ prb_unlock(rb->cpulock, h->cpu); ++} ++ ++/* ++ * prb_reserve: Reserve an entry within a ring buffer. ++ * @h: An entry handle to be setup and reference an entry. ++ * @rb: A ring buffer to reserve data within. ++ * @size: The number of bytes to reserve. ++ * ++ * Reserve an entry of at least @size bytes to be used by the caller. If ++ * successful, the data region of the entry belongs to the caller and cannot ++ * be invalidated by any other task/context. For this reason, the caller ++ * should call prb_commit() as quickly as possible in order to avoid preventing ++ * other tasks/contexts from reserving data in the case that the ring buffer ++ * has wrapped. ++ * ++ * It is safe to call this function from any context and state. ++ * ++ * Returns a pointer to the reserved entry (and @h is setup to reference that ++ * entry) or NULL if it was not possible to reserve data. ++ */ ++char *prb_reserve(struct prb_handle *h, struct printk_ringbuffer *rb, ++ unsigned int size) ++{ ++ unsigned long tail, res1, res2; ++ int ret; ++ ++ if (size == 0) ++ return NULL; ++ size += sizeof(struct prb_entry); ++ size += PRB_DATA_ALIGN - 1; ++ size &= ~(PRB_DATA_ALIGN - 1); ++ if (size >= PRB_SIZE(rb)) ++ return NULL; ++ ++ h->rb = rb; ++ prb_lock(rb->cpulock, &h->cpu); ++ ++ atomic_inc(&rb->ctx); ++ ++ do { ++ for (;;) { ++ tail = atomic_long_read(&rb->tail); ++ res1 = atomic_long_read(&rb->reserve); ++ ret = calc_next(rb, tail, res1, size, &res2); ++ if (ret >= 0) ++ break; ++ if (!push_tail(rb, tail)) { ++ prb_commit(h); ++ return NULL; ++ } ++ } ++ } while (!atomic_long_try_cmpxchg_acquire(&rb->reserve, &res1, res2)); ++ ++ h->entry = to_entry(rb, res1); ++ ++ if (ret) { ++ /* handle wrap */ ++ h->entry->size = -1; ++ h->entry = to_entry(rb, PRB_WRAP_LPOS(rb, res2, 0)); ++ } ++ ++ h->entry->size = size; ++ ++ return &h->entry->data[0]; ++} diff --git a/kernel/patches-5.4.x-rt/0022-0005-printk-rb-add-basic-non-blocking-reading-interface.patch b/kernel/patches-5.4.x-rt/0022-0005-printk-rb-add-basic-non-blocking-reading-interface.patch new file mode 100644 index 000000000..e583c1932 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0022-0005-printk-rb-add-basic-non-blocking-reading-interface.patch @@ -0,0 +1,259 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:43 +0100 +Subject: [PATCH 05/25] printk-rb: add basic non-blocking reading interface + +Add reader iterator static declaration/initializer, dynamic +initializer, and functions to iterate and retrieve ring buffer data. 
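+
+A condensed reader loop (based on the read_all_data() example in the ring
+buffer documentation; "rb", "buf" and "size" are caller-provided):
+
+    struct prb_iterator iter;
+    u64 seq;
+    int ret;
+
+    prb_iter_init(&iter, rb, NULL);
+    for (;;) {
+        ret = prb_iter_next(&iter, buf, size, &seq);
+        if (ret == 0)
+            break;                          /* caught up with the writers */
+        if (ret < 0) {
+            prb_iter_init(&iter, rb, NULL); /* overtaken, records missed */
+            continue;
+        }
+        /* process buf and seq here */
+    }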
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/printk_ringbuffer.h | 20 ++++ + lib/printk_ringbuffer.c | 190 ++++++++++++++++++++++++++++++++++++++ + 2 files changed, 210 insertions(+) + +--- a/include/linux/printk_ringbuffer.h ++++ b/include/linux/printk_ringbuffer.h +@@ -43,6 +43,19 @@ static struct prb_cpulock name = { \ + .irqflags = &_##name##_percpu_irqflags, \ + } + ++#define PRB_INIT ((unsigned long)-1) ++ ++#define DECLARE_STATIC_PRINTKRB_ITER(name, rbaddr) \ ++static struct prb_iterator name = { \ ++ .rb = rbaddr, \ ++ .lpos = PRB_INIT, \ ++} ++ ++struct prb_iterator { ++ struct printk_ringbuffer *rb; ++ unsigned long lpos; ++}; ++ + #define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) \ + static char _##name##_buffer[1 << (szbits)] \ + __aligned(__alignof__(long)); \ +@@ -62,6 +75,13 @@ char *prb_reserve(struct prb_handle *h, + unsigned int size); + void prb_commit(struct prb_handle *h); + ++/* reader interface */ ++void prb_iter_init(struct prb_iterator *iter, struct printk_ringbuffer *rb, ++ u64 *seq); ++void prb_iter_copy(struct prb_iterator *dest, struct prb_iterator *src); ++int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq); ++int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq); ++ + /* utility functions */ + void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); + void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store); +--- a/lib/printk_ringbuffer.c ++++ b/lib/printk_ringbuffer.c +@@ -1,5 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + #include ++#include ++#include + #include + + #define PRB_SIZE(rb) (1 << rb->size_bits) +@@ -8,6 +10,7 @@ + #define PRB_WRAPS(rb, lpos) (lpos >> rb->size_bits) + #define PRB_WRAP_LPOS(rb, lpos, xtra) \ + ((PRB_WRAPS(rb, lpos) + xtra) << rb->size_bits) ++#define PRB_DATA_SIZE(e) (e->size - sizeof(struct prb_entry)) + #define PRB_DATA_ALIGN sizeof(long) + + static bool __prb_trylock(struct prb_cpulock *cpu_lock, +@@ -247,3 +250,190 @@ char *prb_reserve(struct prb_handle *h, + + return &h->entry->data[0]; + } ++ ++/* ++ * prb_iter_copy: Copy an iterator. ++ * @dest: The iterator to copy to. ++ * @src: The iterator to copy from. ++ * ++ * Make a deep copy of an iterator. This is particularly useful for making ++ * backup copies of an iterator in case a form of rewinding it needed. ++ * ++ * It is safe to call this function from any context and state. But ++ * note that this function is not atomic. Callers should not make copies ++ * to/from iterators that can be accessed by other tasks/contexts. ++ */ ++void prb_iter_copy(struct prb_iterator *dest, struct prb_iterator *src) ++{ ++ memcpy(dest, src, sizeof(*dest)); ++} ++ ++/* ++ * prb_iter_init: Initialize an iterator for a ring buffer. ++ * @iter: The iterator to initialize. ++ * @rb: A ring buffer to that @iter should iterate. ++ * @seq: The sequence number of the position preceding the first record. ++ * May be NULL. ++ * ++ * Initialize an iterator to be used with a specified ring buffer. If @seq ++ * is non-NULL, it will be set such that prb_iter_next() will provide a ++ * sequence value of "@seq + 1" if no records were missed. ++ * ++ * It is safe to call this function from any context and state. 
++ */ ++void prb_iter_init(struct prb_iterator *iter, struct printk_ringbuffer *rb, ++ u64 *seq) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ iter->rb = rb; ++ iter->lpos = PRB_INIT; ++ ++ if (!seq) ++ return; ++ ++ for (;;) { ++ struct prb_iterator tmp_iter; ++ int ret; ++ ++ prb_iter_copy(&tmp_iter, iter); ++ ++ ret = prb_iter_next(&tmp_iter, NULL, 0, seq); ++ if (ret < 0) ++ continue; ++ ++ if (ret == 0) ++ *seq = 0; ++ else ++ (*seq)--; ++ break; ++ } ++} ++ ++static bool is_valid(struct printk_ringbuffer *rb, unsigned long lpos) ++{ ++ unsigned long head, tail; ++ ++ tail = atomic_long_read(&rb->tail); ++ head = atomic_long_read(&rb->head); ++ head -= tail; ++ lpos -= tail; ++ ++ if (lpos >= head) ++ return false; ++ return true; ++} ++ ++/* ++ * prb_iter_data: Retrieve the record data at the current position. ++ * @iter: Iterator tracking the current position. ++ * @buf: A buffer to store the data of the record. May be NULL. ++ * @size: The size of @buf. (Ignored if @buf is NULL.) ++ * @seq: The sequence number of the record. May be NULL. ++ * ++ * If @iter is at a record, provide the data and/or sequence number of that ++ * record (if specified by the caller). ++ * ++ * It is safe to call this function from any context and state. ++ * ++ * Returns >=0 if the current record contains valid data (returns 0 if @buf ++ * is NULL or returns the size of the data block if @buf is non-NULL) or ++ * -EINVAL if @iter is now invalid. ++ */ ++int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq) ++{ ++ struct printk_ringbuffer *rb = iter->rb; ++ unsigned long lpos = iter->lpos; ++ unsigned int datsize = 0; ++ struct prb_entry *e; ++ ++ if (buf || seq) { ++ e = to_entry(rb, lpos); ++ if (!is_valid(rb, lpos)) ++ return -EINVAL; ++ /* memory barrier to ensure valid lpos */ ++ smp_rmb(); ++ if (buf) { ++ datsize = PRB_DATA_SIZE(e); ++ /* memory barrier to ensure load of datsize */ ++ smp_rmb(); ++ if (!is_valid(rb, lpos)) ++ return -EINVAL; ++ if (PRB_INDEX(rb, lpos) + datsize > ++ PRB_SIZE(rb) - PRB_DATA_ALIGN) { ++ return -EINVAL; ++ } ++ if (size > datsize) ++ size = datsize; ++ memcpy(buf, &e->data[0], size); ++ } ++ if (seq) ++ *seq = e->seq; ++ /* memory barrier to ensure loads of entry data */ ++ smp_rmb(); ++ } ++ ++ if (!is_valid(rb, lpos)) ++ return -EINVAL; ++ ++ return datsize; ++} ++ ++/* ++ * prb_iter_next: Advance to the next record. ++ * @iter: Iterator tracking the current position. ++ * @buf: A buffer to store the data of the next record. May be NULL. ++ * @size: The size of @buf. (Ignored if @buf is NULL.) ++ * @seq: The sequence number of the next record. May be NULL. ++ * ++ * If a next record is available, @iter is advanced and (if specified) ++ * the data and/or sequence number of that record are provided. ++ * ++ * It is safe to call this function from any context and state. ++ * ++ * Returns 1 if @iter was advanced, 0 if @iter is at the end of the list, or ++ * -EINVAL if @iter is now invalid. 
++ */ ++int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq) ++{ ++ struct printk_ringbuffer *rb = iter->rb; ++ unsigned long next_lpos; ++ struct prb_entry *e; ++ unsigned int esize; ++ ++ if (iter->lpos == PRB_INIT) { ++ next_lpos = atomic_long_read(&rb->tail); ++ } else { ++ if (!is_valid(rb, iter->lpos)) ++ return -EINVAL; ++ /* memory barrier to ensure valid lpos */ ++ smp_rmb(); ++ e = to_entry(rb, iter->lpos); ++ esize = e->size; ++ /* memory barrier to ensure load of size */ ++ smp_rmb(); ++ if (!is_valid(rb, iter->lpos)) ++ return -EINVAL; ++ next_lpos = iter->lpos + esize; ++ } ++ if (next_lpos == atomic_long_read(&rb->head)) ++ return 0; ++ if (!is_valid(rb, next_lpos)) ++ return -EINVAL; ++ /* memory barrier to ensure valid lpos */ ++ smp_rmb(); ++ ++ iter->lpos = next_lpos; ++ e = to_entry(rb, iter->lpos); ++ esize = e->size; ++ /* memory barrier to ensure load of size */ ++ smp_rmb(); ++ if (!is_valid(rb, iter->lpos)) ++ return -EINVAL; ++ if (esize == -1) ++ iter->lpos = PRB_WRAP_LPOS(rb, iter->lpos, 1); ++ ++ if (prb_iter_data(iter, buf, size, seq) < 0) ++ return -EINVAL; ++ ++ return 1; ++} diff --git a/kernel/patches-5.4.x-rt/0023-0006-printk-rb-add-blocking-reader-support.patch b/kernel/patches-5.4.x-rt/0023-0006-printk-rb-add-blocking-reader-support.patch new file mode 100644 index 000000000..5988d20f4 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0023-0006-printk-rb-add-blocking-reader-support.patch @@ -0,0 +1,161 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:44 +0100 +Subject: [PATCH 06/25] printk-rb: add blocking reader support + +Add a blocking read function for readers. An irq_work function is +used to signal the wait queue so that write notification can +be triggered from any context. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/printk_ringbuffer.h | 20 +++++++++++++ + lib/printk_ringbuffer.c | 55 ++++++++++++++++++++++++++++++++++++++ + 2 files changed, 75 insertions(+) + +--- a/include/linux/printk_ringbuffer.h ++++ b/include/linux/printk_ringbuffer.h +@@ -2,8 +2,10 @@ + #ifndef _LINUX_PRINTK_RINGBUFFER_H + #define _LINUX_PRINTK_RINGBUFFER_H + ++#include + #include + #include ++#include + + struct prb_cpulock { + atomic_t owner; +@@ -22,6 +24,10 @@ struct printk_ringbuffer { + + struct prb_cpulock *cpulock; + atomic_t ctx; ++ ++ struct wait_queue_head *wq; ++ atomic_long_t wq_counter; ++ struct irq_work *wq_work; + }; + + struct prb_entry { +@@ -59,6 +65,15 @@ struct prb_iterator { + #define DECLARE_STATIC_PRINTKRB(name, szbits, cpulockptr) \ + static char _##name##_buffer[1 << (szbits)] \ + __aligned(__alignof__(long)); \ ++static DECLARE_WAIT_QUEUE_HEAD(_##name##_wait); \ ++static void _##name##_wake_work_func(struct irq_work *irq_work) \ ++{ \ ++ wake_up_interruptible_all(&_##name##_wait); \ ++} \ ++static struct irq_work _##name##_wake_work = { \ ++ .func = _##name##_wake_work_func, \ ++ .flags = IRQ_WORK_LAZY, \ ++}; \ + static struct printk_ringbuffer name = { \ + .buffer = &_##name##_buffer[0], \ + .size_bits = szbits, \ +@@ -68,6 +83,9 @@ static struct printk_ringbuffer name = { + .reserve = ATOMIC_LONG_INIT(-111 * sizeof(long)), \ + .cpulock = cpulockptr, \ + .ctx = ATOMIC_INIT(0), \ ++ .wq = &_##name##_wait, \ ++ .wq_counter = ATOMIC_LONG_INIT(0), \ ++ .wq_work = &_##name##_wake_work, \ + } + + /* writer interface */ +@@ -80,6 +98,8 @@ void prb_iter_init(struct prb_iterator * + u64 *seq); + void prb_iter_copy(struct prb_iterator *dest, struct prb_iterator *src); + int 
prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq); ++int prb_iter_wait_next(struct prb_iterator *iter, char *buf, int size, ++ u64 *seq); + int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq); + + /* utility functions */ +--- a/lib/printk_ringbuffer.c ++++ b/lib/printk_ringbuffer.c +@@ -1,4 +1,5 @@ + // SPDX-License-Identifier: GPL-2.0 ++#include + #include + #include + #include +@@ -154,6 +155,7 @@ static bool push_tail(struct printk_ring + void prb_commit(struct prb_handle *h) + { + struct printk_ringbuffer *rb = h->rb; ++ bool changed = false; + struct prb_entry *e; + unsigned long head; + unsigned long res; +@@ -175,6 +177,7 @@ void prb_commit(struct prb_handle *h) + } + e->seq = ++rb->seq; + head += e->size; ++ changed = true; + } + atomic_long_set_release(&rb->head, res); + atomic_dec(&rb->ctx); +@@ -185,6 +188,18 @@ void prb_commit(struct prb_handle *h) + } + + prb_unlock(rb->cpulock, h->cpu); ++ ++ if (changed) { ++ atomic_long_inc(&rb->wq_counter); ++ if (wq_has_sleeper(rb->wq)) { ++#ifdef CONFIG_IRQ_WORK ++ irq_work_queue(rb->wq_work); ++#else ++ if (!in_nmi()) ++ wake_up_interruptible_all(rb->wq); ++#endif ++ } ++ } + } + + /* +@@ -437,3 +452,43 @@ int prb_iter_next(struct prb_iterator *i + + return 1; + } ++ ++/* ++ * prb_iter_wait_next: Advance to the next record, blocking if none available. ++ * @iter: Iterator tracking the current position. ++ * @buf: A buffer to store the data of the next record. May be NULL. ++ * @size: The size of @buf. (Ignored if @buf is NULL.) ++ * @seq: The sequence number of the next record. May be NULL. ++ * ++ * If a next record is already available, this function works like ++ * prb_iter_next(). Otherwise block interruptible until a next record is ++ * available. ++ * ++ * When a next record is available, @iter is advanced and (if specified) ++ * the data and/or sequence number of that record are provided. ++ * ++ * This function might sleep. ++ * ++ * Returns 1 if @iter was advanced, -EINVAL if @iter is now invalid, or ++ * -ERESTARTSYS if interrupted by a signal. ++ */ ++int prb_iter_wait_next(struct prb_iterator *iter, char *buf, int size, u64 *seq) ++{ ++ unsigned long last_seen; ++ int ret; ++ ++ for (;;) { ++ last_seen = atomic_long_read(&iter->rb->wq_counter); ++ ++ ret = prb_iter_next(iter, buf, size, seq); ++ if (ret != 0) ++ break; ++ ++ ret = wait_event_interruptible(*iter->rb->wq, ++ last_seen != atomic_long_read(&iter->rb->wq_counter)); ++ if (ret < 0) ++ break; ++ } ++ ++ return ret; ++} diff --git a/kernel/patches-5.4.x-rt/0024-0007-printk-rb-add-functionality-required-by-printk.patch b/kernel/patches-5.4.x-rt/0024-0007-printk-rb-add-functionality-required-by-printk.patch new file mode 100644 index 000000000..5d5365b87 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0024-0007-printk-rb-add-functionality-required-by-printk.patch @@ -0,0 +1,159 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:45 +0100 +Subject: [PATCH 07/25] printk-rb: add functionality required by printk + +The printk subsystem needs to be able to query the size of the ring +buffer, seek to specific entries within the ring buffer, and track +if records could not be stored in the ring buffer. 
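+
+A rough sketch of how a caller can combine the seek and lost-record helpers
+("iter", "h", "rb" and "last_seq" are caller state; last_seq is the sequence
+number of the last record already consumed):
+
+    /* resume reading after a previously seen record; a return of 0 means
+     * the record does not exist yet and the iterator is left at the end
+     */
+    if (prb_iter_seek(&iter, last_seq) < 0)
+        prb_iter_init(&iter, rb, NULL);  /* invalidated, restart at oldest */
+
+    /* a writer that failed to reserve space records the loss, so readers
+     * later notice the gap in sequence numbers
+     */
+    if (!prb_reserve(&h, rb, size))
+        prb_inc_lost(rb);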
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/printk_ringbuffer.h | 5 ++ + lib/printk_ringbuffer.c | 95 ++++++++++++++++++++++++++++++++++++++ + 2 files changed, 100 insertions(+) + +--- a/include/linux/printk_ringbuffer.h ++++ b/include/linux/printk_ringbuffer.h +@@ -17,6 +17,7 @@ struct printk_ringbuffer { + unsigned int size_bits; + + u64 seq; ++ atomic_long_t lost; + + atomic_long_t tail; + atomic_long_t head; +@@ -78,6 +79,7 @@ static struct printk_ringbuffer name = { + .buffer = &_##name##_buffer[0], \ + .size_bits = szbits, \ + .seq = 0, \ ++ .lost = ATOMIC_LONG_INIT(0), \ + .tail = ATOMIC_LONG_INIT(-111 * sizeof(long)), \ + .head = ATOMIC_LONG_INIT(-111 * sizeof(long)), \ + .reserve = ATOMIC_LONG_INIT(-111 * sizeof(long)), \ +@@ -100,9 +102,12 @@ void prb_iter_copy(struct prb_iterator * + int prb_iter_next(struct prb_iterator *iter, char *buf, int size, u64 *seq); + int prb_iter_wait_next(struct prb_iterator *iter, char *buf, int size, + u64 *seq); ++int prb_iter_seek(struct prb_iterator *iter, u64 seq); + int prb_iter_data(struct prb_iterator *iter, char *buf, int size, u64 *seq); + + /* utility functions */ ++int prb_buffer_size(struct printk_ringbuffer *rb); ++void prb_inc_lost(struct printk_ringbuffer *rb); + void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store); + void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store); + +--- a/lib/printk_ringbuffer.c ++++ b/lib/printk_ringbuffer.c +@@ -175,11 +175,16 @@ void prb_commit(struct prb_handle *h) + head = PRB_WRAP_LPOS(rb, head, 1); + continue; + } ++ while (atomic_long_read(&rb->lost)) { ++ atomic_long_dec(&rb->lost); ++ rb->seq++; ++ } + e->seq = ++rb->seq; + head += e->size; + changed = true; + } + atomic_long_set_release(&rb->head, res); ++ + atomic_dec(&rb->ctx); + + if (atomic_long_read(&rb->reserve) == res) +@@ -492,3 +497,93 @@ int prb_iter_wait_next(struct prb_iterat + + return ret; + } ++ ++/* ++ * prb_iter_seek: Seek forward to a specific record. ++ * @iter: Iterator to advance. ++ * @seq: Record number to advance to. ++ * ++ * Advance @iter such that a following call to prb_iter_data() will provide ++ * the contents of the specified record. If a record is specified that does ++ * not yet exist, advance @iter to the end of the record list. ++ * ++ * Note that iterators cannot be rewound. So if a record is requested that ++ * exists but is previous to @iter in position, @iter is considered invalid. ++ * ++ * It is safe to call this function from any context and state. ++ * ++ * Returns 1 on succces, 0 if specified record does not yet exist (@iter is ++ * now at the end of the list), or -EINVAL if @iter is now invalid. ++ */ ++int prb_iter_seek(struct prb_iterator *iter, u64 seq) ++{ ++ u64 cur_seq; ++ int ret; ++ ++ /* first check if the iterator is already at the wanted seq */ ++ if (seq == 0) { ++ if (iter->lpos == PRB_INIT) ++ return 1; ++ else ++ return -EINVAL; ++ } ++ if (iter->lpos != PRB_INIT) { ++ if (prb_iter_data(iter, NULL, 0, &cur_seq) >= 0) { ++ if (cur_seq == seq) ++ return 1; ++ if (cur_seq > seq) ++ return -EINVAL; ++ } ++ } ++ ++ /* iterate to find the wanted seq */ ++ for (;;) { ++ ret = prb_iter_next(iter, NULL, 0, &cur_seq); ++ if (ret <= 0) ++ break; ++ ++ if (cur_seq == seq) ++ break; ++ ++ if (cur_seq > seq) { ++ ret = -EINVAL; ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++/* ++ * prb_buffer_size: Get the size of the ring buffer. ++ * @rb: The ring buffer to get the size of. 
++ *
++ * Return the number of bytes used for the ring buffer entry storage area.
++ * Note that this area stores both entry header and entry data. Therefore
++ * this represents an upper bound to the amount of data that can be stored
++ * in the ring buffer.
++ *
++ * It is safe to call this function from any context and state.
++ *
++ * Returns the size in bytes of the entry storage area.
++ */
++int prb_buffer_size(struct printk_ringbuffer *rb)
++{
++	return PRB_SIZE(rb);
++}
++
++/*
++ * prb_inc_lost: Increment the seq counter to signal a lost record.
++ * @rb: The ring buffer to increment the seq of.
++ *
++ * Increment the seq counter so that a seq number is intentionally missing
++ * for the readers. This allows readers to identify that a record is
++ * missing. A writer will typically use this function if prb_reserve()
++ * fails.
++ *
++ * It is safe to call this function from any context and state.
++ */
++void prb_inc_lost(struct printk_ringbuffer *rb)
++{
++	atomic_long_inc(&rb->lost);
++}
diff --git a/kernel/patches-5.4.x-rt/0025-0008-printk-add-ring-buffer-and-kthread.patch b/kernel/patches-5.4.x-rt/0025-0008-printk-add-ring-buffer-and-kthread.patch
new file mode 100644
index 000000000..4bd53be65
--- /dev/null
+++ b/kernel/patches-5.4.x-rt/0025-0008-printk-add-ring-buffer-and-kthread.patch
@@ -0,0 +1,168 @@
+From: John Ogness
+Date: Tue, 12 Feb 2019 15:29:46 +0100
+Subject: [PATCH 08/25] printk: add ring buffer and kthread
+
+The printk ring buffer provides an NMI-safe interface for writing
+messages to a ring buffer. Using such a buffer alleviates printk
+callers from the current burdens of disabled preemption while calling
+the console drivers (and possibly printing out many messages that
+another task put into the log buffer).
+
+Create a ring buffer to be used for storing messages to be
+printed to the consoles.
+
+Create a dedicated printk kthread to block on the ring buffer
+and call the console drivers for the read messages.
+
+NOTE: The printk_delay is relocated to _after_ the message is
+      printed, where it makes more sense.
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 105 insertions(+) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -45,6 +45,8 @@ + #include + #include + #include ++#include ++#include + #include + #include + #include +@@ -417,7 +419,12 @@ DEFINE_RAW_SPINLOCK(logbuf_lock); + printk_safe_exit_irqrestore(flags); \ + } while (0) + ++DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock); ++ + #ifdef CONFIG_PRINTK ++/* record buffer */ ++DECLARE_STATIC_PRINTKRB(printk_rb, CONFIG_LOG_BUF_SHIFT, &printk_cpulock); ++ + DECLARE_WAIT_QUEUE_HEAD(log_wait); + /* the next printk record to read by syslog(READ) or /proc/kmsg */ + static u64 syslog_seq; +@@ -780,6 +787,10 @@ static ssize_t msg_print_ext_body(char * + return p - buf; + } + ++#define PRINTK_SPRINT_MAX (LOG_LINE_MAX + PREFIX_MAX) ++#define PRINTK_RECORD_MAX (sizeof(struct printk_log) + \ ++ CONSOLE_EXT_LOG_MAX + PRINTK_SPRINT_MAX) ++ + /* /dev/kmsg - userspace message inject/listen interface */ + struct devkmsg_user { + u64 seq; +@@ -1620,6 +1631,34 @@ SYSCALL_DEFINE3(syslog, int, type, char + return do_syslog(type, buf, len, SYSLOG_FROM_READER); + } + ++static void format_text(struct printk_log *msg, u64 seq, ++ char *ext_text, size_t *ext_len, ++ char *text, size_t *len, bool time) ++{ ++ if (suppress_message_printing(msg->level)) { ++ /* ++ * Skip record that has level above the console ++ * loglevel and update each console's local seq. ++ */ ++ *len = 0; ++ *ext_len = 0; ++ return; ++ } ++ ++ *len = msg_print_text(msg, console_msg_format & MSG_FORMAT_SYSLOG, ++ time, text, PRINTK_SPRINT_MAX); ++ if (nr_ext_console_drivers) { ++ *ext_len = msg_print_ext_header(ext_text, CONSOLE_EXT_LOG_MAX, ++ msg, seq); ++ *ext_len += msg_print_ext_body(ext_text + *ext_len, ++ CONSOLE_EXT_LOG_MAX - *ext_len, ++ log_dict(msg), msg->dict_len, ++ log_text(msg), msg->text_len); ++ } else { ++ *ext_len = 0; ++ } ++} ++ + /* + * Special console_lock variants that help to reduce the risk of soft-lockups. + * They allow to pass console_lock to another printk() call using a busy wait. 
+@@ -2974,6 +3013,72 @@ void wake_up_klogd(void) + preempt_enable(); + } + ++static int printk_kthread_func(void *data) ++{ ++ struct prb_iterator iter; ++ struct printk_log *msg; ++ size_t ext_len; ++ char *ext_text; ++ u64 master_seq; ++ size_t len; ++ char *text; ++ char *buf; ++ int ret; ++ ++ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL); ++ buf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL); ++ if (!ext_text || !text || !buf) ++ return -1; ++ ++ prb_iter_init(&iter, &printk_rb, NULL); ++ ++ /* the printk kthread never exits */ ++ for (;;) { ++ ret = prb_iter_wait_next(&iter, buf, ++ PRINTK_RECORD_MAX, &master_seq); ++ if (ret == -ERESTARTSYS) { ++ continue; ++ } else if (ret < 0) { ++ /* iterator invalid, start over */ ++ prb_iter_init(&iter, &printk_rb, NULL); ++ continue; ++ } ++ ++ msg = (struct printk_log *)buf; ++ format_text(msg, master_seq, ext_text, &ext_len, text, ++ &len, printk_time); ++ ++ console_lock(); ++ if (len > 0 || ext_len > 0) { ++ call_console_drivers(ext_text, ext_len, text, len); ++ boot_delay_msec(msg->level); ++ printk_delay(); ++ } ++ console_unlock(); ++ } ++ ++ kfree(ext_text); ++ kfree(text); ++ kfree(buf); ++ ++ return 0; ++} ++ ++static int __init init_printk_kthread(void) ++{ ++ struct task_struct *thread; ++ ++ thread = kthread_run(printk_kthread_func, NULL, "printk"); ++ if (IS_ERR(thread)) { ++ pr_err("printk: unable to create printing thread\n"); ++ return PTR_ERR(thread); ++ } ++ ++ return 0; ++} ++late_initcall(init_printk_kthread); ++ + void defer_console_output(void) + { + preempt_disable(); diff --git a/kernel/patches-5.4.x-rt/0026-0009-printk-remove-exclusive-console-hack.patch b/kernel/patches-5.4.x-rt/0026-0009-printk-remove-exclusive-console-hack.patch new file mode 100644 index 000000000..7369eb3e8 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0026-0009-printk-remove-exclusive-console-hack.patch @@ -0,0 +1,101 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:47 +0100 +Subject: [PATCH 09/25] printk: remove exclusive console hack + +In order to support printing the printk log history when new +consoles are registered, a global exclusive_console variable is +temporarily set. This only works because printk runs with +preemption disabled. + +When console printing is moved to a fully preemptible dedicated +kthread, this hack no longer works. + +Remove exclusive_console usage. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 30 ++++-------------------------- + 1 file changed, 4 insertions(+), 26 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -269,11 +269,6 @@ static void __up_console_sem(unsigned lo + static int console_locked, console_suspended; + + /* +- * If exclusive_console is non-NULL then only this console is to be printed to. 
+- */ +-static struct console *exclusive_console; +- +-/* + * Array of consoles built from command line options (console=) + */ + +@@ -443,7 +438,6 @@ static u32 log_next_idx; + /* the next printk record to write to the console */ + static u64 console_seq; + static u32 console_idx; +-static u64 exclusive_console_stop_seq; + + /* the next printk record to read after the last 'clear' command */ + static u64 clear_seq; +@@ -1815,8 +1809,6 @@ static void call_console_drivers(const c + return; + + for_each_console(con) { +- if (exclusive_console && con != exclusive_console) +- continue; + if (!(con->flags & CON_ENABLED)) + continue; + if (!con->write) +@@ -2109,7 +2101,6 @@ static u64 syslog_seq; + static u32 syslog_idx; + static u64 console_seq; + static u32 console_idx; +-static u64 exclusive_console_stop_seq; + static u64 log_first_seq; + static u32 log_first_idx; + static u64 log_next_seq; +@@ -2478,12 +2469,6 @@ void console_unlock(void) + goto skip; + } + +- /* Output to all consoles once old messages replayed. */ +- if (unlikely(exclusive_console && +- console_seq >= exclusive_console_stop_seq)) { +- exclusive_console = NULL; +- } +- + len += msg_print_text(msg, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time, text + len, sizeof(text) - len); +@@ -2809,17 +2794,6 @@ void register_console(struct console *ne + * for us. + */ + logbuf_lock_irqsave(flags); +- /* +- * We're about to replay the log buffer. Only do this to the +- * just-registered console to avoid excessive message spam to +- * the already-registered consoles. +- * +- * Set exclusive_console with disabled interrupts to reduce +- * race window with eventual console_flush_on_panic() that +- * ignores console_lock. +- */ +- exclusive_console = newcon; +- exclusive_console_stop_seq = console_seq; + console_seq = syslog_seq; + console_idx = syslog_idx; + logbuf_unlock_irqrestore(flags); +@@ -2833,6 +2807,10 @@ void register_console(struct console *ne + * boot consoles, real consoles, etc - this is to ensure that end + * users know there might be something in the kernel's log buffer that + * went to the bootconsole (that they do not see on the real console) ++ * ++ * This message is also important because it will trigger the ++ * printk kthread to begin dumping the log buffer to the newly ++ * registered console. + */ + pr_info("%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , diff --git a/kernel/patches-5.4.x-rt/0027-0010-printk-redirect-emit-store-to-new-ringbuffer.patch b/kernel/patches-5.4.x-rt/0027-0010-printk-redirect-emit-store-to-new-ringbuffer.patch new file mode 100644 index 000000000..0355541c4 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0027-0010-printk-redirect-emit-store-to-new-ringbuffer.patch @@ -0,0 +1,437 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:48 +0100 +Subject: [PATCH 10/25] printk: redirect emit/store to new ringbuffer + +vprintk_emit and vprintk_store are the main functions that all printk +variants eventually go through. Change these to store the message in +the new printk ring buffer that the printk kthread is reading. + +Remove functions no longer in use because of the changes to +vprintk_emit and vprintk_store. + +In order to handle interrupts and NMIs, a second per-cpu ring buffer +(sprint_rb) is added. This ring buffer is used for NMI-safe memory +allocation in order to format the printk messages. + +NOTE: LOG_CONT is ignored for now and handled as individual messages. 
+ LOG_CONT functions are masked behind "#if 0" blocks until their + functionality can be restored + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 325 +++++++------------------------------------------ + 1 file changed, 51 insertions(+), 274 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -517,90 +517,6 @@ static u32 log_next(u32 idx) + return idx + msg->len; + } + +-/* +- * Check whether there is enough free space for the given message. +- * +- * The same values of first_idx and next_idx mean that the buffer +- * is either empty or full. +- * +- * If the buffer is empty, we must respect the position of the indexes. +- * They cannot be reset to the beginning of the buffer. +- */ +-static int logbuf_has_space(u32 msg_size, bool empty) +-{ +- u32 free; +- +- if (log_next_idx > log_first_idx || empty) +- free = max(log_buf_len - log_next_idx, log_first_idx); +- else +- free = log_first_idx - log_next_idx; +- +- /* +- * We need space also for an empty header that signalizes wrapping +- * of the buffer. +- */ +- return free >= msg_size + sizeof(struct printk_log); +-} +- +-static int log_make_free_space(u32 msg_size) +-{ +- while (log_first_seq < log_next_seq && +- !logbuf_has_space(msg_size, false)) { +- /* drop old messages until we have enough contiguous space */ +- log_first_idx = log_next(log_first_idx); +- log_first_seq++; +- } +- +- if (clear_seq < log_first_seq) { +- clear_seq = log_first_seq; +- clear_idx = log_first_idx; +- } +- +- /* sequence numbers are equal, so the log buffer is empty */ +- if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) +- return 0; +- +- return -ENOMEM; +-} +- +-/* compute the message size including the padding bytes */ +-static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) +-{ +- u32 size; +- +- size = sizeof(struct printk_log) + text_len + dict_len; +- *pad_len = (-size) & (LOG_ALIGN - 1); +- size += *pad_len; +- +- return size; +-} +- +-/* +- * Define how much of the log buffer we could take at maximum. The value +- * must be greater than two. Note that only half of the buffer is available +- * when the index points to the middle. +- */ +-#define MAX_LOG_TAKE_PART 4 +-static const char trunc_msg[] = ""; +- +-static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, +- u16 *dict_len, u32 *pad_len) +-{ +- /* +- * The message should not take the whole buffer. Otherwise, it might +- * get removed too soon. 
+- */ +- u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; +- if (*text_len > max_text_len) +- *text_len = max_text_len; +- /* enable the warning message */ +- *trunc_msg_len = strlen(trunc_msg); +- /* disable the "dict" completely */ +- *dict_len = 0; +- /* compute the size again, count also the warning message */ +- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); +-} +- + /* insert record into the buffer, discard old ones, update heads */ + static int log_store(u32 caller_id, int facility, int level, + enum log_flags flags, u64 ts_nsec, +@@ -608,57 +524,39 @@ static int log_store(u32 caller_id, int + const char *text, u16 text_len) + { + struct printk_log *msg; +- u32 size, pad_len; +- u16 trunc_msg_len = 0; +- +- /* number of '\0' padding bytes to next message */ +- size = msg_used_size(text_len, dict_len, &pad_len); ++ struct prb_handle h; ++ char *rbuf; ++ u32 size; + +- if (log_make_free_space(size)) { +- /* truncate the message if it is too long for empty buffer */ +- size = truncate_msg(&text_len, &trunc_msg_len, +- &dict_len, &pad_len); +- /* survive when the log buffer is too small for trunc_msg */ +- if (log_make_free_space(size)) +- return 0; +- } ++ size = sizeof(*msg) + text_len + dict_len; + +- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { ++ rbuf = prb_reserve(&h, &printk_rb, size); ++ if (!rbuf) { + /* +- * This message + an additional empty header does not fit +- * at the end of the buffer. Add an empty header with len == 0 +- * to signify a wrap around. ++ * An emergency message would have been printed, but ++ * it cannot be stored in the log. + */ +- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log)); +- log_next_idx = 0; ++ prb_inc_lost(&printk_rb); ++ return 0; + } + + /* fill message */ +- msg = (struct printk_log *)(log_buf + log_next_idx); ++ msg = (struct printk_log *)rbuf; + memcpy(log_text(msg), text, text_len); + msg->text_len = text_len; +- if (trunc_msg_len) { +- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); +- msg->text_len += trunc_msg_len; +- } + memcpy(log_dict(msg), dict, dict_len); + msg->dict_len = dict_len; + msg->facility = facility; + msg->level = level & 7; + msg->flags = flags & 0x1f; +- if (ts_nsec > 0) +- msg->ts_nsec = ts_nsec; +- else +- msg->ts_nsec = local_clock(); ++ msg->ts_nsec = ts_nsec; + #ifdef CONFIG_PRINTK_CALLER + msg->caller_id = caller_id; + #endif +- memset(log_dict(msg) + dict_len, 0, pad_len); + msg->len = size; + + /* insert message */ +- log_next_idx += msg->len; +- log_next_seq++; ++ prb_commit(&h); + + return msg->text_len; + } +@@ -1729,70 +1627,6 @@ static int console_lock_spinning_disable + return 1; + } + +-/** +- * console_trylock_spinning - try to get console_lock by busy waiting +- * +- * This allows to busy wait for the console_lock when the current +- * owner is running in specially marked sections. It means that +- * the current owner is running and cannot reschedule until it +- * is ready to lose the lock. 
+- * +- * Return: 1 if we got the lock, 0 othrewise +- */ +-static int console_trylock_spinning(void) +-{ +- struct task_struct *owner = NULL; +- bool waiter; +- bool spin = false; +- unsigned long flags; +- +- if (console_trylock()) +- return 1; +- +- printk_safe_enter_irqsave(flags); +- +- raw_spin_lock(&console_owner_lock); +- owner = READ_ONCE(console_owner); +- waiter = READ_ONCE(console_waiter); +- if (!waiter && owner && owner != current) { +- WRITE_ONCE(console_waiter, true); +- spin = true; +- } +- raw_spin_unlock(&console_owner_lock); +- +- /* +- * If there is an active printk() writing to the +- * consoles, instead of having it write our data too, +- * see if we can offload that load from the active +- * printer, and do some printing ourselves. +- * Go into a spin only if there isn't already a waiter +- * spinning, and there is an active printer, and +- * that active printer isn't us (recursive printk?). +- */ +- if (!spin) { +- printk_safe_exit_irqrestore(flags); +- return 0; +- } +- +- /* We spin waiting for the owner to release us */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +- /* Owner will clear console_waiter on hand off */ +- while (READ_ONCE(console_waiter)) +- cpu_relax(); +- spin_release(&console_owner_dep_map, 1, _THIS_IP_); +- +- printk_safe_exit_irqrestore(flags); +- /* +- * The owner passed the console lock to us. +- * Since we did not spin on console lock, annotate +- * this as a trylock. Otherwise lockdep will +- * complain. +- */ +- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); +- +- return 1; +-} +- + /* + * Call the console drivers, asking them to write out + * log_buf[start] to log_buf[end - 1]. +@@ -1813,7 +1647,7 @@ static void call_console_drivers(const c + continue; + if (!con->write) + continue; +- if (!cpu_online(smp_processor_id()) && ++ if (!cpu_online(raw_smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + continue; + if (con->flags & CON_EXTENDED) +@@ -1843,6 +1677,8 @@ static inline u32 printk_caller_id(void) + 0x80000000 + raw_smp_processor_id(); + } + ++/* FIXME: no support for LOG_CONT */ ++#if 0 + /* + * Continuation lines are buffered, and not committed to the record buffer + * until the line is complete, or a race forces it. The line fragments +@@ -1898,56 +1734,45 @@ static bool cont_add(u32 caller_id, int + + return true; + } ++#endif /* 0 */ + +-static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len) +-{ +- const u32 caller_id = printk_caller_id(); +- +- /* +- * If an earlier line was buffered, and we're a continuation +- * write from the same context, try to add it to the buffer. +- */ +- if (cont.len) { +- if (cont.caller_id == caller_id && (lflags & LOG_CONT)) { +- if (cont_add(caller_id, facility, level, lflags, text, text_len)) +- return text_len; +- } +- /* Otherwise, make sure it's flushed */ +- cont_flush(); +- } +- +- /* Skip empty continuation lines that couldn't be added - they just flush */ +- if (!text_len && (lflags & LOG_CONT)) +- return 0; +- +- /* If it doesn't end in a newline, try to buffer the current line */ +- if (!(lflags & LOG_NEWLINE)) { +- if (cont_add(caller_id, facility, level, lflags, text, text_len)) +- return text_len; +- } +- +- /* Store it in the record log */ +- return log_store(caller_id, facility, level, lflags, 0, +- dict, dictlen, text, text_len); +-} +- +-/* Must be called under logbuf_lock. 
*/ + int vprintk_store(int facility, int level, + const char *dict, size_t dictlen, + const char *fmt, va_list args) + { +- static char textbuf[LOG_LINE_MAX]; +- char *text = textbuf; +- size_t text_len; ++ return vprintk_emit(facility, level, dict, dictlen, fmt, args); ++} ++ ++/* ring buffer used as memory allocator for temporary sprint buffers */ ++DECLARE_STATIC_PRINTKRB(sprint_rb, ++ ilog2(PRINTK_RECORD_MAX + sizeof(struct prb_entry) + ++ sizeof(long)) + 2, &printk_cpulock); ++ ++asmlinkage int vprintk_emit(int facility, int level, ++ const char *dict, size_t dictlen, ++ const char *fmt, va_list args) ++{ ++ const u32 caller_id = printk_caller_id(); + enum log_flags lflags = 0; ++ int printed_len = 0; ++ struct prb_handle h; ++ size_t text_len; ++ u64 ts_nsec; ++ char *text; ++ char *rbuf; + +- /* +- * The printf needs to come first; we need the syslog +- * prefix which might be passed-in as a parameter. +- */ +- text_len = vscnprintf(text, sizeof(textbuf), fmt, args); ++ ts_nsec = local_clock(); ++ ++ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_SPRINT_MAX); ++ if (!rbuf) { ++ prb_inc_lost(&printk_rb); ++ return printed_len; ++ } ++ ++ text = rbuf; ++ text_len = vscnprintf(text, PRINTK_SPRINT_MAX, fmt, args); + +- /* mark and strip a trailing newline */ ++ /* strip and flag a trailing newline */ + if (text_len && text[text_len-1] == '\n') { + text_len--; + lflags |= LOG_NEWLINE; +@@ -1978,58 +1803,10 @@ int vprintk_store(int facility, int leve + if (dict) + lflags |= LOG_NEWLINE; + +- return log_output(facility, level, lflags, +- dict, dictlen, text, text_len); +-} +- +-asmlinkage int vprintk_emit(int facility, int level, +- const char *dict, size_t dictlen, +- const char *fmt, va_list args) +-{ +- int printed_len; +- bool in_sched = false, pending_output; +- unsigned long flags; +- u64 curr_log_seq; +- +- /* Suppress unimportant messages after panic happens */ +- if (unlikely(suppress_printk)) +- return 0; +- +- if (level == LOGLEVEL_SCHED) { +- level = LOGLEVEL_DEFAULT; +- in_sched = true; +- } +- +- boot_delay_msec(level); +- printk_delay(); +- +- /* This stops the holder of console_sem just where we want him */ +- logbuf_lock_irqsave(flags); +- curr_log_seq = log_next_seq; +- printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args); +- pending_output = (curr_log_seq != log_next_seq); +- logbuf_unlock_irqrestore(flags); +- +- /* If called from the scheduler, we can not call up(). */ +- if (!in_sched && pending_output) { +- /* +- * Disable preemption to avoid being preempted while holding +- * console_sem which would prevent anyone from printing to +- * console +- */ +- preempt_disable(); +- /* +- * Try to acquire and then immediately release the console +- * semaphore. The release will print out buffers and wake up +- * /dev/kmsg and syslog() users. 
+- */ +- if (console_trylock_spinning()) +- console_unlock(); +- preempt_enable(); +- } ++ printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, ++ dict, dictlen, text, text_len); + +- if (pending_output) +- wake_up_klogd(); ++ prb_commit(&h); + return printed_len; + } + EXPORT_SYMBOL(vprintk_emit); +@@ -2494,7 +2271,7 @@ void console_unlock(void) + console_lock_spinning_enable(); + + stop_critical_timings(); /* don't trace print latency */ +- call_console_drivers(ext_text, ext_len, text, len); ++ //call_console_drivers(ext_text, ext_len, text, len); + start_critical_timings(); + + if (console_lock_spinning_disable_and_check()) { diff --git a/kernel/patches-5.4.x-rt/0028-0011-printk_safe-remove-printk-safe-code.patch b/kernel/patches-5.4.x-rt/0028-0011-printk_safe-remove-printk-safe-code.patch new file mode 100644 index 000000000..aa8be9a69 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0028-0011-printk_safe-remove-printk-safe-code.patch @@ -0,0 +1,699 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:49 +0100 +Subject: [PATCH 11/25] printk_safe: remove printk safe code + +vprintk variants are now NMI-safe so there is no longer a need for +the "safe" calls. + +NOTE: This also removes printk flushing functionality. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + arch/powerpc/kernel/traps.c | 1 + arch/powerpc/kernel/watchdog.c | 5 + include/linux/hardirq.h | 2 + include/linux/printk.h | 27 -- + init/main.c | 1 + kernel/kexec_core.c | 1 + kernel/panic.c | 3 + kernel/printk/Makefile | 1 + kernel/printk/internal.h | 30 -- + kernel/printk/printk.c | 13 - + kernel/printk/printk_safe.c | 415 ----------------------------------------- + kernel/trace/trace.c | 2 + lib/nmi_backtrace.c | 6 + 13 files changed, 7 insertions(+), 500 deletions(-) + delete mode 100644 kernel/printk/printk_safe.c + +--- a/arch/powerpc/kernel/traps.c ++++ b/arch/powerpc/kernel/traps.c +@@ -171,7 +171,6 @@ extern void panic_flush_kmsg_start(void) + + extern void panic_flush_kmsg_end(void) + { +- printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); + debug_locks_off(); +--- a/arch/powerpc/kernel/watchdog.c ++++ b/arch/powerpc/kernel/watchdog.c +@@ -181,11 +181,6 @@ static void watchdog_smp_panic(int cpu, + + wd_smp_unlock(&flags); + +- printk_safe_flush(); +- /* +- * printk_safe_flush() seems to require another print +- * before anything actually goes out to console. +- */ + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + +--- a/include/linux/hardirq.h ++++ b/include/linux/hardirq.h +@@ -68,7 +68,6 @@ extern void irq_exit(void); + #define nmi_enter() \ + do { \ + arch_nmi_enter(); \ +- printk_nmi_enter(); \ + lockdep_off(); \ + ftrace_nmi_enter(); \ + BUG_ON(in_nmi()); \ +@@ -85,7 +84,6 @@ extern void irq_exit(void); + preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ + ftrace_nmi_exit(); \ + lockdep_on(); \ +- printk_nmi_exit(); \ + arch_nmi_exit(); \ + } while (0) + +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -146,18 +146,6 @@ static inline __printf(1, 2) __cold + void early_printk(const char *s, ...) 
{ } + #endif + +-#ifdef CONFIG_PRINTK_NMI +-extern void printk_nmi_enter(void); +-extern void printk_nmi_exit(void); +-extern void printk_nmi_direct_enter(void); +-extern void printk_nmi_direct_exit(void); +-#else +-static inline void printk_nmi_enter(void) { } +-static inline void printk_nmi_exit(void) { } +-static inline void printk_nmi_direct_enter(void) { } +-static inline void printk_nmi_direct_exit(void) { } +-#endif /* PRINTK_NMI */ +- + #ifdef CONFIG_PRINTK + asmlinkage __printf(5, 0) + int vprintk_emit(int facility, int level, +@@ -202,9 +190,6 @@ void __init setup_log_buf(int early); + void dump_stack_print_info(const char *log_lvl); + void show_regs_print_info(const char *log_lvl); + extern asmlinkage void dump_stack(void) __cold; +-extern void printk_safe_init(void); +-extern void printk_safe_flush(void); +-extern void printk_safe_flush_on_panic(void); + #else + static inline __printf(1, 0) + int vprintk(const char *s, va_list args) +@@ -268,18 +253,6 @@ static inline void show_regs_print_info( + static inline void dump_stack(void) + { + } +- +-static inline void printk_safe_init(void) +-{ +-} +- +-static inline void printk_safe_flush(void) +-{ +-} +- +-static inline void printk_safe_flush_on_panic(void) +-{ +-} + #endif + + extern int kptr_restrict; +--- a/init/main.c ++++ b/init/main.c +@@ -694,7 +694,6 @@ asmlinkage __visible void __init start_k + boot_init_stack_canary(); + + time_init(); +- printk_safe_init(); + perf_event_init(); + profile_init(); + call_function_init(); +--- a/kernel/kexec_core.c ++++ b/kernel/kexec_core.c +@@ -972,7 +972,6 @@ void crash_kexec(struct pt_regs *regs) + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ +- printk_safe_flush_on_panic(); + __crash_kexec(regs); + + /* +--- a/kernel/panic.c ++++ b/kernel/panic.c +@@ -237,7 +237,6 @@ void panic(const char *fmt, ...) + * Bypass the panic_cpu check and call __crash_kexec directly. + */ + if (!_crash_kexec_post_notifiers) { +- printk_safe_flush_on_panic(); + __crash_kexec(NULL); + + /* +@@ -261,8 +260,6 @@ void panic(const char *fmt, ...) + */ + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + +- /* Call flush even twice. 
It tries harder with a single online CPU */ +- printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + + /* +--- a/kernel/printk/Makefile ++++ b/kernel/printk/Makefile +@@ -1,4 +1,3 @@ + # SPDX-License-Identifier: GPL-2.0-only + obj-y = printk.o +-obj-$(CONFIG_PRINTK) += printk_safe.o + obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o +--- a/kernel/printk/internal.h ++++ b/kernel/printk/internal.h +@@ -20,32 +20,6 @@ int vprintk_store(int facility, int leve + __printf(1, 0) int vprintk_default(const char *fmt, va_list args); + __printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); + __printf(1, 0) int vprintk_func(const char *fmt, va_list args); +-void __printk_safe_enter(void); +-void __printk_safe_exit(void); +- +-#define printk_safe_enter_irqsave(flags) \ +- do { \ +- local_irq_save(flags); \ +- __printk_safe_enter(); \ +- } while (0) +- +-#define printk_safe_exit_irqrestore(flags) \ +- do { \ +- __printk_safe_exit(); \ +- local_irq_restore(flags); \ +- } while (0) +- +-#define printk_safe_enter_irq() \ +- do { \ +- local_irq_disable(); \ +- __printk_safe_enter(); \ +- } while (0) +- +-#define printk_safe_exit_irq() \ +- do { \ +- __printk_safe_exit(); \ +- local_irq_enable(); \ +- } while (0) + + void defer_console_output(void); + +@@ -58,10 +32,10 @@ void defer_console_output(void); + * semaphore and some of console functions (console_unlock()/etc.), so + * printk-safe must preserve the existing local IRQ guarantees. + */ ++#endif /* CONFIG_PRINTK */ ++ + #define printk_safe_enter_irqsave(flags) local_irq_save(flags) + #define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) + + #define printk_safe_enter_irq() local_irq_disable() + #define printk_safe_exit_irq() local_irq_enable() +- +-#endif /* CONFIG_PRINTK */ +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1736,13 +1736,6 @@ static bool cont_add(u32 caller_id, int + } + #endif /* 0 */ + +-int vprintk_store(int facility, int level, +- const char *dict, size_t dictlen, +- const char *fmt, va_list args) +-{ +- return vprintk_emit(facility, level, dict, dictlen, fmt, args); +-} +- + /* ring buffer used as memory allocator for temporary sprint buffers */ + DECLARE_STATIC_PRINTKRB(sprint_rb, + ilog2(PRINTK_RECORD_MAX + sizeof(struct prb_entry) + +@@ -1811,6 +1804,11 @@ asmlinkage int vprintk_emit(int facility + } + EXPORT_SYMBOL(vprintk_emit); + ++__printf(1, 0) int vprintk_func(const char *fmt, va_list args) ++{ ++ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); ++} ++ + asmlinkage int vprintk(const char *fmt, va_list args) + { + return vprintk_func(fmt, args); +@@ -3211,5 +3209,4 @@ void kmsg_dump_rewind(struct kmsg_dumper + logbuf_unlock_irqrestore(flags); + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); +- + #endif +--- a/kernel/printk/printk_safe.c ++++ /dev/null +@@ -1,415 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-or-later +-/* +- * printk_safe.c - Safe printk for printk-deadlock-prone contexts +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include "internal.h" +- +-/* +- * printk() could not take logbuf_lock in NMI context. Instead, +- * it uses an alternative implementation that temporary stores +- * the strings into a per-CPU buffer. The content of the buffer +- * is later flushed into the main ring buffer via IRQ work. +- * +- * The alternative implementation is chosen transparently +- * by examinig current printk() context mask stored in @printk_context +- * per-CPU variable. 
+- * +- * The implementation allows to flush the strings also from another CPU. +- * There are situations when we want to make sure that all buffers +- * were handled or when IRQs are blocked. +- */ +-static int printk_safe_irq_ready __read_mostly; +- +-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ +- sizeof(atomic_t) - \ +- sizeof(atomic_t) - \ +- sizeof(struct irq_work)) +- +-struct printk_safe_seq_buf { +- atomic_t len; /* length of written data */ +- atomic_t message_lost; +- struct irq_work work; /* IRQ work that flushes the buffer */ +- unsigned char buffer[SAFE_LOG_BUF_LEN]; +-}; +- +-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); +-static DEFINE_PER_CPU(int, printk_context); +- +-#ifdef CONFIG_PRINTK_NMI +-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); +-#endif +- +-/* Get flushed in a more safe context. */ +-static void queue_flush_work(struct printk_safe_seq_buf *s) +-{ +- if (printk_safe_irq_ready) +- irq_work_queue(&s->work); +-} +- +-/* +- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe +- * have dedicated buffers, because otherwise printk-safe preempted by +- * NMI-printk would have overwritten the NMI messages. +- * +- * The messages are flushed from irq work (or from panic()), possibly, +- * from other CPU, concurrently with printk_safe_log_store(). Should this +- * happen, printk_safe_log_store() will notice the buffer->len mismatch +- * and repeat the write. +- */ +-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, +- const char *fmt, va_list args) +-{ +- int add; +- size_t len; +- va_list ap; +- +-again: +- len = atomic_read(&s->len); +- +- /* The trailing '\0' is not counted into len. */ +- if (len >= sizeof(s->buffer) - 1) { +- atomic_inc(&s->message_lost); +- queue_flush_work(s); +- return 0; +- } +- +- /* +- * Make sure that all old data have been read before the buffer +- * was reset. This is not needed when we just append data. +- */ +- if (!len) +- smp_rmb(); +- +- va_copy(ap, args); +- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); +- va_end(ap); +- if (!add) +- return 0; +- +- /* +- * Do it once again if the buffer has been flushed in the meantime. +- * Note that atomic_cmpxchg() is an implicit memory barrier that +- * makes sure that the data were written before updating s->len. +- */ +- if (atomic_cmpxchg(&s->len, len, len + add) != len) +- goto again; +- +- queue_flush_work(s); +- return add; +-} +- +-static inline void printk_safe_flush_line(const char *text, int len) +-{ +- /* +- * Avoid any console drivers calls from here, because we may be +- * in NMI or printk_safe context (when in panic). The messages +- * must go only into the ring buffer at this stage. Consoles will +- * get explicitly called later when a crashdump is not generated. +- */ +- printk_deferred("%.*s", len, text); +-} +- +-/* printk part of the temporary buffer line by line */ +-static int printk_safe_flush_buffer(const char *start, size_t len) +-{ +- const char *c, *end; +- bool header; +- +- c = start; +- end = start + len; +- header = true; +- +- /* Print line by line. */ +- while (c < end) { +- if (*c == '\n') { +- printk_safe_flush_line(start, c - start + 1); +- start = ++c; +- header = true; +- continue; +- } +- +- /* Handle continuous lines or missing new line. 
*/ +- if ((c + 1 < end) && printk_get_level(c)) { +- if (header) { +- c = printk_skip_level(c); +- continue; +- } +- +- printk_safe_flush_line(start, c - start); +- start = c++; +- header = true; +- continue; +- } +- +- header = false; +- c++; +- } +- +- /* Check if there was a partial line. Ignore pure header. */ +- if (start < end && !header) { +- static const char newline[] = KERN_CONT "\n"; +- +- printk_safe_flush_line(start, end - start); +- printk_safe_flush_line(newline, strlen(newline)); +- } +- +- return len; +-} +- +-static void report_message_lost(struct printk_safe_seq_buf *s) +-{ +- int lost = atomic_xchg(&s->message_lost, 0); +- +- if (lost) +- printk_deferred("Lost %d message(s)!\n", lost); +-} +- +-/* +- * Flush data from the associated per-CPU buffer. The function +- * can be called either via IRQ work or independently. +- */ +-static void __printk_safe_flush(struct irq_work *work) +-{ +- static raw_spinlock_t read_lock = +- __RAW_SPIN_LOCK_INITIALIZER(read_lock); +- struct printk_safe_seq_buf *s = +- container_of(work, struct printk_safe_seq_buf, work); +- unsigned long flags; +- size_t len; +- int i; +- +- /* +- * The lock has two functions. First, one reader has to flush all +- * available message to make the lockless synchronization with +- * writers easier. Second, we do not want to mix messages from +- * different CPUs. This is especially important when printing +- * a backtrace. +- */ +- raw_spin_lock_irqsave(&read_lock, flags); +- +- i = 0; +-more: +- len = atomic_read(&s->len); +- +- /* +- * This is just a paranoid check that nobody has manipulated +- * the buffer an unexpected way. If we printed something then +- * @len must only increase. Also it should never overflow the +- * buffer size. +- */ +- if ((i && i >= len) || len > sizeof(s->buffer)) { +- const char *msg = "printk_safe_flush: internal error\n"; +- +- printk_safe_flush_line(msg, strlen(msg)); +- len = 0; +- } +- +- if (!len) +- goto out; /* Someone else has already flushed the buffer. */ +- +- /* Make sure that data has been written up to the @len */ +- smp_rmb(); +- i += printk_safe_flush_buffer(s->buffer + i, len - i); +- +- /* +- * Check that nothing has got added in the meantime and truncate +- * the buffer. Note that atomic_cmpxchg() is an implicit memory +- * barrier that makes sure that the data were copied before +- * updating s->len. +- */ +- if (atomic_cmpxchg(&s->len, len, 0) != len) +- goto more; +- +-out: +- report_message_lost(s); +- raw_spin_unlock_irqrestore(&read_lock, flags); +-} +- +-/** +- * printk_safe_flush - flush all per-cpu nmi buffers. +- * +- * The buffers are flushed automatically via IRQ work. This function +- * is useful only when someone wants to be sure that all buffers have +- * been flushed at some point. +- */ +-void printk_safe_flush(void) +-{ +- int cpu; +- +- for_each_possible_cpu(cpu) { +-#ifdef CONFIG_PRINTK_NMI +- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); +-#endif +- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); +- } +-} +- +-/** +- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system +- * goes down. +- * +- * Similar to printk_safe_flush() but it can be called even in NMI context when +- * the system goes down. It does the best effort to get NMI messages into +- * the main ring buffer. +- * +- * Note that it could try harder when there is only one CPU online. +- */ +-void printk_safe_flush_on_panic(void) +-{ +- /* +- * Make sure that we could access the main ring buffer. 
+- * Do not risk a double release when more CPUs are up. +- */ +- if (raw_spin_is_locked(&logbuf_lock)) { +- if (num_online_cpus() > 1) +- return; +- +- debug_locks_off(); +- raw_spin_lock_init(&logbuf_lock); +- } +- +- printk_safe_flush(); +-} +- +-#ifdef CONFIG_PRINTK_NMI +-/* +- * Safe printk() for NMI context. It uses a per-CPU buffer to +- * store the message. NMIs are not nested, so there is always only +- * one writer running. But the buffer might get flushed from another +- * CPU, so we need to be careful. +- */ +-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) +-{ +- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); +- +- return printk_safe_log_store(s, fmt, args); +-} +- +-void notrace printk_nmi_enter(void) +-{ +- this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK); +-} +- +-void notrace printk_nmi_exit(void) +-{ +- this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK); +-} +- +-/* +- * Marks a code that might produce many messages in NMI context +- * and the risk of losing them is more critical than eventual +- * reordering. +- * +- * It has effect only when called in NMI context. Then printk() +- * will try to store the messages into the main logbuf directly +- * and use the per-CPU buffers only as a fallback when the lock +- * is not available. +- */ +-void printk_nmi_direct_enter(void) +-{ +- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) +- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); +-} +- +-void printk_nmi_direct_exit(void) +-{ +- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); +-} +- +-#else +- +-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) +-{ +- return 0; +-} +- +-#endif /* CONFIG_PRINTK_NMI */ +- +-/* +- * Lock-less printk(), to avoid deadlocks should the printk() recurse +- * into itself. It uses a per-CPU buffer to store the message, just like +- * NMI. +- */ +-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) +-{ +- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); +- +- return printk_safe_log_store(s, fmt, args); +-} +- +-/* Can be preempted by NMI. */ +-void __printk_safe_enter(void) +-{ +- this_cpu_inc(printk_context); +-} +- +-/* Can be preempted by NMI. */ +-void __printk_safe_exit(void) +-{ +- this_cpu_dec(printk_context); +-} +- +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) +-{ +- /* +- * Try to use the main logbuf even in NMI. But avoid calling console +- * drivers that might have their own locks. +- */ +- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) && +- raw_spin_trylock(&logbuf_lock)) { +- int len; +- +- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); +- raw_spin_unlock(&logbuf_lock); +- defer_console_output(); +- return len; +- } +- +- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ +- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) +- return vprintk_nmi(fmt, args); +- +- /* Use extra buffer to prevent a recursion deadlock in safe mode. */ +- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) +- return vprintk_safe(fmt, args); +- +- /* No obstacles. 
*/ +- return vprintk_default(fmt, args); +-} +- +-void __init printk_safe_init(void) +-{ +- int cpu; +- +- for_each_possible_cpu(cpu) { +- struct printk_safe_seq_buf *s; +- +- s = &per_cpu(safe_print_seq, cpu); +- init_irq_work(&s->work, __printk_safe_flush); +- +-#ifdef CONFIG_PRINTK_NMI +- s = &per_cpu(nmi_print_seq, cpu); +- init_irq_work(&s->work, __printk_safe_flush); +-#endif +- } +- +- /* +- * In the highly unlikely event that a NMI were to trigger at +- * this moment. Make sure IRQ work is set up before this +- * variable is set. +- */ +- barrier(); +- printk_safe_irq_ready = 1; +- +- /* Flush pending messages that did not have scheduled IRQ works. */ +- printk_safe_flush(); +-} +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -8929,7 +8929,6 @@ void ftrace_dump(enum ftrace_dump_mode o + tracing_off(); + + local_irq_save(flags); +- printk_nmi_direct_enter(); + + /* Simulate the iterator */ + trace_init_global_iter(&iter); +@@ -9006,7 +9005,6 @@ void ftrace_dump(enum ftrace_dump_mode o + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); + } + atomic_dec(&dump_running); +- printk_nmi_direct_exit(); + local_irq_restore(flags); + } + EXPORT_SYMBOL_GPL(ftrace_dump); +--- a/lib/nmi_backtrace.c ++++ b/lib/nmi_backtrace.c +@@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const + touch_softlockup_watchdog(); + } + +- /* +- * Force flush any remote buffers that might be stuck in IRQ context +- * and therefore could not run their irq_work. +- */ +- printk_safe_flush(); +- + clear_bit_unlock(0, &backtrace_flag); + put_cpu(); + } diff --git a/kernel/patches-5.4.x-rt/0029-0012-printk-minimize-console-locking-implementation.patch b/kernel/patches-5.4.x-rt/0029-0012-printk-minimize-console-locking-implementation.patch new file mode 100644 index 000000000..c509bd4e5 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0029-0012-printk-minimize-console-locking-implementation.patch @@ -0,0 +1,329 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:50 +0100 +Subject: [PATCH 12/25] printk: minimize console locking implementation + +Since printing of the printk buffer is now handled by the printk +kthread, minimize the console locking functions to just handle +locking of the console. + +NOTE: With this console_flush_on_panic will no longer flush. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 255 ------------------------------------------------- + 1 file changed, 1 insertion(+), 254 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -227,19 +227,7 @@ static int nr_ext_console_drivers; + + static int __down_trylock_console_sem(unsigned long ip) + { +- int lock_failed; +- unsigned long flags; +- +- /* +- * Here and in __up_console_sem() we need to be in safe mode, +- * because spindump/WARN/etc from under console ->lock will +- * deadlock in printk()->down_trylock_console_sem() otherwise. 
+- */ +- printk_safe_enter_irqsave(flags); +- lock_failed = down_trylock(&console_sem); +- printk_safe_exit_irqrestore(flags); +- +- if (lock_failed) ++ if (down_trylock(&console_sem)) + return 1; + mutex_acquire(&console_lock_dep_map, 0, 1, ip); + return 0; +@@ -248,13 +236,9 @@ static int __down_trylock_console_sem(un + + static void __up_console_sem(unsigned long ip) + { +- unsigned long flags; +- + mutex_release(&console_lock_dep_map, 1, ip); + +- printk_safe_enter_irqsave(flags); + up(&console_sem); +- printk_safe_exit_irqrestore(flags); + } + #define up_console_sem() __up_console_sem(_RET_IP_) + +@@ -1552,82 +1536,6 @@ static void format_text(struct printk_lo + } + + /* +- * Special console_lock variants that help to reduce the risk of soft-lockups. +- * They allow to pass console_lock to another printk() call using a busy wait. +- */ +- +-#ifdef CONFIG_LOCKDEP +-static struct lockdep_map console_owner_dep_map = { +- .name = "console_owner" +-}; +-#endif +- +-static DEFINE_RAW_SPINLOCK(console_owner_lock); +-static struct task_struct *console_owner; +-static bool console_waiter; +- +-/** +- * console_lock_spinning_enable - mark beginning of code where another +- * thread might safely busy wait +- * +- * This basically converts console_lock into a spinlock. This marks +- * the section where the console_lock owner can not sleep, because +- * there may be a waiter spinning (like a spinlock). Also it must be +- * ready to hand over the lock at the end of the section. +- */ +-static void console_lock_spinning_enable(void) +-{ +- raw_spin_lock(&console_owner_lock); +- console_owner = current; +- raw_spin_unlock(&console_owner_lock); +- +- /* The waiter may spin on us after setting console_owner */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +-} +- +-/** +- * console_lock_spinning_disable_and_check - mark end of code where another +- * thread was able to busy wait and check if there is a waiter +- * +- * This is called at the end of the section where spinning is allowed. +- * It has two functions. First, it is a signal that it is no longer +- * safe to start busy waiting for the lock. Second, it checks if +- * there is a busy waiter and passes the lock rights to her. +- * +- * Important: Callers lose the lock if there was a busy waiter. +- * They must not touch items synchronized by console_lock +- * in this case. +- * +- * Return: 1 if the lock rights were passed, 0 otherwise. +- */ +-static int console_lock_spinning_disable_and_check(void) +-{ +- int waiter; +- +- raw_spin_lock(&console_owner_lock); +- waiter = READ_ONCE(console_waiter); +- console_owner = NULL; +- raw_spin_unlock(&console_owner_lock); +- +- if (!waiter) { +- spin_release(&console_owner_dep_map, 1, _THIS_IP_); +- return 0; +- } +- +- /* The waiter is now free to continue */ +- WRITE_ONCE(console_waiter, false); +- +- spin_release(&console_owner_dep_map, 1, _THIS_IP_); +- +- /* +- * Hand off console_lock to waiter. The waiter will perform +- * the up(). After this, the waiter is the console_lock owner. +- */ +- mutex_release(&console_lock_dep_map, 1, _THIS_IP_); +- return 1; +-} +- +-/* + * Call the console drivers, asking them to write out + * log_buf[start] to log_buf[end - 1]. + * The console_lock must be held. 
+@@ -1889,8 +1797,6 @@ static ssize_t msg_print_ext_header(char + static ssize_t msg_print_ext_body(char *buf, size_t size, + char *dict, size_t dict_len, + char *text, size_t text_len) { return 0; } +-static void console_lock_spinning_enable(void) { } +-static int console_lock_spinning_disable_and_check(void) { return 0; } + static void call_console_drivers(const char *ext_text, size_t ext_len, + const char *text, size_t len) {} + static size_t msg_print_text(const struct printk_log *msg, bool syslog, +@@ -2125,35 +2031,6 @@ int is_console_locked(void) + { + return console_locked; + } +-EXPORT_SYMBOL(is_console_locked); +- +-/* +- * Check if we have any console that is capable of printing while cpu is +- * booting or shutting down. Requires console_sem. +- */ +-static int have_callable_console(void) +-{ +- struct console *con; +- +- for_each_console(con) +- if ((con->flags & CON_ENABLED) && +- (con->flags & CON_ANYTIME)) +- return 1; +- +- return 0; +-} +- +-/* +- * Can we actually use the console at this time on this cpu? +- * +- * Console drivers may assume that per-cpu resources have been allocated. So +- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't +- * call them until this CPU is officially up. +- */ +-static inline int can_use_console(void) +-{ +- return cpu_online(raw_smp_processor_id()) || have_callable_console(); +-} + + /** + * console_unlock - unlock the console system +@@ -2161,147 +2038,17 @@ static inline int can_use_console(void) + * Releases the console_lock which the caller holds on the console system + * and the console driver list. + * +- * While the console_lock was held, console output may have been buffered +- * by printk(). If this is the case, console_unlock(); emits +- * the output prior to releasing the lock. +- * +- * If there is output waiting, we wake /dev/kmsg and syslog() users. +- * + * console_unlock(); may be called from any context. + */ + void console_unlock(void) + { +- static char ext_text[CONSOLE_EXT_LOG_MAX]; +- static char text[LOG_LINE_MAX + PREFIX_MAX]; +- unsigned long flags; +- bool do_cond_resched, retry; +- + if (console_suspended) { + up_console_sem(); + return; + } + +- /* +- * Console drivers are called with interrupts disabled, so +- * @console_may_schedule should be cleared before; however, we may +- * end up dumping a lot of lines, for example, if called from +- * console registration path, and should invoke cond_resched() +- * between lines if allowable. Not doing so can cause a very long +- * scheduling stall on a slow console leading to RCU stall and +- * softlockup warnings which exacerbate the issue with more +- * messages practically incapacitating the system. +- * +- * console_trylock() is not able to detect the preemptive +- * context reliably. Therefore the value must be stored before +- * and cleared after the the "again" goto label. +- */ +- do_cond_resched = console_may_schedule; +-again: +- console_may_schedule = 0; +- +- /* +- * We released the console_sem lock, so we need to recheck if +- * cpu is online and (if not) is there at least one CON_ANYTIME +- * console. 
+- */ +- if (!can_use_console()) { +- console_locked = 0; +- up_console_sem(); +- return; +- } +- +- for (;;) { +- struct printk_log *msg; +- size_t ext_len = 0; +- size_t len; +- +- printk_safe_enter_irqsave(flags); +- raw_spin_lock(&logbuf_lock); +- if (console_seq < log_first_seq) { +- len = sprintf(text, +- "** %llu printk messages dropped **\n", +- log_first_seq - console_seq); +- +- /* messages are gone, move to first one */ +- console_seq = log_first_seq; +- console_idx = log_first_idx; +- } else { +- len = 0; +- } +-skip: +- if (console_seq == log_next_seq) +- break; +- +- msg = log_from_idx(console_idx); +- if (suppress_message_printing(msg->level)) { +- /* +- * Skip record we have buffered and already printed +- * directly to the console when we received it, and +- * record that has level above the console loglevel. +- */ +- console_idx = log_next(console_idx); +- console_seq++; +- goto skip; +- } +- +- len += msg_print_text(msg, +- console_msg_format & MSG_FORMAT_SYSLOG, +- printk_time, text + len, sizeof(text) - len); +- if (nr_ext_console_drivers) { +- ext_len = msg_print_ext_header(ext_text, +- sizeof(ext_text), +- msg, console_seq); +- ext_len += msg_print_ext_body(ext_text + ext_len, +- sizeof(ext_text) - ext_len, +- log_dict(msg), msg->dict_len, +- log_text(msg), msg->text_len); +- } +- console_idx = log_next(console_idx); +- console_seq++; +- raw_spin_unlock(&logbuf_lock); +- +- /* +- * While actively printing out messages, if another printk() +- * were to occur on another CPU, it may wait for this one to +- * finish. This task can not be preempted if there is a +- * waiter waiting to take over. +- */ +- console_lock_spinning_enable(); +- +- stop_critical_timings(); /* don't trace print latency */ +- //call_console_drivers(ext_text, ext_len, text, len); +- start_critical_timings(); +- +- if (console_lock_spinning_disable_and_check()) { +- printk_safe_exit_irqrestore(flags); +- return; +- } +- +- printk_safe_exit_irqrestore(flags); +- +- if (do_cond_resched) +- cond_resched(); +- } +- + console_locked = 0; +- +- raw_spin_unlock(&logbuf_lock); +- + up_console_sem(); +- +- /* +- * Someone could have filled up the buffer again, so re-check if there's +- * something to flush. In case we cannot trylock the console_sem again, +- * there's a new owner and the console_unlock() from them will do the +- * flush, no worries. +- */ +- raw_spin_lock(&logbuf_lock); +- retry = console_seq != log_next_seq; +- raw_spin_unlock(&logbuf_lock); +- printk_safe_exit_irqrestore(flags); +- +- if (retry && console_trylock()) +- goto again; + } + EXPORT_SYMBOL(console_unlock); + diff --git a/kernel/patches-5.4.x-rt/0030-0013-printk-track-seq-per-console.patch b/kernel/patches-5.4.x-rt/0030-0013-printk-track-seq-per-console.patch new file mode 100644 index 000000000..41a6f3d94 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0030-0013-printk-track-seq-per-console.patch @@ -0,0 +1,92 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:51 +0100 +Subject: [PATCH 13/25] printk: track seq per console + +Allow each console to track which seq record was last printed. This +simplifies identifying dropped records. 
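+
+With a per-console counter, the printing path can detect and report a gap
+for exactly the console that fell behind (sketch of the logic added to
+call_console_drivers() below):
+
+	if (con->printk_seq >= seq)
+		continue;	/* this console already printed the record */
+
+	con->printk_seq++;
+	if (con->printk_seq < seq) {
+		/* the console missed (seq - printk_seq) records */
+		print_console_dropped(con, seq - con->printk_seq);
+		con->printk_seq = seq;
+	}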
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/console.h | 1 + + kernel/printk/printk.c | 30 +++++++++++++++++++++++++++--- + 2 files changed, 28 insertions(+), 3 deletions(-) + +--- a/include/linux/console.h ++++ b/include/linux/console.h +@@ -153,6 +153,7 @@ struct console { + short flags; + short index; + int cflag; ++ unsigned long printk_seq; + void *data; + struct console *next; + }; +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1507,6 +1507,16 @@ SYSCALL_DEFINE3(syslog, int, type, char + return do_syslog(type, buf, len, SYSLOG_FROM_READER); + } + ++static void print_console_dropped(struct console *con, u64 count) ++{ ++ char text[64]; ++ int len; ++ ++ len = sprintf(text, "** %llu printk message%s dropped **\n", ++ count, count > 1 ? "s" : ""); ++ con->write(con, text, len); ++} ++ + static void format_text(struct printk_log *msg, u64 seq, + char *ext_text, size_t *ext_len, + char *text, size_t *len, bool time) +@@ -1540,7 +1550,7 @@ static void format_text(struct printk_lo + * log_buf[start] to log_buf[end - 1]. + * The console_lock must be held. + */ +-static void call_console_drivers(const char *ext_text, size_t ext_len, ++static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len, + const char *text, size_t len) + { + struct console *con; +@@ -1558,6 +1568,19 @@ static void call_console_drivers(const c + if (!cpu_online(raw_smp_processor_id()) && + !(con->flags & CON_ANYTIME)) + continue; ++ if (con->printk_seq >= seq) ++ continue; ++ ++ con->printk_seq++; ++ if (con->printk_seq < seq) { ++ print_console_dropped(con, seq - con->printk_seq); ++ con->printk_seq = seq; ++ } ++ ++ /* for supressed messages, only seq is updated */ ++ if (len == 0 && ext_len == 0) ++ continue; ++ + if (con->flags & CON_EXTENDED) + con->write(con, ext_text, ext_len); + else +@@ -1797,7 +1820,7 @@ static ssize_t msg_print_ext_header(char + static ssize_t msg_print_ext_body(char *buf, size_t size, + char *dict, size_t dict_len, + char *text, size_t text_len) { return 0; } +-static void call_console_drivers(const char *ext_text, size_t ext_len, ++static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len, + const char *text, size_t len) {} + static size_t msg_print_text(const struct printk_log *msg, bool syslog, + bool time, char *buf, size_t size) { return 0; } +@@ -2550,8 +2573,9 @@ static int printk_kthread_func(void *dat + &len, printk_time); + + console_lock(); ++ call_console_drivers(master_seq, ext_text, ++ ext_len, text, len); + if (len > 0 || ext_len > 0) { +- call_console_drivers(ext_text, ext_len, text, len); + boot_delay_msec(msg->level); + printk_delay(); + } diff --git a/kernel/patches-5.4.x-rt/0031-0014-printk-do-boot_delay_msec-inside-printk_delay.patch b/kernel/patches-5.4.x-rt/0031-0014-printk-do-boot_delay_msec-inside-printk_delay.patch new file mode 100644 index 000000000..0a12b5135 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0031-0014-printk-do-boot_delay_msec-inside-printk_delay.patch @@ -0,0 +1,71 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:52 +0100 +Subject: [PATCH 14/25] printk: do boot_delay_msec inside printk_delay + +Both functions needed to be called one after the other, so just +integrate boot_delay_msec into printk_delay for simplification. 
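+
+The printing kthread then needs only one call per record (sketch of the
+simplified call site below; printk_delay() now takes the message level and
+performs boot_delay_msec() itself):
+
+	call_console_drivers(master_seq, ext_text, ext_len, text, len);
+	if (len > 0 || ext_len > 0)
+		printk_delay(msg->level);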
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 35 +++++++++++++++++------------------ + 1 file changed, 17 insertions(+), 18 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1507,6 +1507,21 @@ SYSCALL_DEFINE3(syslog, int, type, char + return do_syslog(type, buf, len, SYSLOG_FROM_READER); + } + ++int printk_delay_msec __read_mostly; ++ ++static inline void printk_delay(int level) ++{ ++ boot_delay_msec(level); ++ if (unlikely(printk_delay_msec)) { ++ int m = printk_delay_msec; ++ ++ while (m--) { ++ mdelay(1); ++ touch_nmi_watchdog(); ++ } ++ } ++} ++ + static void print_console_dropped(struct console *con, u64 count) + { + char text[64]; +@@ -1588,20 +1603,6 @@ static void call_console_drivers(u64 seq + } + } + +-int printk_delay_msec __read_mostly; +- +-static inline void printk_delay(void) +-{ +- if (unlikely(printk_delay_msec)) { +- int m = printk_delay_msec; +- +- while (m--) { +- mdelay(1); +- touch_nmi_watchdog(); +- } +- } +-} +- + static inline u32 printk_caller_id(void) + { + return in_task() ? task_pid_nr(current) : +@@ -2575,10 +2576,8 @@ static int printk_kthread_func(void *dat + console_lock(); + call_console_drivers(master_seq, ext_text, + ext_len, text, len); +- if (len > 0 || ext_len > 0) { +- boot_delay_msec(msg->level); +- printk_delay(); +- } ++ if (len > 0 || ext_len > 0) ++ printk_delay(msg->level); + console_unlock(); + } + diff --git a/kernel/patches-5.4.x-rt/0032-0015-printk-print-history-for-new-consoles.patch b/kernel/patches-5.4.x-rt/0032-0015-printk-print-history-for-new-consoles.patch new file mode 100644 index 000000000..1b222c55b --- /dev/null +++ b/kernel/patches-5.4.x-rt/0032-0015-printk-print-history-for-new-consoles.patch @@ -0,0 +1,118 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:53 +0100 +Subject: [PATCH 15/25] printk: print history for new consoles + +When new consoles register, they currently print how many messages +they have missed. However, many (or all) of those messages may still +be in the ring buffer. Add functionality to print as much of the +history as available. This is a clean replacement of the old +exclusive console hack. 
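+
+The replay walks the ring buffer from its oldest available record up to the
+sequence number currently being printed, writing only to the newly
+registered console (condensed sketch of printk_write_history() added below;
+extended-console output and dropped-record reporting are omitted):
+
+	prb_iter_init(&iter, &printk_rb, NULL);
+	while (prb_iter_next(&iter, buf, PRINTK_RECORD_MAX, &seq) > 0) {
+		if (seq > master_seq)
+			break;		/* caught up with the live printer */
+
+		msg = (struct printk_log *)buf;
+		format_text(msg, master_seq, ext_text, &ext_len,
+			    text, &len, printk_time);
+		if (len)
+			con->write(con, text, len);
+	}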
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/console.h | 1 + kernel/printk/printk.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 76 insertions(+) + +--- a/include/linux/console.h ++++ b/include/linux/console.h +@@ -154,6 +154,7 @@ struct console { + short index; + int cflag; + unsigned long printk_seq; ++ int wrote_history; + void *data; + struct console *next; + }; +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1560,6 +1560,77 @@ static void format_text(struct printk_lo + } + } + ++static void printk_write_history(struct console *con, u64 master_seq) ++{ ++ struct prb_iterator iter; ++ bool time = printk_time; ++ static char *ext_text; ++ static char *text; ++ static char *buf; ++ u64 seq; ++ ++ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL); ++ buf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL); ++ if (!ext_text || !text || !buf) ++ return; ++ ++ if (!(con->flags & CON_ENABLED)) ++ goto out; ++ ++ if (!con->write) ++ goto out; ++ ++ if (!cpu_online(raw_smp_processor_id()) && ++ !(con->flags & CON_ANYTIME)) ++ goto out; ++ ++ prb_iter_init(&iter, &printk_rb, NULL); ++ ++ for (;;) { ++ struct printk_log *msg; ++ size_t ext_len; ++ size_t len; ++ int ret; ++ ++ ret = prb_iter_next(&iter, buf, PRINTK_RECORD_MAX, &seq); ++ if (ret == 0) { ++ break; ++ } else if (ret < 0) { ++ prb_iter_init(&iter, &printk_rb, NULL); ++ continue; ++ } ++ ++ if (seq > master_seq) ++ break; ++ ++ con->printk_seq++; ++ if (con->printk_seq < seq) { ++ print_console_dropped(con, seq - con->printk_seq); ++ con->printk_seq = seq; ++ } ++ ++ msg = (struct printk_log *)buf; ++ format_text(msg, master_seq, ext_text, &ext_len, text, ++ &len, time); ++ ++ if (len == 0 && ext_len == 0) ++ continue; ++ ++ if (con->flags & CON_EXTENDED) ++ con->write(con, ext_text, ext_len); ++ else ++ con->write(con, text, len); ++ ++ printk_delay(msg->level); ++ } ++out: ++ con->wrote_history = 1; ++ kfree(ext_text); ++ kfree(text); ++ kfree(buf); ++} ++ + /* + * Call the console drivers, asking them to write out + * log_buf[start] to log_buf[end - 1]. +@@ -1578,6 +1649,10 @@ static void call_console_drivers(u64 seq + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; ++ if (!con->wrote_history) { ++ printk_write_history(con, seq); ++ continue; ++ } + if (!con->write) + continue; + if (!cpu_online(raw_smp_processor_id()) && diff --git a/kernel/patches-5.4.x-rt/0033-0016-printk-implement-CON_PRINTBUFFER.patch b/kernel/patches-5.4.x-rt/0033-0016-printk-implement-CON_PRINTBUFFER.patch new file mode 100644 index 000000000..ed15f6624 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0033-0016-printk-implement-CON_PRINTBUFFER.patch @@ -0,0 +1,91 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:54 +0100 +Subject: [PATCH 16/25] printk: implement CON_PRINTBUFFER + +If the CON_PRINTBUFFER flag is not set, do not replay the history +for that console. 
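A minimal model of the start-position decision, assuming an invented console_model type; in the actual patch the non-CON_PRINTBUFFER case records seq - 1 as the last-printed sequence, which the sketch simplifies to "start at the current sequence".

#include <stdio.h>

#define CON_PRINTBUFFER 0x1   /* illustrative flag value */

struct console_model {
        const char *name;
        short flags;
        unsigned long long printk_seq;   /* next record this console will print */
};

/* First time a console is serviced: replay the retained history only if
 * the driver asked for it via CON_PRINTBUFFER, otherwise start at "now". */
static void init_start_position(struct console_model *con,
                                unsigned long long current_seq)
{
        if (con->flags & CON_PRINTBUFFER)
                con->printk_seq = 0;             /* oldest retained record */
        else
                con->printk_seq = current_seq;   /* skip the existing backlog */
}

int main(void)
{
        struct console_model serial = { "ttyS0",   CON_PRINTBUFFER, 0 };
        struct console_model net    = { "netcon0", 0,               0 };

        init_start_position(&serial, 1234);
        init_start_position(&net, 1234);
        printf("%s starts at seq %llu, %s starts at seq %llu\n",
               serial.name, serial.printk_seq, net.name, net.printk_seq);
        return 0;
}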
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 34 ++++++---------------------------- + 1 file changed, 6 insertions(+), 28 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -419,10 +419,6 @@ static u32 log_first_idx; + static u64 log_next_seq; + static u32 log_next_idx; + +-/* the next printk record to write to the console */ +-static u64 console_seq; +-static u32 console_idx; +- + /* the next printk record to read after the last 'clear' command */ + static u64 clear_seq; + static u32 clear_idx; +@@ -1650,8 +1646,12 @@ static void call_console_drivers(u64 seq + if (!(con->flags & CON_ENABLED)) + continue; + if (!con->wrote_history) { +- printk_write_history(con, seq); +- continue; ++ if (con->flags & CON_PRINTBUFFER) { ++ printk_write_history(con, seq); ++ continue; ++ } ++ con->wrote_history = 1; ++ con->printk_seq = seq - 1; + } + if (!con->write) + continue; +@@ -1881,8 +1881,6 @@ EXPORT_SYMBOL(printk); + + static u64 syslog_seq; + static u32 syslog_idx; +-static u64 console_seq; +-static u32 console_idx; + static u64 log_first_seq; + static u32 log_first_idx; + static u64 log_next_seq; +@@ -2206,15 +2204,6 @@ void console_flush_on_panic(enum con_flu + */ + console_trylock(); + console_may_schedule = 0; +- +- if (mode == CONSOLE_REPLAY_ALL) { +- unsigned long flags; +- +- logbuf_lock_irqsave(flags); +- console_seq = log_first_seq; +- console_idx = log_first_idx; +- logbuf_unlock_irqrestore(flags); +- } + console_unlock(); + } + +@@ -2293,7 +2282,6 @@ early_param("keep_bootcon", keep_bootcon + void register_console(struct console *newcon) + { + int i; +- unsigned long flags; + struct console *bcon = NULL; + struct console_cmdline *c; + static bool has_preferred; +@@ -2409,16 +2397,6 @@ void register_console(struct console *ne + if (newcon->flags & CON_EXTENDED) + nr_ext_console_drivers++; + +- if (newcon->flags & CON_PRINTBUFFER) { +- /* +- * console_unlock(); will print out the buffered messages +- * for us. +- */ +- logbuf_lock_irqsave(flags); +- console_seq = syslog_seq; +- console_idx = syslog_idx; +- logbuf_unlock_irqrestore(flags); +- } + console_unlock(); + console_sysfs_notify(); + diff --git a/kernel/patches-5.4.x-rt/0034-0017-printk-add-processor-number-to-output.patch b/kernel/patches-5.4.x-rt/0034-0017-printk-add-processor-number-to-output.patch new file mode 100644 index 000000000..d46699c6c --- /dev/null +++ b/kernel/patches-5.4.x-rt/0034-0017-printk-add-processor-number-to-output.patch @@ -0,0 +1,99 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:55 +0100 +Subject: [PATCH 17/25] printk: add processor number to output + +It can be difficult to sort printk out if multiple processors are +printing simultaneously. Add the processor number to the printk +output to allow the messages to be sorted. 
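The effect on the human-readable prefix can be shown with a short userspace sketch; the zero-padded "NNN: " field mirrors the print_cpu() helper added below, while the surrounding loglevel/timestamp formatting is only an approximation of the real print_prefix() and the struct is invented for the example.

#include <stdio.h>

/* Each record carries the CPU that generated it, and the console prefix
 * gains a "NNN: " field so interleaved output can be grouped afterwards. */
struct log_record {
        unsigned long long ts_nsec;
        unsigned short cpu;
        unsigned char level;
        const char *text;
};

static int print_prefix(const struct log_record *msg, char *buf, size_t size)
{
        unsigned long long usec = msg->ts_nsec / 1000;

        return snprintf(buf, size, "<%u>[%5llu.%06llu] %03u: ",
                        (unsigned int)msg->level,
                        usec / 1000000, usec % 1000000,
                        (unsigned int)msg->cpu);
}

int main(void)
{
        struct log_record msg = { 5123456789ULL, 3, 6, "eth0: link becomes ready" };
        char buf[64];

        print_prefix(&msg, buf, sizeof(buf));
        /* prints: <6>[    5.123456] 003: eth0: link becomes ready */
        printf("%s%s\n", buf, msg.text);
        return 0;
}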
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 19 +++++++++++++++---- + 1 file changed, 15 insertions(+), 4 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -348,6 +348,7 @@ enum log_flags { + + struct printk_log { + u64 ts_nsec; /* timestamp in nanoseconds */ ++ u16 cpu; /* cpu that generated record */ + u16 len; /* length of entire record */ + u16 text_len; /* length of text buffer */ + u16 dict_len; /* length of dictionary buffer */ +@@ -499,7 +500,7 @@ static u32 log_next(u32 idx) + + /* insert record into the buffer, discard old ones, update heads */ + static int log_store(u32 caller_id, int facility, int level, +- enum log_flags flags, u64 ts_nsec, ++ enum log_flags flags, u64 ts_nsec, u16 cpu, + const char *dict, u16 dict_len, + const char *text, u16 text_len) + { +@@ -533,6 +534,7 @@ static int log_store(u32 caller_id, int + #ifdef CONFIG_PRINTK_CALLER + msg->caller_id = caller_id; + #endif ++ msg->cpu = cpu; + msg->len = size; + + /* insert message */ +@@ -606,9 +608,9 @@ static ssize_t msg_print_ext_header(char + + do_div(ts_usec, 1000); + +- return scnprintf(buf, size, "%u,%llu,%llu,%c%s;", ++ return scnprintf(buf, size, "%u,%llu,%llu,%c%s,%hu;", + (msg->facility << 3) | msg->level, seq, ts_usec, +- msg->flags & LOG_CONT ? 'c' : '-', caller); ++ msg->flags & LOG_CONT ? 'c' : '-', caller, msg->cpu); + } + + static ssize_t msg_print_ext_body(char *buf, size_t size, +@@ -1142,6 +1144,11 @@ static inline void boot_delay_msec(int l + static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); + module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); + ++static size_t print_cpu(u16 cpu, char *buf) ++{ ++ return sprintf(buf, "%03hu: ", cpu); ++} ++ + static size_t print_syslog(unsigned int level, char *buf) + { + return sprintf(buf, "<%u>", level); +@@ -1185,6 +1192,7 @@ static size_t print_prefix(const struct + buf[len++] = ' '; + buf[len] = '\0'; + } ++ len += print_cpu(msg->cpu, buf + len); + + return len; + } +@@ -1760,6 +1768,7 @@ asmlinkage int vprintk_emit(int facility + u64 ts_nsec; + char *text; + char *rbuf; ++ int cpu; + + ts_nsec = local_clock(); + +@@ -1769,6 +1778,8 @@ asmlinkage int vprintk_emit(int facility + return printed_len; + } + ++ cpu = raw_smp_processor_id(); ++ + text = rbuf; + text_len = vscnprintf(text, PRINTK_SPRINT_MAX, fmt, args); + +@@ -1803,7 +1814,7 @@ asmlinkage int vprintk_emit(int facility + if (dict) + lflags |= LOG_NEWLINE; + +- printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, ++ printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu, + dict, dictlen, text, text_len); + + prb_commit(&h); diff --git a/kernel/patches-5.4.x-rt/0035-0018-console-add-write_atomic-interface.patch b/kernel/patches-5.4.x-rt/0035-0018-console-add-write_atomic-interface.patch new file mode 100644 index 000000000..6de18c0d8 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0035-0018-console-add-write_atomic-interface.patch @@ -0,0 +1,64 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:56 +0100 +Subject: [PATCH 18/25] console: add write_atomic interface + +Add a write_atomic callback to the console. This is an optional +function for console drivers. The function must be atomic (including +NMI safe) for writing to the console. + +Console drivers must still implement the write callback. The +write_atomic callback will only be used for emergency messages. 
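Roughly, the relationship between the two callbacks looks like the following self-contained model; console_model, emit() and the uart_* handlers are invented for the sketch, and a real write_atomic() must additionally be NMI-safe (busy-wait on the hardware, no sleeping locks, no reliance on interrupts).

#include <stdio.h>

/* write() stays mandatory and is driven by the preemptible printk thread,
 * while the optional write_atomic() is invoked directly for emergencies. */
struct console_model {
        const char *name;
        void (*write)(const char *s, unsigned int len);
        void (*write_atomic)(const char *s, unsigned int len);  /* optional */
};

static void uart_write(const char *s, unsigned int len)
{
        printf("[thread]    ");
        fwrite(s, 1, len, stdout);
}

static void uart_write_atomic(const char *s, unsigned int len)
{
        /* a real implementation polls the FIFO and never sleeps */
        printf("[emergency] ");
        fwrite(s, 1, len, stdout);
}

static void emit(const struct console_model *con, const char *s,
                 unsigned int len, int emergency)
{
        if (emergency && con->write_atomic)
                con->write_atomic(s, len);   /* immediate, synchronous path */
        else
                con->write(s, len);          /* handed to the printk thread */
}

int main(void)
{
        struct console_model con = { "myuart", uart_write, uart_write_atomic };

        emit(&con, "systemd[1]: started sshd\n", 25, 0);
        emit(&con, "BUG: unable to handle page fault\n", 33, 1);
        return 0;
}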
+ +Creating an NMI safe write_atomic that must synchronize with write +requires a careful implementation of the console driver. To aid with +the implementation, a set of console_atomic_* functions are provided: + + void console_atomic_lock(unsigned int *flags); + void console_atomic_unlock(unsigned int flags); + +These functions synchronize using the processor-reentrant cpu lock of +the printk buffer. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/console.h | 4 ++++ + kernel/printk/printk.c | 12 ++++++++++++ + 2 files changed, 16 insertions(+) + +--- a/include/linux/console.h ++++ b/include/linux/console.h +@@ -145,6 +145,7 @@ static inline int con_debug_leave(void) + struct console { + char name[16]; + void (*write)(struct console *, const char *, unsigned); ++ void (*write_atomic)(struct console *, const char *, unsigned); + int (*read)(struct console *, char *, unsigned); + struct tty_driver *(*device)(struct console *, int *); + void (*unblank)(void); +@@ -236,4 +237,7 @@ extern void console_init(void); + void dummycon_register_output_notifier(struct notifier_block *nb); + void dummycon_unregister_output_notifier(struct notifier_block *nb); + ++extern void console_atomic_lock(unsigned int *flags); ++extern void console_atomic_unlock(unsigned int flags); ++ + #endif /* _LINUX_CONSOLE_H */ +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -3044,3 +3044,15 @@ void kmsg_dump_rewind(struct kmsg_dumper + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + #endif ++ ++void console_atomic_lock(unsigned int *flags) ++{ ++ prb_lock(&printk_cpulock, flags); ++} ++EXPORT_SYMBOL(console_atomic_lock); ++ ++void console_atomic_unlock(unsigned int flags) ++{ ++ prb_unlock(&printk_cpulock, flags); ++} ++EXPORT_SYMBOL(console_atomic_unlock); diff --git a/kernel/patches-5.4.x-rt/0036-0019-printk-introduce-emergency-messages.patch b/kernel/patches-5.4.x-rt/0036-0019-printk-introduce-emergency-messages.patch new file mode 100644 index 000000000..21e12d3a7 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0036-0019-printk-introduce-emergency-messages.patch @@ -0,0 +1,272 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:57 +0100 +Subject: [PATCH 19/25] printk: introduce emergency messages + +Console messages are generally either critical or non-critical. +Critical messages are messages such as crashes or sysrq output. +Critical messages should never be lost because generally they provide +important debugging information. + +Since all console messages are output via a fully preemptible printk +kernel thread, it is possible that messages are not output because +that thread cannot be scheduled (BUG in scheduler, run-away RT task, +etc). + +To allow critical messages to be output independent of the +schedulability of the printk task, introduce an emergency mechanism +that _immediately_ outputs the message to the consoles. To avoid +possible unbounded latency issues, the emergency mechanism only +outputs the printk line provided by the caller and ignores any +pending messages in the log buffer. + +Critical messages are identified as messages (by default) with log +level LOGLEVEL_WARNING or more critical. This is configurable via the +kernel option CONSOLE_LOGLEVEL_EMERGENCY. + +Any messages output as emergency messages are skipped by the printk +thread on those consoles that output the emergency message. + +In order for a console driver to support emergency messages, the +write_atomic function must be implemented by the driver. 
If not +implemented, the emergency messages are handled like all other +messages and are printed by the printk thread. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/printk.h | 2 + kernel/printk/printk.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++--- + lib/Kconfig.debug | 17 +++++++ + 3 files changed, 124 insertions(+), 6 deletions(-) + +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -58,6 +58,7 @@ static inline const char *printk_skip_he + */ + #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT + #define CONSOLE_LOGLEVEL_QUIET CONFIG_CONSOLE_LOGLEVEL_QUIET ++#define CONSOLE_LOGLEVEL_EMERGENCY CONFIG_CONSOLE_LOGLEVEL_EMERGENCY + + extern int console_printk[]; + +@@ -65,6 +66,7 @@ extern int console_printk[]; + #define default_message_loglevel (console_printk[1]) + #define minimum_console_loglevel (console_printk[2]) + #define default_console_loglevel (console_printk[3]) ++#define emergency_console_loglevel (console_printk[4]) + + static inline void console_silent(void) + { +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -46,6 +46,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -62,11 +63,12 @@ + #include "braille.h" + #include "internal.h" + +-int console_printk[4] = { ++int console_printk[5] = { + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ + CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ ++ CONSOLE_LOGLEVEL_EMERGENCY, /* emergency_console_loglevel */ + }; + EXPORT_SYMBOL_GPL(console_printk); + +@@ -498,6 +500,9 @@ static u32 log_next(u32 idx) + return idx + msg->len; + } + ++static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu, ++ char *text, u16 text_len); ++ + /* insert record into the buffer, discard old ones, update heads */ + static int log_store(u32 caller_id, int facility, int level, + enum log_flags flags, u64 ts_nsec, u16 cpu, +@@ -1641,7 +1646,7 @@ static void printk_write_history(struct + * The console_lock must be held. + */ + static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len, +- const char *text, size_t len) ++ const char *text, size_t len, int level) + { + struct console *con; + +@@ -1661,6 +1666,18 @@ static void call_console_drivers(u64 seq + con->wrote_history = 1; + con->printk_seq = seq - 1; + } ++ if (con->write_atomic && level < emergency_console_loglevel) { ++ /* skip emergency messages, already printed */ ++ if (con->printk_seq < seq) ++ con->printk_seq = seq; ++ continue; ++ } ++ if (con->flags & CON_BOOT) { ++ /* skip emergency messages, already printed */ ++ if (con->printk_seq < seq) ++ con->printk_seq = seq; ++ continue; ++ } + if (!con->write) + continue; + if (!cpu_online(raw_smp_processor_id()) && +@@ -1780,8 +1797,12 @@ asmlinkage int vprintk_emit(int facility + + cpu = raw_smp_processor_id(); + +- text = rbuf; +- text_len = vscnprintf(text, PRINTK_SPRINT_MAX, fmt, args); ++ /* ++ * If this turns out to be an emergency message, there ++ * may need to be a prefix added. Leave room for it. 
++ */ ++ text = rbuf + PREFIX_MAX; ++ text_len = vscnprintf(text, PRINTK_SPRINT_MAX - PREFIX_MAX, fmt, args); + + /* strip and flag a trailing newline */ + if (text_len && text[text_len-1] == '\n') { +@@ -1814,6 +1835,14 @@ asmlinkage int vprintk_emit(int facility + if (dict) + lflags |= LOG_NEWLINE; + ++ /* ++ * NOTE: ++ * - rbuf points to beginning of allocated buffer ++ * - text points to beginning of text ++ * - there is room before text for prefix ++ */ ++ printk_emergency(rbuf, level, ts_nsec, cpu, text, text_len); ++ + printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu, + dict, dictlen, text, text_len); + +@@ -1906,7 +1935,7 @@ static ssize_t msg_print_ext_body(char * + char *dict, size_t dict_len, + char *text, size_t text_len) { return 0; } + static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len, +- const char *text, size_t len) {} ++ const char *text, size_t len, int level) {} + static size_t msg_print_text(const struct printk_log *msg, bool syslog, + bool time, char *buf, size_t size) { return 0; } + static bool suppress_message_printing(int level) { return false; } +@@ -2639,7 +2668,7 @@ static int printk_kthread_func(void *dat + + console_lock(); + call_console_drivers(master_seq, ext_text, +- ext_len, text, len); ++ ext_len, text, len, msg->level); + if (len > 0 || ext_len > 0) + printk_delay(msg->level); + console_unlock(); +@@ -3043,6 +3072,76 @@ void kmsg_dump_rewind(struct kmsg_dumper + logbuf_unlock_irqrestore(flags); + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); ++ ++static bool console_can_emergency(int level) ++{ ++ struct console *con; ++ ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ if (con->write_atomic && level < emergency_console_loglevel) ++ return true; ++ if (con->write && (con->flags & CON_BOOT)) ++ return true; ++ } ++ return false; ++} ++ ++static void call_emergency_console_drivers(int level, const char *text, ++ size_t text_len) ++{ ++ struct console *con; ++ ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ if (con->write_atomic && level < emergency_console_loglevel) { ++ con->write_atomic(con, text, text_len); ++ continue; ++ } ++ if (con->write && (con->flags & CON_BOOT)) { ++ con->write(con, text, text_len); ++ continue; ++ } ++ } ++} ++ ++static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu, ++ char *text, u16 text_len) ++{ ++ struct printk_log msg; ++ size_t prefix_len; ++ ++ if (!console_can_emergency(level)) ++ return; ++ ++ msg.level = level; ++ msg.ts_nsec = ts_nsec; ++ msg.cpu = cpu; ++ msg.facility = 0; ++ ++ /* "text" must have PREFIX_MAX preceding bytes available */ ++ ++ prefix_len = print_prefix(&msg, ++ console_msg_format & MSG_FORMAT_SYSLOG, ++ printk_time, buffer); ++ /* move the prefix forward to the beginning of the message text */ ++ text -= prefix_len; ++ memmove(text, buffer, prefix_len); ++ text_len += prefix_len; ++ ++ text[text_len++] = '\n'; ++ ++ call_emergency_console_drivers(level, text, text_len); ++ ++ touch_softlockup_watchdog_sync(); ++ clocksource_touch_watchdog(); ++ rcu_cpu_stall_reset(); ++ touch_nmi_watchdog(); ++ ++ printk_delay(level); ++} + #endif + + void console_atomic_lock(unsigned int *flags) +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -61,6 +61,23 @@ config CONSOLE_LOGLEVEL_QUIET + will be used as the loglevel. 
IOW passing "quiet" will be the + equivalent of passing "loglevel=" + ++config CONSOLE_LOGLEVEL_EMERGENCY ++ int "Emergency console loglevel (1-15)" ++ range 1 15 ++ default "5" ++ help ++ The loglevel to determine if a console message is an emergency ++ message. ++ ++ If supported by the console driver, emergency messages will be ++ flushed to the console immediately. This can cause significant system ++ latencies so the value should be set such that only significant ++ messages are classified as emergency messages. ++ ++ Setting a default here is equivalent to passing in ++ emergency_loglevel= in the kernel bootargs. emergency_loglevel= ++ continues to override whatever value is specified here as well. ++ + config MESSAGE_LOGLEVEL_DEFAULT + int "Default message log level (1-7)" + range 1 7 diff --git a/kernel/patches-5.4.x-rt/0037-0020-serial-8250-implement-write_atomic.patch b/kernel/patches-5.4.x-rt/0037-0020-serial-8250-implement-write_atomic.patch new file mode 100644 index 000000000..ea7e56c1a --- /dev/null +++ b/kernel/patches-5.4.x-rt/0037-0020-serial-8250-implement-write_atomic.patch @@ -0,0 +1,484 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:58 +0100 +Subject: [PATCH 20/25] serial: 8250: implement write_atomic + +Implement a non-sleeping NMI-safe write_atomic console function in +order to support emergency printk messages. + +Since interrupts need to be disabled during transmit, all usage of +the IER register was wrapped with access functions that use the +console_atomic_lock function to synchronize register access while +tracking the state of the interrupts. This was necessary because +write_atomic is can be calling from an NMI context that has +preempted write_atomic. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + drivers/tty/serial/8250/8250.h | 22 +++++ + drivers/tty/serial/8250/8250_core.c | 19 +++- + drivers/tty/serial/8250/8250_dma.c | 4 + drivers/tty/serial/8250/8250_port.c | 154 ++++++++++++++++++++++++++---------- + include/linux/serial_8250.h | 5 + + 5 files changed, 157 insertions(+), 47 deletions(-) + +--- a/drivers/tty/serial/8250/8250.h ++++ b/drivers/tty/serial/8250/8250.h +@@ -96,6 +96,10 @@ struct serial8250_config { + #define SERIAL8250_SHARE_IRQS 0 + #endif + ++void set_ier(struct uart_8250_port *up, unsigned char ier); ++void clear_ier(struct uart_8250_port *up); ++void restore_ier(struct uart_8250_port *up); ++ + #define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \ + { \ + .iobase = _base, \ +@@ -139,6 +143,15 @@ static inline bool serial8250_set_THRI(s + return true; + } + ++static inline bool serial8250_set_THRI_sier(struct uart_8250_port *up) ++{ ++ if (up->ier & UART_IER_THRI) ++ return false; ++ up->ier |= UART_IER_THRI; ++ set_ier(up, up->ier); ++ return true; ++} ++ + static inline bool serial8250_clear_THRI(struct uart_8250_port *up) + { + if (!(up->ier & UART_IER_THRI)) +@@ -148,6 +161,15 @@ static inline bool serial8250_clear_THRI + return true; + } + ++static inline bool serial8250_clear_THRI_sier(struct uart_8250_port *up) ++{ ++ if (!(up->ier & UART_IER_THRI)) ++ return false; ++ up->ier &= ~UART_IER_THRI; ++ set_ier(up, up->ier); ++ return true; ++} ++ + struct uart_8250_port *serial8250_get_port(int line); + + void serial8250_rpm_get(struct uart_8250_port *p); +--- a/drivers/tty/serial/8250/8250_core.c ++++ b/drivers/tty/serial/8250/8250_core.c +@@ -265,7 +265,7 @@ static void serial8250_timeout(struct ti + static void serial8250_backup_timeout(struct timer_list *t) + { + struct uart_8250_port *up = 
from_timer(up, t, timer); +- unsigned int iir, ier = 0, lsr; ++ unsigned int iir, lsr; + unsigned long flags; + + spin_lock_irqsave(&up->port.lock, flags); +@@ -274,10 +274,8 @@ static void serial8250_backup_timeout(st + * Must disable interrupts or else we risk racing with the interrupt + * based handler. + */ +- if (up->port.irq) { +- ier = serial_in(up, UART_IER); +- serial_out(up, UART_IER, 0); +- } ++ if (up->port.irq) ++ clear_ier(up); + + iir = serial_in(up, UART_IIR); + +@@ -300,7 +298,7 @@ static void serial8250_backup_timeout(st + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ restore_ier(up); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -578,6 +576,14 @@ serial8250_register_ports(struct uart_dr + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -663,6 +669,7 @@ static int univ8250_console_match(struct + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +--- a/drivers/tty/serial/8250/8250_dma.c ++++ b/drivers/tty/serial/8250/8250_dma.c +@@ -35,7 +35,7 @@ static void __dma_tx_complete(void *para + + ret = serial8250_tx_dma(p); + if (ret) +- serial8250_set_THRI(p); ++ serial8250_set_THRI_sier(p); + + spin_unlock_irqrestore(&p->port.lock, flags); + } +@@ -98,7 +98,7 @@ int serial8250_tx_dma(struct uart_8250_p + dma_async_issue_pending(dma->txchan); + if (dma->tx_err) { + dma->tx_err = 0; +- serial8250_clear_THRI(p); ++ serial8250_clear_THRI_sier(p); + } + return 0; + err: +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -721,7 +721,7 @@ static void serial8250_set_sleep(struct + serial_out(p, UART_EFR, UART_EFR_ECB); + serial_out(p, UART_LCR, 0); + } +- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ set_ier(p, sleep ? 
UART_IERX_SLEEP : 0); + if (p->capabilities & UART_CAP_EFR) { + serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); + serial_out(p, UART_EFR, efr); +@@ -1390,7 +1390,7 @@ static void serial8250_stop_rx(struct ua + + up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); + up->port.read_status_mask &= ~UART_LSR_DR; +- serial_port_out(port, UART_IER, up->ier); ++ set_ier(up, up->ier); + + serial8250_rpm_put(up); + } +@@ -1408,7 +1408,7 @@ static void __do_stop_tx_rs485(struct ua + serial8250_clear_and_reinit_fifos(p); + + p->ier |= UART_IER_RLSI | UART_IER_RDI; +- serial_port_out(&p->port, UART_IER, p->ier); ++ set_ier(p, p->ier); + } + } + static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) +@@ -1459,7 +1459,7 @@ static void __stop_tx_rs485(struct uart_ + + static inline void __do_stop_tx(struct uart_8250_port *p) + { +- if (serial8250_clear_THRI(p)) ++ if (serial8250_clear_THRI_sier(p)) + serial8250_rpm_put_tx(p); + } + +@@ -1509,7 +1509,7 @@ static inline void __start_tx(struct uar + if (up->dma && !up->dma->tx_dma(up)) + return; + +- if (serial8250_set_THRI(up)) { ++ if (serial8250_set_THRI_sier(up)) { + if (up->bugs & UART_BUG_TXEN) { + unsigned char lsr; + +@@ -1616,7 +1616,7 @@ static void serial8250_disable_ms(struct + mctrl_gpio_disable_ms(up->gpios); + + up->ier &= ~UART_IER_MSI; +- serial_port_out(port, UART_IER, up->ier); ++ set_ier(up, up->ier); + } + + static void serial8250_enable_ms(struct uart_port *port) +@@ -1632,7 +1632,7 @@ static void serial8250_enable_ms(struct + up->ier |= UART_IER_MSI; + + serial8250_rpm_get(up); +- serial_port_out(port, UART_IER, up->ier); ++ set_ier(up, up->ier); + serial8250_rpm_put(up); + } + +@@ -1991,6 +1991,52 @@ static void wait_for_xmitr(struct uart_8 + } + } + ++static atomic_t ier_counter = ATOMIC_INIT(0); ++static atomic_t ier_value = ATOMIC_INIT(0); ++ ++void set_ier(struct uart_8250_port *up, unsigned char ier) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ ++ console_atomic_lock(&flags); ++ if (atomic_read(&ier_counter) > 0) ++ atomic_set(&ier_value, ier); ++ else ++ serial_port_out(port, UART_IER, ier); ++ console_atomic_unlock(flags); ++} ++ ++void clear_ier(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int ier_cleared = 0; ++ unsigned int flags; ++ unsigned int ier; ++ ++ console_atomic_lock(&flags); ++ atomic_inc(&ier_counter); ++ ier = serial_port_in(port, UART_IER); ++ if (up->capabilities & UART_CAP_UUE) ++ ier_cleared = UART_IER_UUE; ++ if (ier != ier_cleared) { ++ serial_port_out(port, UART_IER, ier_cleared); ++ atomic_set(&ier_value, ier); ++ } ++ console_atomic_unlock(flags); ++} ++ ++void restore_ier(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ ++ console_atomic_lock(&flags); ++ if (atomic_fetch_dec(&ier_counter) == 1) ++ serial_port_out(port, UART_IER, atomic_read(&ier_value)); ++ console_atomic_unlock(flags); ++} ++ + #ifdef CONFIG_CONSOLE_POLL + /* + * Console polling routines for writing and reading from the uart while +@@ -2022,18 +2068,10 @@ static int serial8250_get_poll_char(stru + static void serial8250_put_poll_char(struct uart_port *port, + unsigned char c) + { +- unsigned int ier; + struct uart_8250_port *up = up_to_u8250p(port); + + serial8250_rpm_get(up); +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); 
++ clear_ier(up); + + wait_for_xmitr(up, BOTH_EMPTY); + /* +@@ -2046,7 +2084,7 @@ static void serial8250_put_poll_char(str + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); +- serial_port_out(port, UART_IER, ier); ++ restore_ier(up); + serial8250_rpm_put(up); + } + +@@ -2358,7 +2396,7 @@ void serial8250_do_shutdown(struct uart_ + */ + spin_lock_irqsave(&port->lock, flags); + up->ier = 0; +- serial_port_out(port, UART_IER, 0); ++ set_ier(up, 0); + spin_unlock_irqrestore(&port->lock, flags); + + synchronize_irq(port->irq); +@@ -2643,7 +2681,7 @@ serial8250_do_set_termios(struct uart_po + if (up->capabilities & UART_CAP_RTOIE) + up->ier |= UART_IER_RTOIE; + +- serial_port_out(port, UART_IER, up->ier); ++ set_ier(up, up->ier); + + if (up->capabilities & UART_CAP_EFR) { + unsigned char efr = 0; +@@ -3107,7 +3145,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_default + + #ifdef CONFIG_SERIAL_8250_CONSOLE + +-static void serial8250_console_putchar(struct uart_port *port, int ch) ++static void serial8250_console_putchar_locked(struct uart_port *port, int ch) + { + struct uart_8250_port *up = up_to_u8250p(port); + +@@ -3115,6 +3153,18 @@ static void serial8250_console_putchar(s + serial_port_out(port, UART_TX, ch); + } + ++static void serial8250_console_putchar(struct uart_port *port, int ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int flags; ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ ++ console_atomic_lock(&flags); ++ serial8250_console_putchar_locked(port, ch); ++ console_atomic_unlock(flags); ++} ++ + /* + * Restore serial console when h/w power-off detected + */ +@@ -3136,6 +3186,42 @@ static void serial8250_console_restore(s + serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); + } + ++void serial8250_console_write_atomic(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ bool locked; ++ ++ console_atomic_lock(&flags); ++ ++ /* ++ * If possible, keep any other CPUs from working with the ++ * UART until the atomic message is completed. This helps ++ * to keep the output more orderly. ++ */ ++ locked = spin_trylock(&port->lock); ++ ++ touch_nmi_watchdog(); ++ ++ clear_ier(up); ++ ++ if (atomic_fetch_inc(&up->console_printing)) { ++ uart_console_write(port, "\n", 1, ++ serial8250_console_putchar_locked); ++ } ++ uart_console_write(port, s, count, serial8250_console_putchar_locked); ++ atomic_dec(&up->console_printing); ++ ++ wait_for_xmitr(up, BOTH_EMPTY); ++ restore_ier(up); ++ ++ if (locked) ++ spin_unlock(&port->lock); ++ ++ console_atomic_unlock(flags); ++} ++ + /* + * Print a string to the serial port trying not to disturb + * any possible real use of the port... 
+@@ -3147,27 +3233,13 @@ void serial8250_console_write(struct uar + { + struct uart_port *port = &up->port; + unsigned long flags; +- unsigned int ier; +- int locked = 1; + + touch_nmi_watchdog(); + + serial8250_rpm_get(up); ++ spin_lock_irqsave(&port->lock, flags); + +- if (oops_in_progress) +- locked = spin_trylock_irqsave(&port->lock, flags); +- else +- spin_lock_irqsave(&port->lock, flags); +- +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); +- +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ clear_ier(up); + + /* check scratch reg to see if port powered off during system sleep */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { +@@ -3175,14 +3247,16 @@ void serial8250_console_write(struct uar + up->canary = 0; + } + ++ atomic_inc(&up->console_printing); + uart_console_write(port, s, count, serial8250_console_putchar); ++ atomic_dec(&up->console_printing); + + /* + * Finally, wait for transmitter to become empty + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); +- serial_port_out(port, UART_IER, ier); ++ restore_ier(up); + + /* + * The receive handling will happen properly because the +@@ -3194,8 +3268,7 @@ void serial8250_console_write(struct uar + if (up->msr_saved_flags) + serial8250_modem_status(up); + +- if (locked) +- spin_unlock_irqrestore(&port->lock, flags); ++ spin_unlock_irqrestore(&port->lock, flags); + serial8250_rpm_put(up); + } + +@@ -3216,6 +3289,7 @@ static unsigned int probe_baud(struct ua + + int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + { ++ struct uart_8250_port *up = up_to_u8250p(port); + int baud = 9600; + int bits = 8; + int parity = 'n'; +@@ -3224,6 +3298,8 @@ int serial8250_console_setup(struct uart + if (!port->iobase && !port->membase) + return -ENODEV; + ++ atomic_set(&up->console_printing, 0); ++ + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + else if (probe) +--- a/include/linux/serial_8250.h ++++ b/include/linux/serial_8250.h +@@ -7,6 +7,7 @@ + #ifndef _LINUX_SERIAL_8250_H + #define _LINUX_SERIAL_8250_H + ++#include + #include + #include + #include +@@ -123,6 +124,8 @@ struct uart_8250_port { + #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA + unsigned char msr_saved_flags; + ++ atomic_t console_printing; ++ + struct uart_8250_dma *dma; + const struct uart_8250_ops *ops; + +@@ -174,6 +177,8 @@ void serial8250_init_port(struct uart_82 + void serial8250_set_defaults(struct uart_8250_port *up); + void serial8250_console_write(struct uart_8250_port *up, const char *s, + unsigned int count); ++void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, ++ unsigned int count); + int serial8250_console_setup(struct uart_port *port, char *options, bool probe); + + extern void serial8250_set_isa_configurator(void (*v) diff --git a/kernel/patches-5.4.x-rt/0038-0021-printk-implement-KERN_CONT.patch b/kernel/patches-5.4.x-rt/0038-0021-printk-implement-KERN_CONT.patch new file mode 100644 index 000000000..f2c03f9f2 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0038-0021-printk-implement-KERN_CONT.patch @@ -0,0 +1,132 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:29:59 +0100 +Subject: [PATCH 21/25] printk: implement KERN_CONT + +Implement KERN_CONT based on the printing CPU rather than on the +printing task. 
As long as the KERN_CONT messages are coming from the +same CPU and no non-KERN_CONT messages come, the messages are assumed +to belong to each other. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 65 +++++++++++++++++++++++++++---------------------- + 1 file changed, 37 insertions(+), 28 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1709,8 +1709,6 @@ static inline u32 printk_caller_id(void) + 0x80000000 + raw_smp_processor_id(); + } + +-/* FIXME: no support for LOG_CONT */ +-#if 0 + /* + * Continuation lines are buffered, and not committed to the record buffer + * until the line is complete, or a race forces it. The line fragments +@@ -1721,52 +1719,55 @@ static struct cont { + char buf[LOG_LINE_MAX]; + size_t len; /* length == 0 means unused buffer */ + u32 caller_id; /* printk_caller_id() of first print */ ++ int cpu_owner; /* cpu of first print */ + u64 ts_nsec; /* time of first print */ + u8 level; /* log level of first message */ + u8 facility; /* log facility of first message */ + enum log_flags flags; /* prefix, newline flags */ +-} cont; ++} cont[2]; + +-static void cont_flush(void) ++static void cont_flush(int ctx) + { +- if (cont.len == 0) ++ struct cont *c = &cont[ctx]; ++ ++ if (c->len == 0) + return; + +- log_store(cont.caller_id, cont.facility, cont.level, cont.flags, +- cont.ts_nsec, NULL, 0, cont.buf, cont.len); +- cont.len = 0; ++ log_store(c->caller_id, c->facility, c->level, c->flags, ++ c->ts_nsec, c->cpu_owner, NULL, 0, c->buf, c->len); ++ c->len = 0; + } + +-static bool cont_add(u32 caller_id, int facility, int level, ++static void cont_add(int ctx, int cpu, u32 caller_id, int facility, int level, + enum log_flags flags, const char *text, size_t len) + { ++ struct cont *c = &cont[ctx]; ++ ++ if (cpu != c->cpu_owner || !(flags & LOG_CONT)) ++ cont_flush(ctx); ++ + /* If the line gets too long, split it up in separate records. */ +- if (cont.len + len > sizeof(cont.buf)) { +- cont_flush(); +- return false; +- } ++ while (c->len + len > sizeof(c->buf)) ++ cont_flush(ctx); + +- if (!cont.len) { +- cont.facility = facility; +- cont.level = level; +- cont.caller_id = caller_id; +- cont.ts_nsec = local_clock(); +- cont.flags = flags; ++ if (!c->len) { ++ c->facility = facility; ++ c->level = level; ++ c->caller_id = caller_id; ++ c->ts_nsec = local_clock(); ++ c->flags = flags; ++ c->cpu_owner = cpu; + } + +- memcpy(cont.buf + cont.len, text, len); +- cont.len += len; ++ memcpy(c->buf + c->len, text, len); ++ c->len += len; + + // The original flags come from the first line, + // but later continuations can add a newline. 
+ if (flags & LOG_NEWLINE) { +- cont.flags |= LOG_NEWLINE; +- cont_flush(); ++ c->flags |= LOG_NEWLINE; + } +- +- return true; + } +-#endif /* 0 */ + + /* ring buffer used as memory allocator for temporary sprint buffers */ + DECLARE_STATIC_PRINTKRB(sprint_rb, +@@ -1778,6 +1779,7 @@ asmlinkage int vprintk_emit(int facility + const char *fmt, va_list args) + { + const u32 caller_id = printk_caller_id(); ++ int ctx = !!in_nmi(); + enum log_flags lflags = 0; + int printed_len = 0; + struct prb_handle h; +@@ -1843,8 +1845,15 @@ asmlinkage int vprintk_emit(int facility + */ + printk_emergency(rbuf, level, ts_nsec, cpu, text, text_len); + +- printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu, +- dict, dictlen, text, text_len); ++ if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) { ++ cont_add(ctx, cpu, caller_id, facility, level, lflags, text, text_len); ++ printed_len = text_len; ++ } else { ++ if (cpu == cont[ctx].cpu_owner) ++ cont_flush(ctx); ++ printed_len = log_store(caller_id, facility, level, lflags, ts_nsec, cpu, ++ dict, dictlen, text, text_len); ++ } + + prb_commit(&h); + return printed_len; diff --git a/kernel/patches-5.4.x-rt/0039-0022-printk-implement-dev-kmsg.patch b/kernel/patches-5.4.x-rt/0039-0022-printk-implement-dev-kmsg.patch new file mode 100644 index 000000000..411044a17 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0039-0022-printk-implement-dev-kmsg.patch @@ -0,0 +1,304 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:30:00 +0100 +Subject: [PATCH 22/25] printk: implement /dev/kmsg + +Since printk messages are now logged to a new ring buffer, update +the /dev/kmsg functions to pull the messages from there. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + fs/proc/kmsg.c | 4 - + include/linux/printk.h | 1 + kernel/printk/printk.c | 162 +++++++++++++++++++++++++++++++++---------------- + 3 files changed, 113 insertions(+), 54 deletions(-) + +--- a/fs/proc/kmsg.c ++++ b/fs/proc/kmsg.c +@@ -18,8 +18,6 @@ + #include + #include + +-extern wait_queue_head_t log_wait; +- + static int kmsg_open(struct inode * inode, struct file * file) + { + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); +@@ -42,7 +40,7 @@ static ssize_t kmsg_read(struct file *fi + + static __poll_t kmsg_poll(struct file *file, poll_table *wait) + { +- poll_wait(file, &log_wait, wait); ++ poll_wait(file, printk_wait_queue(), wait); + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) + return EPOLLIN | EPOLLRDNORM; + return 0; +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -192,6 +192,7 @@ void __init setup_log_buf(int early); + void dump_stack_print_info(const char *log_lvl); + void show_regs_print_info(const char *log_lvl); + extern asmlinkage void dump_stack(void) __cold; ++struct wait_queue_head *printk_wait_queue(void); + #else + static inline __printf(1, 0) + int vprintk(const char *s, va_list args) +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -673,10 +673,11 @@ static ssize_t msg_print_ext_body(char * + /* /dev/kmsg - userspace message inject/listen interface */ + struct devkmsg_user { + u64 seq; +- u32 idx; ++ struct prb_iterator iter; + struct ratelimit_state rs; + struct mutex lock; + char buf[CONSOLE_EXT_LOG_MAX]; ++ char msgbuf[PRINTK_RECORD_MAX]; + }; + + static __printf(3, 4) __cold +@@ -759,9 +760,11 @@ static ssize_t devkmsg_read(struct file + size_t count, loff_t *ppos) + { + struct devkmsg_user *user = file->private_data; ++ struct prb_iterator backup_iter; + struct 
printk_log *msg; +- size_t len; + ssize_t ret; ++ size_t len; ++ u64 seq; + + if (!user) + return -EBADF; +@@ -770,52 +773,67 @@ static ssize_t devkmsg_read(struct file + if (ret) + return ret; + +- logbuf_lock_irq(); +- while (user->seq == log_next_seq) { +- if (file->f_flags & O_NONBLOCK) { +- ret = -EAGAIN; +- logbuf_unlock_irq(); +- goto out; +- } ++ /* make a backup copy in case there is a problem */ ++ prb_iter_copy(&backup_iter, &user->iter); + +- logbuf_unlock_irq(); +- ret = wait_event_interruptible(log_wait, +- user->seq != log_next_seq); +- if (ret) +- goto out; +- logbuf_lock_irq(); ++ if (file->f_flags & O_NONBLOCK) { ++ ret = prb_iter_next(&user->iter, &user->msgbuf[0], ++ sizeof(user->msgbuf), &seq); ++ } else { ++ ret = prb_iter_wait_next(&user->iter, &user->msgbuf[0], ++ sizeof(user->msgbuf), &seq); + } +- +- if (user->seq < log_first_seq) { +- /* our last seen message is gone, return error and reset */ +- user->idx = log_first_idx; +- user->seq = log_first_seq; ++ if (ret == 0) { ++ /* end of list */ ++ ret = -EAGAIN; ++ goto out; ++ } else if (ret == -EINVAL) { ++ /* iterator invalid, return error and reset */ + ret = -EPIPE; +- logbuf_unlock_irq(); ++ prb_iter_init(&user->iter, &printk_rb, &user->seq); ++ goto out; ++ } else if (ret < 0) { ++ /* interrupted by signal */ + goto out; + } + +- msg = log_from_idx(user->idx); ++ if (user->seq == 0) { ++ user->seq = seq; ++ } else { ++ user->seq++; ++ if (user->seq < seq) { ++ ret = -EPIPE; ++ goto restore_out; ++ } ++ } ++ ++ msg = (struct printk_log *)&user->msgbuf[0]; + len = msg_print_ext_header(user->buf, sizeof(user->buf), + msg, user->seq); + len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len, + log_dict(msg), msg->dict_len, + log_text(msg), msg->text_len); + +- user->idx = log_next(user->idx); +- user->seq++; +- logbuf_unlock_irq(); +- + if (len > count) { + ret = -EINVAL; +- goto out; ++ goto restore_out; + } + + if (copy_to_user(buf, user->buf, len)) { + ret = -EFAULT; +- goto out; ++ goto restore_out; + } ++ + ret = len; ++ goto out; ++restore_out: ++ /* ++ * There was an error, but this message should not be ++ * lost because of it. Restore the backup and setup ++ * seq so that it will work with the next read. ++ */ ++ prb_iter_copy(&user->iter, &backup_iter); ++ user->seq = seq - 1; + out: + mutex_unlock(&user->lock); + return ret; +@@ -824,19 +842,21 @@ static ssize_t devkmsg_read(struct file + static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + { + struct devkmsg_user *user = file->private_data; +- loff_t ret = 0; ++ loff_t ret; + + if (!user) + return -EBADF; + if (offset) + return -ESPIPE; + +- logbuf_lock_irq(); ++ ret = mutex_lock_interruptible(&user->lock); ++ if (ret) ++ return ret; ++ + switch (whence) { + case SEEK_SET: + /* the first record */ +- user->idx = log_first_idx; +- user->seq = log_first_seq; ++ prb_iter_init(&user->iter, &printk_rb, &user->seq); + break; + case SEEK_DATA: + /* +@@ -844,40 +864,83 @@ static loff_t devkmsg_llseek(struct file + * like issued by 'dmesg -c'. Reading /dev/kmsg itself + * changes no global state, and does not clear anything. + */ +- user->idx = clear_idx; +- user->seq = clear_seq; ++ for (;;) { ++ prb_iter_init(&user->iter, &printk_rb, NULL); ++ ret = prb_iter_seek(&user->iter, clear_seq); ++ if (ret > 0) { ++ /* seeked to clear seq */ ++ user->seq = clear_seq; ++ break; ++ } else if (ret == 0) { ++ /* ++ * The end of the list was hit without ++ * ever seeing the clear seq. Just ++ * seek to the beginning of the list. 
++ */ ++ prb_iter_init(&user->iter, &printk_rb, ++ &user->seq); ++ break; ++ } ++ /* iterator invalid, start over */ ++ } ++ ret = 0; + break; + case SEEK_END: + /* after the last record */ +- user->idx = log_next_idx; +- user->seq = log_next_seq; ++ for (;;) { ++ ret = prb_iter_next(&user->iter, NULL, 0, &user->seq); ++ if (ret == 0) ++ break; ++ else if (ret > 0) ++ continue; ++ /* iterator invalid, start over */ ++ prb_iter_init(&user->iter, &printk_rb, &user->seq); ++ } ++ ret = 0; + break; + default: + ret = -EINVAL; + } +- logbuf_unlock_irq(); ++ ++ mutex_unlock(&user->lock); + return ret; + } + ++struct wait_queue_head *printk_wait_queue(void) ++{ ++ /* FIXME: using prb internals! */ ++ return printk_rb.wq; ++} ++ + static __poll_t devkmsg_poll(struct file *file, poll_table *wait) + { + struct devkmsg_user *user = file->private_data; ++ struct prb_iterator iter; + __poll_t ret = 0; ++ int rbret; ++ u64 seq; + + if (!user) + return EPOLLERR|EPOLLNVAL; + +- poll_wait(file, &log_wait, wait); ++ poll_wait(file, printk_wait_queue(), wait); + +- logbuf_lock_irq(); +- if (user->seq < log_next_seq) { +- /* return error when data has vanished underneath us */ +- if (user->seq < log_first_seq) +- ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; +- else +- ret = EPOLLIN|EPOLLRDNORM; +- } +- logbuf_unlock_irq(); ++ mutex_lock(&user->lock); ++ ++ /* use copy so no actual iteration takes place */ ++ prb_iter_copy(&iter, &user->iter); ++ ++ rbret = prb_iter_next(&iter, &user->msgbuf[0], ++ sizeof(user->msgbuf), &seq); ++ if (rbret == 0) ++ goto out; ++ ++ ret = EPOLLIN|EPOLLRDNORM; ++ ++ if (rbret < 0 || (seq - user->seq) != 1) ++ ret |= EPOLLERR|EPOLLPRI; ++out: ++ mutex_unlock(&user->lock); + + return ret; + } +@@ -907,10 +970,7 @@ static int devkmsg_open(struct inode *in + + mutex_init(&user->lock); + +- logbuf_lock_irq(); +- user->idx = log_first_idx; +- user->seq = log_first_seq; +- logbuf_unlock_irq(); ++ prb_iter_init(&user->iter, &printk_rb, &user->seq); + + file->private_data = user; + return 0; diff --git a/kernel/patches-5.4.x-rt/0040-0023-printk-implement-syslog.patch b/kernel/patches-5.4.x-rt/0040-0023-printk-implement-syslog.patch new file mode 100644 index 000000000..c5af3128e --- /dev/null +++ b/kernel/patches-5.4.x-rt/0040-0023-printk-implement-syslog.patch @@ -0,0 +1,493 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:30:01 +0100 +Subject: [PATCH 23/25] printk: implement syslog + +Since printk messages are now logged to a new ring buffer, update +the syslog functions to pull the messages from there. 
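One detail worth illustrating is SYSLOG_ACTION_SIZE_UNREAD, which this patch answers in two ways: /proc/kmsg pollers get a record count, while syslog(2) callers get a formatted byte count. The sketch below models count_remaining() in userspace with an invented record type; the real code walks a copy of a prb_iterator and re-initializes it if records were overwritten.

#include <stdio.h>

struct rec {
        int level;
        const char *text;
};

/* Byte count of one record as it would be formatted for syslog(2). */
static size_t formatted_len(const struct rec *r)
{
        char buf[64];

        return (size_t)snprintf(buf, sizeof(buf), "<%d>%s\n", r->level, r->text);
}

/* records != 0: count unread records (for the /proc/kmsg poll shortcut).
 * records == 0: count unread formatted bytes (SYSLOG_ACTION_SIZE_UNREAD). */
static size_t count_remaining(const struct rec *logbuf, size_t next_seq,
                              size_t seen_seq, int records)
{
        size_t len = 0;

        for (size_t seq = seen_seq; seq < next_seq; seq++)
                len += records ? 1 : formatted_len(&logbuf[seq]);
        return len;
}

int main(void)
{
        struct rec logbuf[3] = {
                { 6, "eth0: link up" },
                { 4, "disk almost full" },
                { 6, "checkpoint done" },
        };

        /* reader has consumed record 0; records 1 and 2 are still unread */
        printf("unread records: %zu\n", count_remaining(logbuf, 3, 1, 1));
        printf("unread bytes:   %zu\n", count_remaining(logbuf, 3, 1, 0));
        return 0;
}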
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 342 +++++++++++++++++++++++++++++++++---------------- + 1 file changed, 236 insertions(+), 106 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -407,10 +407,12 @@ DECLARE_STATIC_PRINTKRB_CPULOCK(printk_c + /* record buffer */ + DECLARE_STATIC_PRINTKRB(printk_rb, CONFIG_LOG_BUF_SHIFT, &printk_cpulock); + ++static DEFINE_MUTEX(syslog_lock); ++DECLARE_STATIC_PRINTKRB_ITER(syslog_iter, &printk_rb); ++ + DECLARE_WAIT_QUEUE_HEAD(log_wait); + /* the next printk record to read by syslog(READ) or /proc/kmsg */ + static u64 syslog_seq; +-static u32 syslog_idx; + static size_t syslog_partial; + static bool syslog_time; + +@@ -1303,30 +1305,42 @@ static size_t msg_print_text(const struc + return len; + } + +-static int syslog_print(char __user *buf, int size) ++static int syslog_print(char __user *buf, int size, char *text, ++ char *msgbuf, int *locked) + { +- char *text; ++ struct prb_iterator iter; + struct printk_log *msg; + int len = 0; +- +- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); +- if (!text) +- return -ENOMEM; ++ u64 seq; ++ int ret; + + while (size > 0) { + size_t n; + size_t skip; + +- logbuf_lock_irq(); +- if (syslog_seq < log_first_seq) { +- /* messages are gone, move to first one */ +- syslog_seq = log_first_seq; +- syslog_idx = log_first_idx; +- syslog_partial = 0; ++ for (;;) { ++ prb_iter_copy(&iter, &syslog_iter); ++ ret = prb_iter_next(&iter, msgbuf, ++ PRINTK_RECORD_MAX, &seq); ++ if (ret < 0) { ++ /* messages are gone, move to first one */ ++ prb_iter_init(&syslog_iter, &printk_rb, ++ &syslog_seq); ++ syslog_partial = 0; ++ continue; ++ } ++ break; + } +- if (syslog_seq == log_next_seq) { +- logbuf_unlock_irq(); ++ if (ret == 0) + break; ++ ++ /* ++ * If messages have been missed, the partial tracker ++ * is no longer valid and must be reset. 
++ */ ++ if (syslog_seq > 0 && seq - 1 != syslog_seq) { ++ syslog_seq = seq - 1; ++ syslog_partial = 0; + } + + /* +@@ -1336,131 +1350,212 @@ static int syslog_print(char __user *buf + if (!syslog_partial) + syslog_time = printk_time; + ++ msg = (struct printk_log *)msgbuf; ++ + skip = syslog_partial; +- msg = log_from_idx(syslog_idx); + n = msg_print_text(msg, true, syslog_time, text, +- LOG_LINE_MAX + PREFIX_MAX); ++ PRINTK_SPRINT_MAX); + if (n - syslog_partial <= size) { + /* message fits into buffer, move forward */ +- syslog_idx = log_next(syslog_idx); +- syslog_seq++; ++ prb_iter_next(&syslog_iter, NULL, 0, &syslog_seq); + n -= syslog_partial; + syslog_partial = 0; +- } else if (!len){ ++ } else if (!len) { + /* partial read(), remember position */ + n = size; + syslog_partial += n; + } else + n = 0; +- logbuf_unlock_irq(); + + if (!n) + break; + ++ mutex_unlock(&syslog_lock); + if (copy_to_user(buf, text + skip, n)) { + if (!len) + len = -EFAULT; ++ *locked = 0; + break; + } ++ ret = mutex_lock_interruptible(&syslog_lock); + + len += n; + size -= n; + buf += n; ++ ++ if (ret) { ++ if (!len) ++ len = ret; ++ *locked = 0; ++ break; ++ } + } + +- kfree(text); + return len; + } + +-static int syslog_print_all(char __user *buf, int size, bool clear) ++static int count_remaining(struct prb_iterator *iter, u64 until_seq, ++ char *msgbuf, int size, bool records, bool time) + { +- char *text; ++ struct prb_iterator local_iter; ++ struct printk_log *msg; + int len = 0; +- u64 next_seq; + u64 seq; +- u32 idx; ++ int ret; ++ ++ prb_iter_copy(&local_iter, iter); ++ for (;;) { ++ ret = prb_iter_next(&local_iter, msgbuf, size, &seq); ++ if (ret == 0) { ++ break; ++ } else if (ret < 0) { ++ /* the iter is invalid, restart from head */ ++ prb_iter_init(&local_iter, &printk_rb, NULL); ++ len = 0; ++ continue; ++ } ++ ++ if (until_seq && seq >= until_seq) ++ break; ++ ++ if (records) { ++ len++; ++ } else { ++ msg = (struct printk_log *)msgbuf; ++ len += msg_print_text(msg, true, time, NULL, 0); ++ } ++ } ++ ++ return len; ++} ++ ++static void syslog_clear(void) ++{ ++ struct prb_iterator iter; ++ int ret; ++ ++ prb_iter_init(&iter, &printk_rb, &clear_seq); ++ for (;;) { ++ ret = prb_iter_next(&iter, NULL, 0, &clear_seq); ++ if (ret == 0) ++ break; ++ else if (ret < 0) ++ prb_iter_init(&iter, &printk_rb, &clear_seq); ++ } ++} ++ ++static int syslog_print_all(char __user *buf, int size, bool clear) ++{ ++ struct prb_iterator iter; ++ struct printk_log *msg; ++ char *msgbuf = NULL; ++ char *text = NULL; ++ int textlen; ++ u64 seq = 0; ++ int len = 0; + bool time; ++ int ret; + +- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; ++ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL); ++ if (!msgbuf) { ++ kfree(text); ++ return -ENOMEM; ++ } + + time = printk_time; +- logbuf_lock_irq(); ++ + /* +- * Find first record that fits, including all following records, +- * into the user-provided buffer for this dump. ++ * Setup iter to last event before clear. Clear may ++ * be lost, but keep going with a best effort. 
+ */ +- seq = clear_seq; +- idx = clear_idx; +- while (seq < log_next_seq) { +- struct printk_log *msg = log_from_idx(idx); +- +- len += msg_print_text(msg, true, time, NULL, 0); +- idx = log_next(idx); +- seq++; +- } ++ prb_iter_init(&iter, &printk_rb, NULL); ++ prb_iter_seek(&iter, clear_seq); + +- /* move first record forward until length fits into the buffer */ +- seq = clear_seq; +- idx = clear_idx; +- while (len > size && seq < log_next_seq) { +- struct printk_log *msg = log_from_idx(idx); ++ /* count the total bytes after clear */ ++ len = count_remaining(&iter, 0, msgbuf, PRINTK_RECORD_MAX, ++ false, time); ++ ++ /* move iter forward until length fits into the buffer */ ++ while (len > size) { ++ ret = prb_iter_next(&iter, msgbuf, ++ PRINTK_RECORD_MAX, &seq); ++ if (ret == 0) { ++ break; ++ } else if (ret < 0) { ++ /* ++ * The iter is now invalid so clear will ++ * also be invalid. Restart from the head. ++ */ ++ prb_iter_init(&iter, &printk_rb, NULL); ++ len = count_remaining(&iter, 0, msgbuf, ++ PRINTK_RECORD_MAX, false, time); ++ continue; ++ } + ++ msg = (struct printk_log *)msgbuf; + len -= msg_print_text(msg, true, time, NULL, 0); +- idx = log_next(idx); +- seq++; +- } + +- /* last message fitting into this dump */ +- next_seq = log_next_seq; ++ if (clear) ++ clear_seq = seq; ++ } + ++ /* copy messages to buffer */ + len = 0; +- while (len >= 0 && seq < next_seq) { +- struct printk_log *msg = log_from_idx(idx); +- int textlen = msg_print_text(msg, true, time, text, +- LOG_LINE_MAX + PREFIX_MAX); ++ while (len >= 0 && len < size) { ++ if (clear) ++ clear_seq = seq; + +- idx = log_next(idx); +- seq++; ++ ret = prb_iter_next(&iter, msgbuf, ++ PRINTK_RECORD_MAX, &seq); ++ if (ret == 0) { ++ break; ++ } else if (ret < 0) { ++ /* ++ * The iter is now invalid. Make a best ++ * effort to grab the rest of the log ++ * from the new head. 
++ */ ++ prb_iter_init(&iter, &printk_rb, NULL); ++ continue; ++ } ++ ++ msg = (struct printk_log *)msgbuf; ++ textlen = msg_print_text(msg, true, time, text, ++ PRINTK_SPRINT_MAX); ++ if (textlen < 0) { ++ len = textlen; ++ break; ++ } + +- logbuf_unlock_irq(); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; +- logbuf_lock_irq(); +- +- if (seq < log_first_seq) { +- /* messages are gone, move to next one */ +- seq = log_first_seq; +- idx = log_first_idx; +- } + } + +- if (clear) { +- clear_seq = log_next_seq; +- clear_idx = log_next_idx; +- } +- logbuf_unlock_irq(); ++ if (clear && !seq) ++ syslog_clear(); + +- kfree(text); ++ if (text) ++ kfree(text); ++ if (msgbuf) ++ kfree(msgbuf); + return len; + } + +-static void syslog_clear(void) +-{ +- logbuf_lock_irq(); +- clear_seq = log_next_seq; +- clear_idx = log_next_idx; +- logbuf_unlock_irq(); +-} +- + int do_syslog(int type, char __user *buf, int len, int source) + { + bool clear = false; + static int saved_console_loglevel = LOGLEVEL_DEFAULT; ++ struct prb_iterator iter; ++ char *msgbuf = NULL; ++ char *text = NULL; ++ int locked; + int error; ++ int ret; + + error = check_syslog_permissions(type, source); + if (error) +@@ -1478,11 +1573,49 @@ int do_syslog(int type, char __user *buf + return 0; + if (!access_ok(buf, len)) + return -EFAULT; +- error = wait_event_interruptible(log_wait, +- syslog_seq != log_next_seq); ++ ++ text = kmalloc(PRINTK_SPRINT_MAX, GFP_KERNEL); ++ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL); ++ if (!text || !msgbuf) { ++ error = -ENOMEM; ++ goto out; ++ } ++ ++ error = mutex_lock_interruptible(&syslog_lock); + if (error) +- return error; +- error = syslog_print(buf, len); ++ goto out; ++ ++ /* ++ * Wait until a first message is available. Use a copy ++ * because no iteration should occur for syslog now. ++ */ ++ for (;;) { ++ prb_iter_copy(&iter, &syslog_iter); ++ ++ mutex_unlock(&syslog_lock); ++ ret = prb_iter_wait_next(&iter, NULL, 0, NULL); ++ if (ret == -ERESTARTSYS) { ++ error = ret; ++ goto out; ++ } ++ error = mutex_lock_interruptible(&syslog_lock); ++ if (error) ++ goto out; ++ ++ if (ret == -EINVAL) { ++ prb_iter_init(&syslog_iter, &printk_rb, ++ &syslog_seq); ++ syslog_partial = 0; ++ continue; ++ } ++ break; ++ } ++ ++ /* print as much as will fit in the user buffer */ ++ locked = 1; ++ error = syslog_print(buf, len, text, msgbuf, &locked); ++ if (locked) ++ mutex_unlock(&syslog_lock); + break; + /* Read/clear last kernel messages */ + case SYSLOG_ACTION_READ_CLEAR: +@@ -1527,47 +1660,45 @@ int do_syslog(int type, char __user *buf + break; + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: +- logbuf_lock_irq(); +- if (syslog_seq < log_first_seq) { +- /* messages are gone, move to first one */ +- syslog_seq = log_first_seq; +- syslog_idx = log_first_idx; +- syslog_partial = 0; +- } ++ msgbuf = kmalloc(PRINTK_RECORD_MAX, GFP_KERNEL); ++ if (!msgbuf) ++ return -ENOMEM; ++ ++ error = mutex_lock_interruptible(&syslog_lock); ++ if (error) ++ goto out; ++ + if (source == SYSLOG_FROM_PROC) { + /* + * Short-cut for poll(/"proc/kmsg") which simply checks + * for pending data, not the size; return the count of + * records, not the length. + */ +- error = log_next_seq - syslog_seq; ++ error = count_remaining(&syslog_iter, 0, msgbuf, ++ PRINTK_RECORD_MAX, true, ++ printk_time); + } else { +- u64 seq = syslog_seq; +- u32 idx = syslog_idx; +- bool time = syslog_partial ? 
syslog_time : printk_time; +- +- while (seq < log_next_seq) { +- struct printk_log *msg = log_from_idx(idx); +- +- error += msg_print_text(msg, true, time, NULL, +- 0); +- time = printk_time; +- idx = log_next(idx); +- seq++; +- } ++ error = count_remaining(&syslog_iter, 0, msgbuf, ++ PRINTK_RECORD_MAX, false, ++ printk_time); + error -= syslog_partial; + } +- logbuf_unlock_irq(); ++ ++ mutex_unlock(&syslog_lock); + break; + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: +- error = log_buf_len; ++ error = prb_buffer_size(&printk_rb); + break; + default: + error = -EINVAL; + break; + } +- ++out: ++ if (msgbuf) ++ kfree(msgbuf); ++ if (text) ++ kfree(text); + return error; + } + +@@ -1989,7 +2120,6 @@ EXPORT_SYMBOL(printk); + #define printk_time false + + static u64 syslog_seq; +-static u32 syslog_idx; + static u64 log_first_seq; + static u32 log_first_idx; + static u64 log_next_seq; diff --git a/kernel/patches-5.4.x-rt/0041-0024-printk-implement-kmsg_dump.patch b/kernel/patches-5.4.x-rt/0041-0024-printk-implement-kmsg_dump.patch new file mode 100644 index 000000000..4de007640 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0041-0024-printk-implement-kmsg_dump.patch @@ -0,0 +1,397 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:30:02 +0100 +Subject: [PATCH 24/25] printk: implement kmsg_dump + +Since printk messages are now logged to a new ring buffer, update +the kmsg_dump functions to pull the messages from there. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/kmsg_dump.h | 6 - + kernel/printk/printk.c | 258 ++++++++++++++++++++++++---------------------- + 2 files changed, 139 insertions(+), 125 deletions(-) + +--- a/include/linux/kmsg_dump.h ++++ b/include/linux/kmsg_dump.h +@@ -46,10 +46,8 @@ struct kmsg_dumper { + bool registered; + + /* private state of the kmsg iterator */ +- u32 cur_idx; +- u32 next_idx; +- u64 cur_seq; +- u64 next_seq; ++ u64 line_seq; ++ u64 buffer_end_seq; + }; + + #ifdef CONFIG_PRINTK +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -417,13 +417,13 @@ static size_t syslog_partial; + static bool syslog_time; + + /* index and sequence number of the first record stored in the buffer */ +-static u64 log_first_seq; + static u32 log_first_idx; + + /* index and sequence number of the next record to store in the buffer */ +-static u64 log_next_seq; + static u32 log_next_idx; + ++static DEFINE_MUTEX(kmsg_dump_lock); ++ + /* the next printk record to read after the last 'clear' command */ + static u64 clear_seq; + static u32 clear_idx; +@@ -470,38 +470,6 @@ static char *log_dict(const struct print + return (char *)msg + sizeof(struct printk_log) + msg->text_len; + } + +-/* get record by index; idx must point to valid msg */ +-static struct printk_log *log_from_idx(u32 idx) +-{ +- struct printk_log *msg = (struct printk_log *)(log_buf + idx); +- +- /* +- * A length == 0 record is the end of buffer marker. Wrap around and +- * read the message at the start of the buffer. +- */ +- if (!msg->len) +- return (struct printk_log *)log_buf; +- return msg; +-} +- +-/* get next record; idx must point to valid msg */ +-static u32 log_next(u32 idx) +-{ +- struct printk_log *msg = (struct printk_log *)(log_buf + idx); +- +- /* length == 0 indicates the end of the buffer; wrap */ +- /* +- * A length == 0 record is the end of buffer marker. Wrap around and +- * read the message at the start of the buffer as *this* one, and +- * return the one after that. 
+- */ +- if (!msg->len) { +- msg = (struct printk_log *)log_buf; +- return msg->len; +- } +- return idx + msg->len; +-} +- + static void printk_emergency(char *buffer, int level, u64 ts_nsec, u16 cpu, + char *text, u16 text_len); + +@@ -2120,9 +2088,7 @@ EXPORT_SYMBOL(printk); + #define printk_time false + + static u64 syslog_seq; +-static u64 log_first_seq; + static u32 log_first_idx; +-static u64 log_next_seq; + static char *log_text(const struct printk_log *msg) { return NULL; } + static char *log_dict(const struct printk_log *msg) { return NULL; } + static struct printk_log *log_from_idx(u32 idx) { return NULL; } +@@ -3032,7 +2998,6 @@ module_param_named(always_kmsg_dump, alw + void kmsg_dump(enum kmsg_dump_reason reason) + { + struct kmsg_dumper *dumper; +- unsigned long flags; + + if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) + return; +@@ -3045,12 +3010,7 @@ void kmsg_dump(enum kmsg_dump_reason rea + /* initialize iterator with data about the stored records */ + dumper->active = true; + +- logbuf_lock_irqsave(flags); +- dumper->cur_seq = clear_seq; +- dumper->cur_idx = clear_idx; +- dumper->next_seq = log_next_seq; +- dumper->next_idx = log_next_idx; +- logbuf_unlock_irqrestore(flags); ++ kmsg_dump_rewind(dumper); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason); +@@ -3083,33 +3043,67 @@ void kmsg_dump(enum kmsg_dump_reason rea + bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) + { ++ struct prb_iterator iter; + struct printk_log *msg; +- size_t l = 0; +- bool ret = false; ++ struct prb_handle h; ++ bool cont = false; ++ char *msgbuf; ++ char *rbuf; ++ size_t l; ++ u64 seq; ++ int ret; + + if (!dumper->active) +- goto out; ++ return cont; ++ ++ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_RECORD_MAX); ++ if (!rbuf) ++ return cont; ++ msgbuf = rbuf; ++retry: ++ for (;;) { ++ prb_iter_init(&iter, &printk_rb, &seq); ++ ++ if (dumper->line_seq == seq) { ++ /* already where we want to be */ ++ break; ++ } else if (dumper->line_seq < seq) { ++ /* messages are gone, move to first available one */ ++ dumper->line_seq = seq; ++ break; ++ } + +- if (dumper->cur_seq < log_first_seq) { +- /* messages are gone, move to first available one */ +- dumper->cur_seq = log_first_seq; +- dumper->cur_idx = log_first_idx; ++ ret = prb_iter_seek(&iter, dumper->line_seq); ++ if (ret > 0) { ++ /* seeked to line_seq */ ++ break; ++ } else if (ret == 0) { ++ /* ++ * The end of the list was hit without ever seeing ++ * line_seq. Reset it to the beginning of the list. 
++ */ ++ prb_iter_init(&iter, &printk_rb, &dumper->line_seq); ++ break; ++ } ++ /* iterator invalid, start over */ + } + +- /* last entry */ +- if (dumper->cur_seq >= log_next_seq) ++ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, ++ &dumper->line_seq); ++ if (ret == 0) + goto out; ++ else if (ret < 0) ++ goto retry; + +- msg = log_from_idx(dumper->cur_idx); ++ msg = (struct printk_log *)msgbuf; + l = msg_print_text(msg, syslog, printk_time, line, size); + +- dumper->cur_idx = log_next(dumper->cur_idx); +- dumper->cur_seq++; +- ret = true; +-out: + if (len) + *len = l; +- return ret; ++ cont = true; ++out: ++ prb_commit(&h); ++ return cont; + } + + /** +@@ -3132,12 +3126,11 @@ bool kmsg_dump_get_line_nolock(struct km + bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) + { +- unsigned long flags; + bool ret; + +- logbuf_lock_irqsave(flags); ++ mutex_lock(&kmsg_dump_lock); + ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); +- logbuf_unlock_irqrestore(flags); ++ mutex_unlock(&kmsg_dump_lock); + + return ret; + } +@@ -3165,74 +3158,101 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) + { +- unsigned long flags; +- u64 seq; +- u32 idx; +- u64 next_seq; +- u32 next_idx; +- size_t l = 0; +- bool ret = false; ++ struct prb_iterator iter; + bool time = printk_time; ++ struct printk_log *msg; ++ u64 new_end_seq = 0; ++ struct prb_handle h; ++ bool cont = false; ++ char *msgbuf; ++ u64 end_seq; ++ int textlen; ++ u64 seq = 0; ++ char *rbuf; ++ int l = 0; ++ int ret; + + if (!dumper->active) +- goto out; ++ return cont; + +- logbuf_lock_irqsave(flags); +- if (dumper->cur_seq < log_first_seq) { +- /* messages are gone, move to first available one */ +- dumper->cur_seq = log_first_seq; +- dumper->cur_idx = log_first_idx; +- } ++ rbuf = prb_reserve(&h, &sprint_rb, PRINTK_RECORD_MAX); ++ if (!rbuf) ++ return cont; ++ msgbuf = rbuf; + +- /* last entry */ +- if (dumper->cur_seq >= dumper->next_seq) { +- logbuf_unlock_irqrestore(flags); +- goto out; +- } +- +- /* calculate length of entire buffer */ +- seq = dumper->cur_seq; +- idx = dumper->cur_idx; +- while (seq < dumper->next_seq) { +- struct printk_log *msg = log_from_idx(idx); ++ prb_iter_init(&iter, &printk_rb, NULL); + +- l += msg_print_text(msg, true, time, NULL, 0); +- idx = log_next(idx); +- seq++; ++ /* ++ * seek to the start record, which is set/modified ++ * by kmsg_dump_get_line_nolock() ++ */ ++ ret = prb_iter_seek(&iter, dumper->line_seq); ++ if (ret <= 0) ++ prb_iter_init(&iter, &printk_rb, &seq); ++ ++ /* work with a local end seq to have a constant value */ ++ end_seq = dumper->buffer_end_seq; ++ if (!end_seq) { ++ /* initialize end seq to "infinity" */ ++ end_seq = -1; ++ dumper->buffer_end_seq = end_seq; + } ++retry: ++ if (seq >= end_seq) ++ goto out; + +- /* move first record forward until length fits into the buffer */ +- seq = dumper->cur_seq; +- idx = dumper->cur_idx; +- while (l >= size && seq < dumper->next_seq) { +- struct printk_log *msg = log_from_idx(idx); ++ /* count the total bytes after seq */ ++ textlen = count_remaining(&iter, end_seq, msgbuf, ++ PRINTK_RECORD_MAX, 0, time); + +- l -= msg_print_text(msg, true, time, NULL, 0); +- idx = log_next(idx); +- seq++; ++ /* move iter forward until length fits into the buffer */ ++ while (textlen > size) { ++ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq); ++ if (ret == 0) { ++ break; ++ } 
else if (ret < 0) { ++ prb_iter_init(&iter, &printk_rb, &seq); ++ goto retry; ++ } ++ ++ msg = (struct printk_log *)msgbuf; ++ textlen -= msg_print_text(msg, true, time, NULL, 0); + } + +- /* last message in next interation */ +- next_seq = seq; +- next_idx = idx; ++ /* save end seq for the next interation */ ++ new_end_seq = seq + 1; + +- l = 0; +- while (seq < dumper->next_seq) { +- struct printk_log *msg = log_from_idx(idx); ++ /* copy messages to buffer */ ++ while (l < size) { ++ ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq); ++ if (ret == 0) { ++ break; ++ } else if (ret < 0) { ++ /* ++ * iterator (and thus also the start position) ++ * invalid, start over from beginning of list ++ */ ++ prb_iter_init(&iter, &printk_rb, NULL); ++ continue; ++ } + +- l += msg_print_text(msg, syslog, time, buf + l, size - l); +- idx = log_next(idx); +- seq++; ++ if (seq >= end_seq) ++ break; ++ ++ msg = (struct printk_log *)msgbuf; ++ textlen = msg_print_text(msg, syslog, time, buf + l, size - l); ++ if (textlen > 0) ++ l += textlen; ++ cont = true; + } + +- dumper->next_seq = next_seq; +- dumper->next_idx = next_idx; +- ret = true; +- logbuf_unlock_irqrestore(flags); +-out: +- if (len) ++ if (cont && len) + *len = l; +- return ret; ++out: ++ prb_commit(&h); ++ if (new_end_seq) ++ dumper->buffer_end_seq = new_end_seq; ++ return cont; + } + EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + +@@ -3248,10 +3268,8 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + */ + void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) + { +- dumper->cur_seq = clear_seq; +- dumper->cur_idx = clear_idx; +- dumper->next_seq = log_next_seq; +- dumper->next_idx = log_next_idx; ++ dumper->line_seq = 0; ++ dumper->buffer_end_seq = 0; + } + + /** +@@ -3264,11 +3282,9 @@ void kmsg_dump_rewind_nolock(struct kmsg + */ + void kmsg_dump_rewind(struct kmsg_dumper *dumper) + { +- unsigned long flags; +- +- logbuf_lock_irqsave(flags); ++ mutex_lock(&kmsg_dump_lock); + kmsg_dump_rewind_nolock(dumper); +- logbuf_unlock_irqrestore(flags); ++ mutex_unlock(&kmsg_dump_lock); + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + diff --git a/kernel/patches-5.4.x-rt/0042-0025-printk-remove-unused-code.patch b/kernel/patches-5.4.x-rt/0042-0025-printk-remove-unused-code.patch new file mode 100644 index 000000000..31dba1a77 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0042-0025-printk-remove-unused-code.patch @@ -0,0 +1,346 @@ +From: John Ogness +Date: Tue, 12 Feb 2019 15:30:03 +0100 +Subject: [PATCH 25/25] printk: remove unused code + +Code relating to the safe context and anything dealing with the +previous log buffer implementation is no longer in use. Remove it. 
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/internal.h | 41 ----------- + kernel/printk/printk.c | 161 ++++------------------------------------------- + lib/bust_spinlocks.c | 3 + 3 files changed, 16 insertions(+), 189 deletions(-) + delete mode 100644 kernel/printk/internal.h + +--- a/kernel/printk/internal.h ++++ /dev/null +@@ -1,41 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-or-later */ +-/* +- * internal.h - printk internal definitions +- */ +-#include +- +-#ifdef CONFIG_PRINTK +- +-#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff +-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x40000000 +-#define PRINTK_NMI_CONTEXT_MASK 0x80000000 +- +-extern raw_spinlock_t logbuf_lock; +- +-__printf(5, 0) +-int vprintk_store(int facility, int level, +- const char *dict, size_t dictlen, +- const char *fmt, va_list args); +- +-__printf(1, 0) int vprintk_default(const char *fmt, va_list args); +-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args); +- +-void defer_console_output(void); +- +-#else +- +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } +- +-/* +- * In !PRINTK builds we still export logbuf_lock spin_lock, console_sem +- * semaphore and some of console functions (console_unlock()/etc.), so +- * printk-safe must preserve the existing local IRQ guarantees. +- */ +-#endif /* CONFIG_PRINTK */ +- +-#define printk_safe_enter_irqsave(flags) local_irq_save(flags) +-#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags) +- +-#define printk_safe_enter_irq() local_irq_disable() +-#define printk_safe_exit_irq() local_irq_enable() +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -61,7 +61,6 @@ + + #include "console_cmdline.h" + #include "braille.h" +-#include "internal.h" + + int console_printk[5] = { + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ +@@ -366,41 +365,6 @@ struct printk_log { + #endif + ; + +-/* +- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken +- * within the scheduler's rq lock. It must be released before calling +- * console_unlock() or anything else that might wake up a process. +- */ +-DEFINE_RAW_SPINLOCK(logbuf_lock); +- +-/* +- * Helper macros to lock/unlock logbuf_lock and switch between +- * printk-safe/unsafe modes. 
+- */ +-#define logbuf_lock_irq() \ +- do { \ +- printk_safe_enter_irq(); \ +- raw_spin_lock(&logbuf_lock); \ +- } while (0) +- +-#define logbuf_unlock_irq() \ +- do { \ +- raw_spin_unlock(&logbuf_lock); \ +- printk_safe_exit_irq(); \ +- } while (0) +- +-#define logbuf_lock_irqsave(flags) \ +- do { \ +- printk_safe_enter_irqsave(flags); \ +- raw_spin_lock(&logbuf_lock); \ +- } while (0) +- +-#define logbuf_unlock_irqrestore(flags) \ +- do { \ +- raw_spin_unlock(&logbuf_lock); \ +- printk_safe_exit_irqrestore(flags); \ +- } while (0) +- + DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock); + + #ifdef CONFIG_PRINTK +@@ -410,23 +374,15 @@ DECLARE_STATIC_PRINTKRB(printk_rb, CONFI + static DEFINE_MUTEX(syslog_lock); + DECLARE_STATIC_PRINTKRB_ITER(syslog_iter, &printk_rb); + +-DECLARE_WAIT_QUEUE_HEAD(log_wait); +-/* the next printk record to read by syslog(READ) or /proc/kmsg */ ++/* the last printk record to read by syslog(READ) or /proc/kmsg */ + static u64 syslog_seq; + static size_t syslog_partial; + static bool syslog_time; + +-/* index and sequence number of the first record stored in the buffer */ +-static u32 log_first_idx; +- +-/* index and sequence number of the next record to store in the buffer */ +-static u32 log_next_idx; +- + static DEFINE_MUTEX(kmsg_dump_lock); + + /* the next printk record to read after the last 'clear' command */ + static u64 clear_seq; +-static u32 clear_idx; + + #ifdef CONFIG_PRINTK_CALLER + #define PREFIX_MAX 48 +@@ -438,24 +394,16 @@ static u32 clear_idx; + #define LOG_LEVEL(v) ((v) & 0x07) + #define LOG_FACILITY(v) ((v) >> 3 & 0xff) + +-/* record buffer */ +-#define LOG_ALIGN __alignof__(struct printk_log) +-#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +-#define LOG_BUF_LEN_MAX (u32)(1 << 31) +-static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); +-static char *log_buf = __log_buf; +-static u32 log_buf_len = __LOG_BUF_LEN; +- + /* Return log buffer address */ + char *log_buf_addr_get(void) + { +- return log_buf; ++ return printk_rb.buffer; + } + + /* Return log buffer size */ + u32 log_buf_len_get(void) + { +- return log_buf_len; ++ return (1 << printk_rb.size_bits); + } + + /* human readable text of the record */ +@@ -980,11 +928,6 @@ const struct file_operations kmsg_fops = + */ + void log_buf_vmcoreinfo_setup(void) + { +- VMCOREINFO_SYMBOL(log_buf); +- VMCOREINFO_SYMBOL(log_buf_len); +- VMCOREINFO_SYMBOL(log_first_idx); +- VMCOREINFO_SYMBOL(clear_idx); +- VMCOREINFO_SYMBOL(log_next_idx); + /* + * Export struct printk_log size and field offsets. User space tools can + * parse it and detect any changes to structure down the line. 
+@@ -1000,6 +943,8 @@ void log_buf_vmcoreinfo_setup(void) + } + #endif + ++/* FIXME: no support for buffer resizing */ ++#if 0 + /* requested log_buf_len from kernel cmdline */ + static unsigned long __initdata new_log_buf_len; + +@@ -1065,9 +1010,12 @@ static void __init log_buf_add_cpu(void) + #else /* !CONFIG_SMP */ + static inline void log_buf_add_cpu(void) {} + #endif /* CONFIG_SMP */ ++#endif /* 0 */ + + void __init setup_log_buf(int early) + { ++/* FIXME: no support for buffer resizing */ ++#if 0 + unsigned long flags; + char *new_log_buf; + unsigned int free; +@@ -1099,6 +1047,7 @@ void __init setup_log_buf(int early) + pr_info("log_buf_len: %u bytes\n", log_buf_len); + pr_info("early log buf free: %u(%u%%)\n", + free, (free * 100) / __LOG_BUF_LEN); ++#endif + } + + static bool __read_mostly ignore_loglevel; +@@ -2019,7 +1968,7 @@ asmlinkage int vprintk_emit(int facility + } + EXPORT_SYMBOL(vprintk_emit); + +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) ++static __printf(1, 0) int vprintk_func(const char *fmt, va_list args) + { + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + } +@@ -2080,31 +2029,6 @@ asmlinkage __visible int printk(const ch + return r; + } + EXPORT_SYMBOL(printk); +- +-#else /* CONFIG_PRINTK */ +- +-#define LOG_LINE_MAX 0 +-#define PREFIX_MAX 0 +-#define printk_time false +- +-static u64 syslog_seq; +-static u32 log_first_idx; +-static char *log_text(const struct printk_log *msg) { return NULL; } +-static char *log_dict(const struct printk_log *msg) { return NULL; } +-static struct printk_log *log_from_idx(u32 idx) { return NULL; } +-static u32 log_next(u32 idx) { return 0; } +-static ssize_t msg_print_ext_header(char *buf, size_t size, +- struct printk_log *msg, +- u64 seq) { return 0; } +-static ssize_t msg_print_ext_body(char *buf, size_t size, +- char *dict, size_t dict_len, +- char *text, size_t text_len) { return 0; } +-static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len, +- const char *text, size_t len, int level) {} +-static size_t msg_print_text(const struct printk_log *msg, bool syslog, +- bool time, char *buf, size_t size) { return 0; } +-static bool suppress_message_printing(int level) { return false; } +- + #endif /* CONFIG_PRINTK */ + + #ifdef CONFIG_EARLY_PRINTK +@@ -2401,15 +2325,10 @@ void console_unblank(void) + void console_flush_on_panic(enum con_flush_mode mode) + { + /* +- * If someone else is holding the console lock, trylock will fail +- * and may_schedule may be set. Ignore and proceed to unlock so +- * that messages are flushed out. As this can be called from any +- * context and we don't want to get preempted while flushing, +- * ensure may_schedule is cleared. ++ * FIXME: This is currently a NOP. Emergency messages will have been ++ * printed, but what about if write_atomic is not available on the ++ * console? What if the printk kthread is still alive? 
+ */ +- console_trylock(); +- console_may_schedule = 0; +- console_unlock(); + } + + /* +@@ -2758,43 +2677,6 @@ static int __init printk_late_init(void) + late_initcall(printk_late_init); + + #if defined CONFIG_PRINTK +-/* +- * Delayed printk version, for scheduler-internal messages: +- */ +-#define PRINTK_PENDING_WAKEUP 0x01 +-#define PRINTK_PENDING_OUTPUT 0x02 +- +-static DEFINE_PER_CPU(int, printk_pending); +- +-static void wake_up_klogd_work_func(struct irq_work *irq_work) +-{ +- int pending = __this_cpu_xchg(printk_pending, 0); +- +- if (pending & PRINTK_PENDING_OUTPUT) { +- /* If trylock fails, someone else is doing the printing */ +- if (console_trylock()) +- console_unlock(); +- } +- +- if (pending & PRINTK_PENDING_WAKEUP) +- wake_up_interruptible(&log_wait); +-} +- +-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { +- .func = wake_up_klogd_work_func, +- .flags = IRQ_WORK_LAZY, +-}; +- +-void wake_up_klogd(void) +-{ +- preempt_disable(); +- if (waitqueue_active(&log_wait)) { +- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); +- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); +- } +- preempt_enable(); +-} +- + static int printk_kthread_func(void *data) + { + struct prb_iterator iter; +@@ -2860,22 +2742,9 @@ static int __init init_printk_kthread(vo + } + late_initcall(init_printk_kthread); + +-void defer_console_output(void) +-{ +- preempt_disable(); +- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); +- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); +- preempt_enable(); +-} +- +-int vprintk_deferred(const char *fmt, va_list args) ++static int vprintk_deferred(const char *fmt, va_list args) + { +- int r; +- +- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); +- defer_console_output(); +- +- return r; ++ return vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); + } + + int printk_deferred(const char *fmt, ...) +--- a/lib/bust_spinlocks.c ++++ b/lib/bust_spinlocks.c +@@ -26,7 +26,6 @@ void bust_spinlocks(int yes) + unblank_screen(); + #endif + console_unblank(); +- if (--oops_in_progress == 0) +- wake_up_klogd(); ++ --oops_in_progress; + } + } diff --git a/kernel/patches-5.4.x-rt/0043-printk-set-deferred-to-default-loglevel-enforce-mask.patch b/kernel/patches-5.4.x-rt/0043-printk-set-deferred-to-default-loglevel-enforce-mask.patch new file mode 100644 index 000000000..5942464ff --- /dev/null +++ b/kernel/patches-5.4.x-rt/0043-printk-set-deferred-to-default-loglevel-enforce-mask.patch @@ -0,0 +1,38 @@ +From: John Ogness +Date: Thu, 14 Feb 2019 23:13:30 +0100 +Subject: [PATCH] printk: set deferred to default loglevel, enforce mask + +All messages printed via vpritnk_deferred() were being +automatically treated as emergency messages. + +Messages printed via vprintk_deferred() should be set to the +default loglevel. LOGLEVEL_SCHED is no longer relevant. + +Also, enforce the loglevel mask for emergency messages. 
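The mask this patch enforces (level & 7) relies on the syslog-style priority encoding already used in this series: the low three bits carry the severity, the upper bits the facility, exactly as the LOG_LEVEL()/LOG_FACILITY() macros in printk.c define it. A minimal userspace sketch of that split, illustration only, not the kernel code:

    #include <stdio.h>

    /* Same encoding as the LOG_LEVEL()/LOG_FACILITY() macros in printk.c:
     * low 3 bits = severity, remaining bits = facility. */
    #define LOG_LEVEL(v)    ((v) & 0x07)
    #define LOG_FACILITY(v) ((v) >> 3 & 0xff)

    int main(void)
    {
        /* e.g. a record carrying facility 1, severity 3: 1 << 3 | 3 */
        unsigned int prival = (1 << 3) | 3;

        /* masking with 0x7 keeps only the severity bits, so a facility
         * component can never be misread as an emergency loglevel */
        printf("level=%u facility=%u\n", LOG_LEVEL(prival), LOG_FACILITY(prival));
        return 0;
    }
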
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1951,7 +1951,7 @@ asmlinkage int vprintk_emit(int facility + * - text points to beginning of text + * - there is room before text for prefix + */ +- printk_emergency(rbuf, level, ts_nsec, cpu, text, text_len); ++ printk_emergency(rbuf, level & 7, ts_nsec, cpu, text, text_len); + + if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) { + cont_add(ctx, cpu, caller_id, facility, level, lflags, text, text_len); +@@ -2744,7 +2744,7 @@ late_initcall(init_printk_kthread); + + static int vprintk_deferred(const char *fmt, va_list args) + { +- return vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); ++ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); + } + + int printk_deferred(const char *fmt, ...) diff --git a/kernel/patches-5.4.x-rt/0044-serial-8250-remove-that-trylock-in-serial8250_consol.patch b/kernel/patches-5.4.x-rt/0044-serial-8250-remove-that-trylock-in-serial8250_consol.patch new file mode 100644 index 000000000..e84ce5a21 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0044-serial-8250-remove-that-trylock-in-serial8250_consol.patch @@ -0,0 +1,43 @@ +From: Sebastian Andrzej Siewior +Date: Thu, 14 Feb 2019 17:38:24 +0100 +Subject: [PATCH] serial: 8250: remove that trylock in + serial8250_console_write_atomic() + +This does not work as rtmutex in NMI context. As per John, it is not +needed. + +Signed-off-by: Sebastian Andrzej Siewior +--- + drivers/tty/serial/8250/8250_port.c | 11 ----------- + 1 file changed, 11 deletions(-) + +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -3191,17 +3191,9 @@ void serial8250_console_write_atomic(str + { + struct uart_port *port = &up->port; + unsigned int flags; +- bool locked; + + console_atomic_lock(&flags); + +- /* +- * If possible, keep any other CPUs from working with the +- * UART until the atomic message is completed. This helps +- * to keep the output more orderly. 
+- */ +- locked = spin_trylock(&port->lock); +- + touch_nmi_watchdog(); + + clear_ier(up); +@@ -3216,9 +3208,6 @@ void serial8250_console_write_atomic(str + wait_for_xmitr(up, BOTH_EMPTY); + restore_ier(up); + +- if (locked) +- spin_unlock(&port->lock); +- + console_atomic_unlock(flags); + } + diff --git a/kernel/patches-5.4.x-rt/0045-serial-8250-export-symbols-which-are-used-by-symbols.patch b/kernel/patches-5.4.x-rt/0045-serial-8250-export-symbols-which-are-used-by-symbols.patch new file mode 100644 index 000000000..b82fd3a5b --- /dev/null +++ b/kernel/patches-5.4.x-rt/0045-serial-8250-export-symbols-which-are-used-by-symbols.patch @@ -0,0 +1,38 @@ +From: Sebastian Andrzej Siewior +Date: Sat, 16 Feb 2019 09:02:00 +0100 +Subject: [PATCH] serial: 8250: export symbols which are used by symbols + +Signed-off-by: Sebastian Andrzej Siewior +--- + drivers/tty/serial/8250/8250_port.c | 2 ++ + kernel/printk/printk.c | 1 + + 2 files changed, 3 insertions(+) + +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -2025,6 +2025,7 @@ void clear_ier(struct uart_8250_port *up + } + console_atomic_unlock(flags); + } ++EXPORT_SYMBOL_GPL(clear_ier); + + void restore_ier(struct uart_8250_port *up) + { +@@ -2036,6 +2037,7 @@ void restore_ier(struct uart_8250_port * + serial_port_out(port, UART_IER, atomic_read(&ier_value)); + console_atomic_unlock(flags); + } ++EXPORT_SYMBOL_GPL(restore_ier); + + #ifdef CONFIG_CONSOLE_POLL + /* +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -2257,6 +2257,7 @@ int is_console_locked(void) + { + return console_locked; + } ++EXPORT_SYMBOL(is_console_locked); + + /** + * console_unlock - unlock the console system diff --git a/kernel/patches-5.4.x-rt/0046-arm-remove-printk_nmi_.patch b/kernel/patches-5.4.x-rt/0046-arm-remove-printk_nmi_.patch new file mode 100644 index 000000000..616948ff1 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0046-arm-remove-printk_nmi_.patch @@ -0,0 +1,25 @@ +From: Sebastian Andrzej Siewior +Date: Fri, 15 Feb 2019 14:34:20 +0100 +Subject: [PATCH] arm: remove printk_nmi_.*() + +It is no longer provided by the printk core code. + +Signed-off-by: Sebastian Andrzej Siewior +--- + arch/arm/kernel/smp.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/arch/arm/kernel/smp.c ++++ b/arch/arm/kernel/smp.c +@@ -682,11 +682,9 @@ void handle_IPI(int ipinr, struct pt_reg + break; + + case IPI_CPU_BACKTRACE: +- printk_nmi_enter(); + irq_enter(); + nmi_cpu_backtrace(regs); + irq_exit(); +- printk_nmi_exit(); + break; + + default: diff --git a/kernel/patches-5.4.x-rt/0047-printk-only-allow-kernel-to-emergency-message.patch b/kernel/patches-5.4.x-rt/0047-printk-only-allow-kernel-to-emergency-message.patch new file mode 100644 index 000000000..396335cd4 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0047-printk-only-allow-kernel-to-emergency-message.patch @@ -0,0 +1,67 @@ +From: John Ogness +Date: Sun, 17 Feb 2019 03:11:20 +0100 +Subject: [PATCH] printk: only allow kernel to emergency message + +Emergency messages exist as a mechanism for the kernel to +communicate critical information to users. It is not meant for +use by userspace. Only allow facility=0 messages to be +processed by the emergency message code. 
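A simplified sketch of the gate this patch describes, outside the kernel and with the threshold as a placeholder value: emergency output is considered only for records whose facility is 0, that is, records generated by the kernel itself, mirroring the level comparison already present in call_console_drivers().

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative eligibility test: kernel-generated (facility 0) records
     * below an emergency threshold; the threshold here is made up. */
    static bool emergency_eligible(int facility, int level, int emergency_loglevel)
    {
        return facility == 0 && level < emergency_loglevel;
    }

    int main(void)
    {
        printf("%d\n", emergency_eligible(0, 2, 5)); /* kernel record: 1 */
        printf("%d\n", emergency_eligible(1, 2, 5)); /* injected via /dev/kmsg: 0 */
        return 0;
    }
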
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1754,7 +1754,8 @@ static void printk_write_history(struct + * The console_lock must be held. + */ + static void call_console_drivers(u64 seq, const char *ext_text, size_t ext_len, +- const char *text, size_t len, int level) ++ const char *text, size_t len, int level, ++ int facility) + { + struct console *con; + +@@ -1774,13 +1775,14 @@ static void call_console_drivers(u64 seq + con->wrote_history = 1; + con->printk_seq = seq - 1; + } +- if (con->write_atomic && level < emergency_console_loglevel) { ++ if (con->write_atomic && level < emergency_console_loglevel && ++ facility == 0) { + /* skip emergency messages, already printed */ + if (con->printk_seq < seq) + con->printk_seq = seq; + continue; + } +- if (con->flags & CON_BOOT) { ++ if (con->flags & CON_BOOT && facility == 0) { + /* skip emergency messages, already printed */ + if (con->printk_seq < seq) + con->printk_seq = seq; +@@ -1951,7 +1953,10 @@ asmlinkage int vprintk_emit(int facility + * - text points to beginning of text + * - there is room before text for prefix + */ +- printk_emergency(rbuf, level & 7, ts_nsec, cpu, text, text_len); ++ if (facility == 0) { ++ /* only the kernel can create emergency messages */ ++ printk_emergency(rbuf, level & 7, ts_nsec, cpu, text, text_len); ++ } + + if ((lflags & LOG_CONT) || !(lflags & LOG_NEWLINE)) { + cont_add(ctx, cpu, caller_id, facility, level, lflags, text, text_len); +@@ -2715,8 +2720,8 @@ static int printk_kthread_func(void *dat + &len, printk_time); + + console_lock(); +- call_console_drivers(master_seq, ext_text, +- ext_len, text, len, msg->level); ++ call_console_drivers(master_seq, ext_text, ext_len, text, len, ++ msg->level, msg->facility); + if (len > 0 || ext_len > 0) + printk_delay(msg->level); + console_unlock(); diff --git a/kernel/patches-5.4.x-rt/0048-printk-devkmsg-llseek-reset-clear-if-it-is-lost.patch b/kernel/patches-5.4.x-rt/0048-printk-devkmsg-llseek-reset-clear-if-it-is-lost.patch new file mode 100644 index 000000000..6bb5e4506 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0048-printk-devkmsg-llseek-reset-clear-if-it-is-lost.patch @@ -0,0 +1,45 @@ +From: John Ogness +Date: Fri, 22 Feb 2019 23:02:44 +0100 +Subject: [PATCH] printk: devkmsg: llseek: reset clear if it is lost + +SEEK_DATA will seek to the last clear record. If this clear record +is no longer in the ring buffer, devkmsg_llseek() will go into an +infinite loop. Fix that by resetting the clear sequence if the old +clear record is no longer in the ring buffer. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -761,6 +761,7 @@ static loff_t devkmsg_llseek(struct file + { + struct devkmsg_user *user = file->private_data; + loff_t ret; ++ u64 seq; + + if (!user) + return -EBADF; +@@ -783,7 +784,7 @@ static loff_t devkmsg_llseek(struct file + * changes no global state, and does not clear anything. 
+ */ + for (;;) { +- prb_iter_init(&user->iter, &printk_rb, NULL); ++ prb_iter_init(&user->iter, &printk_rb, &seq); + ret = prb_iter_seek(&user->iter, clear_seq); + if (ret > 0) { + /* seeked to clear seq */ +@@ -800,6 +801,10 @@ static loff_t devkmsg_llseek(struct file + break; + } + /* iterator invalid, start over */ ++ ++ /* reset clear_seq if it is no longer available */ ++ if (seq > clear_seq) ++ clear_seq = 0; + } + ret = 0; + break; diff --git a/kernel/patches-5.4.x-rt/0049-printk-print-rate-limitted-message-as-info.patch b/kernel/patches-5.4.x-rt/0049-printk-print-rate-limitted-message-as-info.patch new file mode 100644 index 000000000..24a0a284e --- /dev/null +++ b/kernel/patches-5.4.x-rt/0049-printk-print-rate-limitted-message-as-info.patch @@ -0,0 +1,24 @@ +From: Sebastian Andrzej Siewior +Date: Fri, 22 Feb 2019 12:47:13 +0100 +Subject: [PATCH] printk: print "rate-limitted" message as info + +If messages which are injected via kmsg are dropped then they don't need +to be printed as warnings. This is to avoid latency spikes if the +interface decides to print a lot of important messages. + +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/ratelimit.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/linux/ratelimit.h ++++ b/include/linux/ratelimit.h +@@ -59,7 +59,7 @@ static inline void ratelimit_state_exit( + return; + + if (rs->missed) { +- pr_warn("%s: %d output lines suppressed due to ratelimiting\n", ++ pr_info("%s: %d output lines suppressed due to ratelimiting\n", + current->comm, rs->missed); + rs->missed = 0; + } diff --git a/kernel/patches-5.4.x-rt/0050-printk-kmsg_dump-remove-mutex-usage.patch b/kernel/patches-5.4.x-rt/0050-printk-kmsg_dump-remove-mutex-usage.patch new file mode 100644 index 000000000..6fa6ca680 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0050-printk-kmsg_dump-remove-mutex-usage.patch @@ -0,0 +1,84 @@ +From: John Ogness +Date: Wed, 24 Apr 2019 16:36:04 +0200 +Subject: [PATCH] printk: kmsg_dump: remove mutex usage + +The kmsg dumper can be called from any context, but the dumping +helpers were using a mutex to synchronize the iterator against +concurrent dumps. + +Rather than trying to synchronize the iterator, use a local copy +of the iterator during the dump. Then no synchronization is +required. 
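The idea is small enough to show standalone: if the iterator state is plain data, each dump can work on a stack copy of the registered dumper, so concurrent callers never mutate shared fields and no lock is needed around the iteration itself. A sketch under that assumption, using stand-in structures rather than the kernel ones:

    #include <stdio.h>
    #include <string.h>

    /* Stand-in for struct kmsg_dumper: only plain iterator state. */
    struct dumper {
        unsigned long long line_seq;
        unsigned long long buffer_end_seq;
        int active;
    };

    static void dump_one(const struct dumper *registered)
    {
        struct dumper local;

        /* private copy: the registered object is only ever read */
        memcpy(&local, registered, sizeof(local));
        local.active = 1;
        local.line_seq = 0;          /* rewinding affects the copy only */
        local.buffer_end_seq = 0;

        printf("dumping, starting at seq %llu\n", local.line_seq);
        /* ... iterate the ring buffer using 'local' ... */
    }

    int main(void)
    {
        struct dumper d = { .line_seq = 42, .buffer_end_seq = 7, .active = 0 };
        dump_one(&d);
        return 0;
    }
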
+ +Reported-by: Scott Wood +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 23 ++++++++++------------- + 1 file changed, 10 insertions(+), 13 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -379,8 +379,6 @@ static u64 syslog_seq; + static size_t syslog_partial; + static bool syslog_time; + +-static DEFINE_MUTEX(kmsg_dump_lock); +- + /* the next printk record to read after the last 'clear' command */ + static u64 clear_seq; + +@@ -2877,6 +2875,7 @@ module_param_named(always_kmsg_dump, alw + */ + void kmsg_dump(enum kmsg_dump_reason reason) + { ++ struct kmsg_dumper dumper_local; + struct kmsg_dumper *dumper; + + if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) +@@ -2887,16 +2886,18 @@ void kmsg_dump(enum kmsg_dump_reason rea + if (dumper->max_reason && reason > dumper->max_reason) + continue; + +- /* initialize iterator with data about the stored records */ +- dumper->active = true; ++ /* ++ * use a local copy to avoid modifying the ++ * iterator used by any other cpus/contexts ++ */ ++ memcpy(&dumper_local, dumper, sizeof(dumper_local)); + +- kmsg_dump_rewind(dumper); ++ /* initialize iterator with data about the stored records */ ++ dumper_local.active = true; ++ kmsg_dump_rewind(&dumper_local); + + /* invoke dumper which will iterate over records */ +- dumper->dump(dumper, reason); +- +- /* reset iterator */ +- dumper->active = false; ++ dumper_local.dump(&dumper_local, reason); + } + rcu_read_unlock(); + } +@@ -3008,9 +3009,7 @@ bool kmsg_dump_get_line(struct kmsg_dump + { + bool ret; + +- mutex_lock(&kmsg_dump_lock); + ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); +- mutex_unlock(&kmsg_dump_lock); + + return ret; + } +@@ -3162,9 +3161,7 @@ void kmsg_dump_rewind_nolock(struct kmsg + */ + void kmsg_dump_rewind(struct kmsg_dumper *dumper) + { +- mutex_lock(&kmsg_dump_lock); + kmsg_dump_rewind_nolock(dumper); +- mutex_unlock(&kmsg_dump_lock); + } + EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + diff --git a/kernel/patches-5.4.x-rt/0051-printk-devkmsg-read-Return-EPIPE-when-the-first-mess.patch b/kernel/patches-5.4.x-rt/0051-printk-devkmsg-read-Return-EPIPE-when-the-first-mess.patch new file mode 100644 index 000000000..0e283165c --- /dev/null +++ b/kernel/patches-5.4.x-rt/0051-printk-devkmsg-read-Return-EPIPE-when-the-first-mess.patch @@ -0,0 +1,43 @@ +From: He Zhe +Date: Tue, 24 Sep 2019 15:26:39 +0800 +Subject: [PATCH] printk: devkmsg: read: Return EPIPE when the first + message user-space wants has gone + +When user-space wants to read the first message, that is when user->seq +is 0, and that message has gone, it currently automatically resets +user->seq to current first seq. This mis-aligns with mainline kernel. + +https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/ABI/testing/dev-kmsg#n39 +https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/printk/printk.c#n899 + +We should inform user-space that what it wants has gone by returning EPIPE +in such scenario. 
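The check itself is a short invariant on sequence numbers. A minimal reader-side sketch with invented names: the reader advances the sequence it expects, and if the record actually returned is newer, the requested records were overwritten and the reader gets -EPIPE.

    #include <errno.h>
    #include <stdio.h>

    /* expected_seq: the reader's own bookkeeping; got_seq: the sequence
     * number of the record the ring buffer actually returned. */
    static int next_record_status(unsigned long long *expected_seq,
                                  unsigned long long got_seq)
    {
        (*expected_seq)++;
        if (*expected_seq < got_seq)
            return -EPIPE;   /* the record the reader wanted is gone */
        return 0;
    }

    int main(void)
    {
        unsigned long long seq = 0;

        /* first read, but the oldest surviving record is already seq 5 */
        printf("%d\n", next_record_status(&seq, 5));   /* negative: -EPIPE */
        return 0;
    }
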
+ +Link: https://lore.kernel.org/r/20190924072639.25986-1-zhe.he@windriver.com +Signed-off-by: He Zhe +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -713,14 +713,10 @@ static ssize_t devkmsg_read(struct file + goto out; + } + +- if (user->seq == 0) { +- user->seq = seq; +- } else { +- user->seq++; +- if (user->seq < seq) { +- ret = -EPIPE; +- goto restore_out; +- } ++ user->seq++; ++ if (user->seq < seq) { ++ ret = -EPIPE; ++ goto restore_out; + } + + msg = (struct printk_log *)&user->msgbuf[0]; diff --git a/kernel/patches-5.4.x-rt/0052-printk-handle-iterating-while-buffer-changing.patch b/kernel/patches-5.4.x-rt/0052-printk-handle-iterating-while-buffer-changing.patch new file mode 100644 index 000000000..1ce4ccb6b --- /dev/null +++ b/kernel/patches-5.4.x-rt/0052-printk-handle-iterating-while-buffer-changing.patch @@ -0,0 +1,43 @@ +From: John Ogness +Date: Mon, 7 Oct 2019 16:20:39 +0200 +Subject: [PATCH] printk: handle iterating while buffer changing + +The syslog and kmsg_dump readers are provided buffers to fill. +Both try to maximize the provided buffer usage by calculating the +maximum number of messages that can fit. However, if after the +calculation, messages are dropped and new messages added, the +calculation will no longer match. + +For syslog, add a check to make sure the provided buffer is not +overfilled. + +For kmsg_dump, start over by recalculating the messages +available. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1446,6 +1446,9 @@ static int syslog_print_all(char __user + break; + } + ++ if (len + textlen > size) ++ break; ++ + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else +@@ -3085,7 +3088,7 @@ bool kmsg_dump_get_buffer(struct kmsg_du + ret = prb_iter_next(&iter, msgbuf, PRINTK_RECORD_MAX, &seq); + if (ret == 0) { + break; +- } else if (ret < 0) { ++ } else if (ret < 0 || seq >= end_seq) { + prb_iter_init(&iter, &printk_rb, &seq); + goto retry; + } diff --git a/kernel/patches-5.4.x-rt/0053-printk-hack-out-emergency-loglevel-usage.patch b/kernel/patches-5.4.x-rt/0053-printk-hack-out-emergency-loglevel-usage.patch new file mode 100644 index 000000000..71cd11d5d --- /dev/null +++ b/kernel/patches-5.4.x-rt/0053-printk-hack-out-emergency-loglevel-usage.patch @@ -0,0 +1,52 @@ +From: John Ogness +Date: Tue, 3 Dec 2019 09:14:57 +0100 +Subject: [PATCH] printk: hack out emergency loglevel usage + +Instead of using an emergency loglevel to determine if atomic +messages should be printed, use oops_in_progress. This conforms +to the decision that latency-causing atomic messages never be +generated during normal operation. 
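A small sketch of the reworked gate, illustration only: instead of comparing each message's level against an emergency threshold, atomic console output is keyed off a single oops-in-progress flag, so nothing latency-sensitive happens during normal operation.

    #include <stdbool.h>

    /* set by the oops/panic path in the real kernel; a plain int here */
    static int oops_in_progress;

    static bool can_write_atomic(bool console_has_write_atomic)
    {
        return console_has_write_atomic && oops_in_progress;
    }

    int main(void)
    {
        if (can_write_atomic(true))      /* normal operation: false */
            return 1;
        oops_in_progress = 1;            /* crash path */
        return can_write_atomic(true) ? 0 : 1;
    }
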
+ +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/printk/printk.c | 13 +++---------- + 1 file changed, 3 insertions(+), 10 deletions(-) + +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -1777,15 +1777,8 @@ static void call_console_drivers(u64 seq + con->wrote_history = 1; + con->printk_seq = seq - 1; + } +- if (con->write_atomic && level < emergency_console_loglevel && +- facility == 0) { +- /* skip emergency messages, already printed */ +- if (con->printk_seq < seq) +- con->printk_seq = seq; +- continue; +- } + if (con->flags & CON_BOOT && facility == 0) { +- /* skip emergency messages, already printed */ ++ /* skip boot messages, already printed */ + if (con->printk_seq < seq) + con->printk_seq = seq; + continue; +@@ -3171,7 +3164,7 @@ static bool console_can_emergency(int le + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; +- if (con->write_atomic && level < emergency_console_loglevel) ++ if (con->write_atomic && oops_in_progress) + return true; + if (con->write && (con->flags & CON_BOOT)) + return true; +@@ -3187,7 +3180,7 @@ static void call_emergency_console_drive + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; +- if (con->write_atomic && level < emergency_console_loglevel) { ++ if (con->write_atomic && oops_in_progress) { + con->write_atomic(con, text, text_len); + continue; + } diff --git a/kernel/patches-5.4.x-rt/0054-serial-8250-only-atomic-lock-for-console.patch b/kernel/patches-5.4.x-rt/0054-serial-8250-only-atomic-lock-for-console.patch new file mode 100644 index 000000000..9abc5c608 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0054-serial-8250-only-atomic-lock-for-console.patch @@ -0,0 +1,384 @@ +From: John Ogness +Date: Fri, 10 Jan 2020 16:45:31 +0106 +Subject: [PATCH] serial: 8250: only atomic lock for console + +The atomic console implementation requires that IER is synchronized +between atomic and non-atomic usage. However, it was implemented such +that the console_atomic_lock was performed for all IER access, even +if that port was not a console. + +The implementation also used a usage counter to keep track of IER +clear/restore windows. However, this is not needed because the +console_atomic_lock synchronization of IER access with prevent any +situations where IER is prematurely restored or left cleared. + +Move the IER access functions to inline macros. They will only +console_atomic_lock if the port is a console. Remove the +restore_ier() function by having clear_ier() return the prior IER +value so that the caller can restore it using set_ier(). Rename the +IER access functions to match other 8250 wrapper macros. 
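The resulting call pattern is the classic save/modify/restore shape: the clear helper returns the previous IER value and the caller writes it back when done, taking console_atomic_lock only when the port really is a console. A toy sketch of that shape against a fake register, not the 8250 driver:

    #include <stdio.h>

    static unsigned char fake_ier;     /* stands in for the UART_IER register */

    static unsigned char clear_ier_save(void)
    {
        unsigned char prior = fake_ier;
        fake_ier = 0;                  /* mask interrupts for the duration */
        return prior;
    }

    static void set_ier(unsigned char value)
    {
        fake_ier = value;
    }

    int main(void)
    {
        unsigned char saved;

        fake_ier = 0x0f;
        saved = clear_ier_save();      /* like serial8250_clear_IER(up) */
        /* ... emit the console message with interrupts masked ... */
        set_ier(saved);                /* like serial8250_set_IER(up, ier) */
        printf("IER restored to %#x\n", fake_ier);
        return 0;
    }
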
+ +Suggested-by: Dick Hollenbeck +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + drivers/tty/serial/8250/8250.h | 65 +++++++++++++++++++--------- + drivers/tty/serial/8250/8250_core.c | 6 +- + drivers/tty/serial/8250/8250_dma.c | 4 - + drivers/tty/serial/8250/8250_port.c | 81 ++++++++---------------------------- + 4 files changed, 66 insertions(+), 90 deletions(-) + +--- a/drivers/tty/serial/8250/8250.h ++++ b/drivers/tty/serial/8250/8250.h +@@ -96,10 +96,6 @@ struct serial8250_config { + #define SERIAL8250_SHARE_IRQS 0 + #endif + +-void set_ier(struct uart_8250_port *up, unsigned char ier); +-void clear_ier(struct uart_8250_port *up); +-void restore_ier(struct uart_8250_port *up); +- + #define SERIAL8250_PORT_FLAGS(_base, _irq, _flags) \ + { \ + .iobase = _base, \ +@@ -134,39 +130,64 @@ static inline void serial_dl_write(struc + up->dl_write(up, value); + } + +-static inline bool serial8250_set_THRI(struct uart_8250_port *up) ++static inline void serial8250_set_IER(struct uart_8250_port *up, ++ unsigned char ier) + { +- if (up->ier & UART_IER_THRI) +- return false; +- up->ier |= UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); +- return true; ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ serial_out(up, UART_IER, ier); ++ ++ if (is_console) ++ console_atomic_unlock(flags); + } + +-static inline bool serial8250_set_THRI_sier(struct uart_8250_port *up) ++static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) + { +- if (up->ier & UART_IER_THRI) +- return false; +- up->ier |= UART_IER_THRI; +- set_ier(up, up->ier); +- return true; ++ struct uart_port *port = &up->port; ++ unsigned int clearval = 0; ++ unsigned int prior; ++ unsigned int flags; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ clearval = UART_IER_UUE; ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ prior = serial_port_in(port, UART_IER); ++ serial_port_out(port, UART_IER, clearval); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++ ++ return prior; + } + +-static inline bool serial8250_clear_THRI(struct uart_8250_port *up) ++static inline bool serial8250_set_THRI(struct uart_8250_port *up) + { +- if (!(up->ier & UART_IER_THRI)) ++ if (up->ier & UART_IER_THRI) + return false; +- up->ier &= ~UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ up->ier |= UART_IER_THRI; ++ serial8250_set_IER(up, up->ier); + return true; + } + +-static inline bool serial8250_clear_THRI_sier(struct uart_8250_port *up) ++static inline bool serial8250_clear_THRI(struct uart_8250_port *up) + { + if (!(up->ier & UART_IER_THRI)) + return false; + up->ier &= ~UART_IER_THRI; +- set_ier(up, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +--- a/drivers/tty/serial/8250/8250_core.c ++++ b/drivers/tty/serial/8250/8250_core.c +@@ -265,7 +265,7 @@ static void serial8250_timeout(struct ti + static void serial8250_backup_timeout(struct timer_list *t) + { + struct uart_8250_port *up = from_timer(up, t, timer); +- unsigned int iir, lsr; ++ unsigned int iir, ier = 0, lsr; + unsigned long flags; + + spin_lock_irqsave(&up->port.lock, flags); +@@ -275,7 +275,7 @@ static void serial8250_backup_timeout(st + * based handler. 
+ */ + if (up->port.irq) +- clear_ier(up); ++ ier = serial8250_clear_IER(up); + + iir = serial_in(up, UART_IIR); + +@@ -298,7 +298,7 @@ static void serial8250_backup_timeout(st + serial8250_tx_chars(up); + + if (up->port.irq) +- restore_ier(up); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +--- a/drivers/tty/serial/8250/8250_dma.c ++++ b/drivers/tty/serial/8250/8250_dma.c +@@ -35,7 +35,7 @@ static void __dma_tx_complete(void *para + + ret = serial8250_tx_dma(p); + if (ret) +- serial8250_set_THRI_sier(p); ++ serial8250_set_THRI(p); + + spin_unlock_irqrestore(&p->port.lock, flags); + } +@@ -98,7 +98,7 @@ int serial8250_tx_dma(struct uart_8250_p + dma_async_issue_pending(dma->txchan); + if (dma->tx_err) { + dma->tx_err = 0; +- serial8250_clear_THRI_sier(p); ++ serial8250_clear_THRI(p); + } + return 0; + err: +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -721,7 +721,7 @@ static void serial8250_set_sleep(struct + serial_out(p, UART_EFR, UART_EFR_ECB); + serial_out(p, UART_LCR, 0); + } +- set_ier(p, sleep ? UART_IERX_SLEEP : 0); ++ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0); + if (p->capabilities & UART_CAP_EFR) { + serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); + serial_out(p, UART_EFR, efr); +@@ -1390,7 +1390,7 @@ static void serial8250_stop_rx(struct ua + + up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); + up->port.read_status_mask &= ~UART_LSR_DR; +- set_ier(up, up->ier); ++ serial8250_set_IER(up, up->ier); + + serial8250_rpm_put(up); + } +@@ -1408,7 +1408,7 @@ static void __do_stop_tx_rs485(struct ua + serial8250_clear_and_reinit_fifos(p); + + p->ier |= UART_IER_RLSI | UART_IER_RDI; +- set_ier(p, p->ier); ++ serial8250_set_IER(p, p->ier); + } + } + static enum hrtimer_restart serial8250_em485_handle_stop_tx(struct hrtimer *t) +@@ -1459,7 +1459,7 @@ static void __stop_tx_rs485(struct uart_ + + static inline void __do_stop_tx(struct uart_8250_port *p) + { +- if (serial8250_clear_THRI_sier(p)) ++ if (serial8250_clear_THRI(p)) + serial8250_rpm_put_tx(p); + } + +@@ -1509,7 +1509,7 @@ static inline void __start_tx(struct uar + if (up->dma && !up->dma->tx_dma(up)) + return; + +- if (serial8250_set_THRI_sier(up)) { ++ if (serial8250_set_THRI(up)) { + if (up->bugs & UART_BUG_TXEN) { + unsigned char lsr; + +@@ -1616,7 +1616,7 @@ static void serial8250_disable_ms(struct + mctrl_gpio_disable_ms(up->gpios); + + up->ier &= ~UART_IER_MSI; +- set_ier(up, up->ier); ++ serial8250_set_IER(up, up->ier); + } + + static void serial8250_enable_ms(struct uart_port *port) +@@ -1632,7 +1632,7 @@ static void serial8250_enable_ms(struct + up->ier |= UART_IER_MSI; + + serial8250_rpm_get(up); +- set_ier(up, up->ier); ++ serial8250_set_IER(up, up->ier); + serial8250_rpm_put(up); + } + +@@ -1991,54 +1991,6 @@ static void wait_for_xmitr(struct uart_8 + } + } + +-static atomic_t ier_counter = ATOMIC_INIT(0); +-static atomic_t ier_value = ATOMIC_INIT(0); +- +-void set_ier(struct uart_8250_port *up, unsigned char ier) +-{ +- struct uart_port *port = &up->port; +- unsigned int flags; +- +- console_atomic_lock(&flags); +- if (atomic_read(&ier_counter) > 0) +- atomic_set(&ier_value, ier); +- else +- serial_port_out(port, UART_IER, ier); +- console_atomic_unlock(flags); +-} +- +-void clear_ier(struct uart_8250_port *up) +-{ +- struct uart_port *port = &up->port; +- unsigned int ier_cleared = 0; +- unsigned int flags; +- unsigned int ier; +- +- console_atomic_lock(&flags); +- atomic_inc(&ier_counter); +- ier = serial_port_in(port, UART_IER); +- 
if (up->capabilities & UART_CAP_UUE) +- ier_cleared = UART_IER_UUE; +- if (ier != ier_cleared) { +- serial_port_out(port, UART_IER, ier_cleared); +- atomic_set(&ier_value, ier); +- } +- console_atomic_unlock(flags); +-} +-EXPORT_SYMBOL_GPL(clear_ier); +- +-void restore_ier(struct uart_8250_port *up) +-{ +- struct uart_port *port = &up->port; +- unsigned int flags; +- +- console_atomic_lock(&flags); +- if (atomic_fetch_dec(&ier_counter) == 1) +- serial_port_out(port, UART_IER, atomic_read(&ier_value)); +- console_atomic_unlock(flags); +-} +-EXPORT_SYMBOL_GPL(restore_ier); +- + #ifdef CONFIG_CONSOLE_POLL + /* + * Console polling routines for writing and reading from the uart while +@@ -2070,10 +2022,11 @@ static int serial8250_get_poll_char(stru + static void serial8250_put_poll_char(struct uart_port *port, + unsigned char c) + { ++ unsigned int ier; + struct uart_8250_port *up = up_to_u8250p(port); + + serial8250_rpm_get(up); +- clear_ier(up); ++ ier = serial8250_clear_IER(up); + + wait_for_xmitr(up, BOTH_EMPTY); + /* +@@ -2086,7 +2039,7 @@ static void serial8250_put_poll_char(str + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); +- restore_ier(up); ++ serial8250_set_IER(up, ier); + serial8250_rpm_put(up); + } + +@@ -2398,7 +2351,7 @@ void serial8250_do_shutdown(struct uart_ + */ + spin_lock_irqsave(&port->lock, flags); + up->ier = 0; +- set_ier(up, 0); ++ serial8250_set_IER(up, 0); + spin_unlock_irqrestore(&port->lock, flags); + + synchronize_irq(port->irq); +@@ -2683,7 +2636,7 @@ serial8250_do_set_termios(struct uart_po + if (up->capabilities & UART_CAP_RTOIE) + up->ier |= UART_IER_RTOIE; + +- set_ier(up, up->ier); ++ serial8250_set_IER(up, up->ier); + + if (up->capabilities & UART_CAP_EFR) { + unsigned char efr = 0; +@@ -3193,12 +3146,13 @@ void serial8250_console_write_atomic(str + { + struct uart_port *port = &up->port; + unsigned int flags; ++ unsigned int ier; + + console_atomic_lock(&flags); + + touch_nmi_watchdog(); + +- clear_ier(up); ++ ier = serial8250_clear_IER(up); + + if (atomic_fetch_inc(&up->console_printing)) { + uart_console_write(port, "\n", 1, +@@ -3208,7 +3162,7 @@ void serial8250_console_write_atomic(str + atomic_dec(&up->console_printing); + + wait_for_xmitr(up, BOTH_EMPTY); +- restore_ier(up); ++ serial8250_set_IER(up, ier); + + console_atomic_unlock(flags); + } +@@ -3224,13 +3178,14 @@ void serial8250_console_write(struct uar + { + struct uart_port *port = &up->port; + unsigned long flags; ++ unsigned int ier; + + touch_nmi_watchdog(); + + serial8250_rpm_get(up); + spin_lock_irqsave(&port->lock, flags); + +- clear_ier(up); ++ ier = serial8250_clear_IER(up); + + /* check scratch reg to see if port powered off during system sleep */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { +@@ -3247,7 +3202,7 @@ void serial8250_console_write(struct uar + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); +- restore_ier(up); ++ serial8250_set_IER(up, ier); + + /* + * The receive handling will happen properly because the diff --git a/kernel/patches-5.4.x-rt/0055-serial-8250-fsl-ingenic-mtk-fix-atomic-console.patch b/kernel/patches-5.4.x-rt/0055-serial-8250-fsl-ingenic-mtk-fix-atomic-console.patch new file mode 100644 index 000000000..6db09fe03 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0055-serial-8250-fsl-ingenic-mtk-fix-atomic-console.patch @@ -0,0 +1,102 @@ +From: John Ogness +Date: Fri, 10 Jan 2020 16:45:32 +0106 +Subject: [PATCH] serial: 8250: fsl/ingenic/mtk: fix atomic console + +A few 8250 implementations have their own IER 
access. If the port +is a console, wrap the accesses with console_atomic_lock. + +Signed-off-by: John Ogness +Signed-off-by: Sebastian Andrzej Siewior +--- + drivers/tty/serial/8250/8250_fsl.c | 9 +++++++++ + drivers/tty/serial/8250/8250_ingenic.c | 7 +++++++ + drivers/tty/serial/8250/8250_mtk.c | 29 +++++++++++++++++++++++++++-- + 3 files changed, 43 insertions(+), 2 deletions(-) + +--- a/drivers/tty/serial/8250/8250_fsl.c ++++ b/drivers/tty/serial/8250/8250_fsl.c +@@ -57,9 +57,18 @@ int fsl8250_handle_irq(struct uart_port + + /* Stop processing interrupts on input overrun */ + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { ++ unsigned int ca_flags; + unsigned long delay; ++ bool is_console; + ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&ca_flags); + up->ier = port->serial_in(port, UART_IER); ++ if (is_console) ++ console_atomic_unlock(ca_flags); ++ + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +--- a/drivers/tty/serial/8250/8250_ingenic.c ++++ b/drivers/tty/serial/8250/8250_ingenic.c +@@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic + + static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + { ++ unsigned int flags; ++ bool is_console; + int ier; + + switch (offset) { +@@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(stru + * If we have enabled modem status IRQs we should enable + * modem mode. + */ ++ is_console = uart_console(p); ++ if (is_console) ++ console_atomic_lock(&flags); + ier = p->serial_in(p, UART_IER); ++ if (is_console) ++ console_atomic_unlock(flags); + + if (ier & UART_IER_MSI) + value |= UART_MCR_MDCE | UART_MCR_FCM; +--- a/drivers/tty/serial/8250/8250_mtk.c ++++ b/drivers/tty/serial/8250/8250_mtk.c +@@ -212,12 +212,37 @@ static void mtk8250_shutdown(struct uart + + static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ unsigned int ier; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier & (~mask)); ++ ++ if (is_console) ++ console_atomic_unlock(flags); + } + + static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ unsigned int ier; ++ ++ if (uart_console(port)) ++ console_atomic_lock(&flags); ++ ++ ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier | mask); ++ ++ if (uart_console(port)) ++ console_atomic_unlock(flags); + } + + static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) diff --git a/kernel/patches-5.4.x-rt/0056-0001-locking-percpu-rwsem-lockdep-Make-percpu-rwsem-use-i.patch b/kernel/patches-5.4.x-rt/0056-0001-locking-percpu-rwsem-lockdep-Make-percpu-rwsem-use-i.patch new file mode 100644 index 000000000..9d8dbcd73 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0056-0001-locking-percpu-rwsem-lockdep-Make-percpu-rwsem-use-i.patch @@ -0,0 +1,217 @@ +From: Peter Zijlstra +Date: Fri, 31 Jan 2020 16:07:04 +0100 +Subject: [PATCH 1/7] locking/percpu-rwsem, lockdep: Make percpu-rwsem use its + own lockdep_map + +As preparation for replacing the embedded rwsem, give percpu-rwsem its +own lockdep_map. 
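The only non-obvious piece is how the extra field reaches the static initializer: a helper macro expands to a designated initializer when lock debugging is configured and to nothing otherwise. A compile-and-run sketch of that pattern with made-up names (the real macro is __PERCPU_RWSEM_DEP_MAP_INIT):

    #include <stdio.h>

    #define MY_DEBUG 1                 /* stands in for CONFIG_DEBUG_LOCK_ALLOC */

    struct dep_map { const char *name; };

    struct my_lock {
        int state;
    #if MY_DEBUG
        struct dep_map dep_map;
    #endif
    };

    #if MY_DEBUG
    #define MY_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname },
    #else
    #define MY_DEP_MAP_INIT(lockname)
    #endif

    #define DEFINE_MY_LOCK(name)  \
        struct my_lock name = {   \
            .state = 0,           \
            MY_DEP_MAP_INIT(name) \
        }

    static DEFINE_MY_LOCK(test_lock);

    int main(void)
    {
    #if MY_DEBUG
        printf("%s\n", test_lock.dep_map.name);   /* prints "test_lock" */
    #endif
        return 0;
    }
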
+ +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Juri Lelli +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/percpu-rwsem.h | 29 +++++++++++++++++++---------- + kernel/cpu.c | 4 ++-- + kernel/locking/percpu-rwsem.c | 16 ++++++++++++---- + kernel/locking/rwsem.c | 4 ++-- + kernel/locking/rwsem.h | 2 ++ + 5 files changed, 37 insertions(+), 18 deletions(-) + +--- a/include/linux/percpu-rwsem.h ++++ b/include/linux/percpu-rwsem.h +@@ -15,8 +15,17 @@ struct percpu_rw_semaphore { + struct rw_semaphore rw_sem; /* slowpath */ + struct rcuwait writer; /* blocked writer */ + int readers_block; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif + }; + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }, ++#else ++#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) ++#endif ++ + #define __DEFINE_PERCPU_RWSEM(name, is_static) \ + static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ + is_static struct percpu_rw_semaphore name = { \ +@@ -24,7 +33,9 @@ is_static struct percpu_rw_semaphore nam + .read_count = &__percpu_rwsem_rc_##name, \ + .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ + .writer = __RCUWAIT_INITIALIZER(name.writer), \ ++ __PERCPU_RWSEM_DEP_MAP_INIT(name) \ + } ++ + #define DEFINE_PERCPU_RWSEM(name) \ + __DEFINE_PERCPU_RWSEM(name, /* not static */) + #define DEFINE_STATIC_PERCPU_RWSEM(name) \ +@@ -37,7 +48,7 @@ static inline void percpu_down_read(stru + { + might_sleep(); + +- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 0, _RET_IP_); ++ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + preempt_disable(); + /* +@@ -76,13 +87,15 @@ static inline int percpu_down_read_trylo + */ + + if (ret) +- rwsem_acquire_read(&sem->rw_sem.dep_map, 0, 1, _RET_IP_); ++ rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + + return ret; + } + + static inline void percpu_up_read(struct percpu_rw_semaphore *sem) + { ++ rwsem_release(&sem->dep_map, 1, _RET_IP_); ++ + preempt_disable(); + /* + * Same as in percpu_down_read(). 
+@@ -92,8 +105,6 @@ static inline void percpu_up_read(struct + else + __percpu_up_read(sem); /* Unconditional memory barrier */ + preempt_enable(); +- +- rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); + } + + extern void percpu_down_write(struct percpu_rw_semaphore *); +@@ -110,15 +121,13 @@ extern void percpu_free_rwsem(struct per + __percpu_init_rwsem(sem, #sem, &rwsem_key); \ + }) + +-#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) +- +-#define percpu_rwsem_assert_held(sem) \ +- lockdep_assert_held(&(sem)->rw_sem) ++#define percpu_rwsem_is_held(sem) lockdep_is_held(sem) ++#define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem) + + static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) + { +- lock_release(&sem->rw_sem.dep_map, 1, ip); ++ lock_release(&sem->dep_map, 1, ip); + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER + if (!read) + atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN); +@@ -128,7 +137,7 @@ static inline void percpu_rwsem_release( + static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) + { +- lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip); ++ lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip); + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER + if (!read) + atomic_long_set(&sem->rw_sem.owner, (long)current); +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -331,12 +331,12 @@ void lockdep_assert_cpus_held(void) + + static void lockdep_acquire_cpus_lock(void) + { +- rwsem_acquire(&cpu_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); ++ rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); + } + + static void lockdep_release_cpus_lock(void) + { +- rwsem_release(&cpu_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); ++ rwsem_release(&cpu_hotplug_lock.dep_map, 1, _THIS_IP_); + } + + /* +--- a/kernel/locking/percpu-rwsem.c ++++ b/kernel/locking/percpu-rwsem.c +@@ -11,7 +11,7 @@ + #include "rwsem.h" + + int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, +- const char *name, struct lock_class_key *rwsem_key) ++ const char *name, struct lock_class_key *key) + { + sem->read_count = alloc_percpu(int); + if (unlikely(!sem->read_count)) +@@ -19,9 +19,13 @@ int __percpu_init_rwsem(struct percpu_rw + + /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ + rcu_sync_init(&sem->rss); +- __init_rwsem(&sem->rw_sem, name, rwsem_key); ++ init_rwsem(&sem->rw_sem); + rcuwait_init(&sem->writer); + sem->readers_block = 0; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); ++ lockdep_init_map(&sem->dep_map, name, key, 0); ++#endif + return 0; + } + EXPORT_SYMBOL_GPL(__percpu_init_rwsem); +@@ -142,10 +146,12 @@ static bool readers_active_check(struct + + void percpu_down_write(struct percpu_rw_semaphore *sem) + { ++ rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); ++ + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + +- down_write(&sem->rw_sem); ++ __down_write(&sem->rw_sem); + + /* + * Notify new readers to block; up until now, and thus throughout the +@@ -168,6 +174,8 @@ EXPORT_SYMBOL_GPL(percpu_down_write); + + void percpu_up_write(struct percpu_rw_semaphore *sem) + { ++ rwsem_release(&sem->dep_map, 1, _RET_IP_); ++ + /* + * Signal the writer is done, no fast path yet. + * +@@ -183,7 +191,7 @@ void percpu_up_write(struct percpu_rw_se + /* + * Release the write lock, this will allow readers back in the game. 
+ */ +- up_write(&sem->rw_sem); ++ __up_write(&sem->rw_sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -1383,7 +1383,7 @@ static inline int __down_read_trylock(st + /* + * lock for writing + */ +-static inline void __down_write(struct rw_semaphore *sem) ++inline void __down_write(struct rw_semaphore *sem) + { + long tmp = RWSEM_UNLOCKED_VALUE; + +@@ -1446,7 +1446,7 @@ inline void __up_read(struct rw_semaphor + /* + * unlock after writing + */ +-static inline void __up_write(struct rw_semaphore *sem) ++inline void __up_write(struct rw_semaphore *sem) + { + long tmp; + +--- a/kernel/locking/rwsem.h ++++ b/kernel/locking/rwsem.h +@@ -6,5 +6,7 @@ + + extern void __down_read(struct rw_semaphore *sem); + extern void __up_read(struct rw_semaphore *sem); ++extern void __down_write(struct rw_semaphore *sem); ++extern void __up_write(struct rw_semaphore *sem); + + #endif /* __INTERNAL_RWSEM_H */ diff --git a/kernel/patches-5.4.x-rt/0057-0002-locking-percpu-rwsem-Convert-to-bool.patch b/kernel/patches-5.4.x-rt/0057-0002-locking-percpu-rwsem-Convert-to-bool.patch new file mode 100644 index 000000000..46ff3d02a --- /dev/null +++ b/kernel/patches-5.4.x-rt/0057-0002-locking-percpu-rwsem-Convert-to-bool.patch @@ -0,0 +1,75 @@ +From: Peter Zijlstra +Date: Fri, 31 Jan 2020 16:07:05 +0100 +Subject: [PATCH 2/7] locking/percpu-rwsem: Convert to bool + +Use bool where possible. + +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Juri Lelli +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/percpu-rwsem.h | 6 +++--- + kernel/locking/percpu-rwsem.c | 8 ++++---- + 2 files changed, 7 insertions(+), 7 deletions(-) + +--- a/include/linux/percpu-rwsem.h ++++ b/include/linux/percpu-rwsem.h +@@ -41,7 +41,7 @@ is_static struct percpu_rw_semaphore nam + #define DEFINE_STATIC_PERCPU_RWSEM(name) \ + __DEFINE_PERCPU_RWSEM(name, static) + +-extern int __percpu_down_read(struct percpu_rw_semaphore *, int); ++extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool); + extern void __percpu_up_read(struct percpu_rw_semaphore *); + + static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +@@ -69,9 +69,9 @@ static inline void percpu_down_read(stru + preempt_enable(); + } + +-static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) ++static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) + { +- int ret = 1; ++ bool ret = true; + + preempt_disable(); + /* +--- a/kernel/locking/percpu-rwsem.c ++++ b/kernel/locking/percpu-rwsem.c +@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_ + } + EXPORT_SYMBOL_GPL(percpu_free_rwsem); + +-int __percpu_down_read(struct percpu_rw_semaphore *sem, int try) ++bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) + { + /* + * Due to having preemption disabled the decrement happens on +@@ -69,7 +69,7 @@ int __percpu_down_read(struct percpu_rw_ + * release in percpu_up_write(). 
+ */ + if (likely(!smp_load_acquire(&sem->readers_block))) +- return 1; ++ return true; + + /* + * Per the above comment; we still have preemption disabled and +@@ -78,7 +78,7 @@ int __percpu_down_read(struct percpu_rw_ + __percpu_up_read(sem); + + if (try) +- return 0; ++ return false; + + /* + * We either call schedule() in the wait, or we'll fall through +@@ -94,7 +94,7 @@ int __percpu_down_read(struct percpu_rw_ + __up_read(&sem->rw_sem); + + preempt_disable(); +- return 1; ++ return true; + } + EXPORT_SYMBOL_GPL(__percpu_down_read); + diff --git a/kernel/patches-5.4.x-rt/0058-0003-locking-percpu-rwsem-Move-__this_cpu_inc-into-the-sl.patch b/kernel/patches-5.4.x-rt/0058-0003-locking-percpu-rwsem-Move-__this_cpu_inc-into-the-sl.patch new file mode 100644 index 000000000..e9f0e0e5f --- /dev/null +++ b/kernel/patches-5.4.x-rt/0058-0003-locking-percpu-rwsem-Move-__this_cpu_inc-into-the-sl.patch @@ -0,0 +1,53 @@ +From: Peter Zijlstra +Date: Fri, 31 Jan 2020 16:07:06 +0100 +Subject: [PATCH 3/7] locking/percpu-rwsem: Move __this_cpu_inc() into the + slowpath + +As preparation to rework __percpu_down_read() move the +__this_cpu_inc() into it. + +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Juri Lelli +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/percpu-rwsem.h | 10 ++++++---- + kernel/locking/percpu-rwsem.c | 2 ++ + 2 files changed, 8 insertions(+), 4 deletions(-) + +--- a/include/linux/percpu-rwsem.h ++++ b/include/linux/percpu-rwsem.h +@@ -59,8 +59,9 @@ static inline void percpu_down_read(stru + * and that once the synchronize_rcu() is done, the writer will see + * anything we did within this RCU-sched read-size critical section. + */ +- __this_cpu_inc(*sem->read_count); +- if (unlikely(!rcu_sync_is_idle(&sem->rss))) ++ if (likely(rcu_sync_is_idle(&sem->rss))) ++ __this_cpu_inc(*sem->read_count); ++ else + __percpu_down_read(sem, false); /* Unconditional memory barrier */ + /* + * The preempt_enable() prevents the compiler from +@@ -77,8 +78,9 @@ static inline bool percpu_down_read_tryl + /* + * Same as in percpu_down_read(). + */ +- __this_cpu_inc(*sem->read_count); +- if (unlikely(!rcu_sync_is_idle(&sem->rss))) ++ if (likely(rcu_sync_is_idle(&sem->rss))) ++ __this_cpu_inc(*sem->read_count); ++ else + ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + preempt_enable(); + /* +--- a/kernel/locking/percpu-rwsem.c ++++ b/kernel/locking/percpu-rwsem.c +@@ -47,6 +47,8 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem); + + bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) + { ++ __this_cpu_inc(*sem->read_count); ++ + /* + * Due to having preemption disabled the decrement happens on + * the same CPU as the increment, avoiding the diff --git a/kernel/patches-5.4.x-rt/0059-0004-locking-percpu-rwsem-Extract-__percpu_down_read_tryl.patch b/kernel/patches-5.4.x-rt/0059-0004-locking-percpu-rwsem-Extract-__percpu_down_read_tryl.patch new file mode 100644 index 000000000..70ac7a9b5 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0059-0004-locking-percpu-rwsem-Extract-__percpu_down_read_tryl.patch @@ -0,0 +1,50 @@ +From: Peter Zijlstra +Date: Fri, 31 Jan 2020 16:07:07 +0100 +Subject: [PATCH 4/7] locking/percpu-rwsem: Extract + __percpu_down_read_trylock() + +In preparation for removing the embedded rwsem and building a custom +lock, extract the read-trylock primitive. 
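
The extracted primitive is small enough to read in isolation. Condensed from the hunk below (comments trimmed; this is a paraphrase of the patch, not new code): the reader fast path optimistically bumps this CPU's counter, then checks whether a writer has published readers_block; on failure it undoes the increment and prods the writer so readers_active_check() gets re-evaluated.

static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
	__this_cpu_inc(*sem->read_count);	/* caller has preemption disabled */

	smp_mb();	/* A matches D: order the increment vs. readers_block */

	if (likely(!smp_load_acquire(&sem->readers_block)))
		return true;			/* no writer: read lock taken */

	__this_cpu_dec(*sem->read_count);	/* back out on the same CPU */
	rcuwait_wake_up(&sem->writer);		/* let the writer re-check */
	return false;
}

__percpu_down_read() thus becomes "trylock, else sleep", which is what allows the next patch to replace the sleeping part with a custom waitqueue.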
+ +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Juri Lelli +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/locking/percpu-rwsem.c | 19 +++++++++++++------ + 1 file changed, 13 insertions(+), 6 deletions(-) + +--- a/kernel/locking/percpu-rwsem.c ++++ b/kernel/locking/percpu-rwsem.c +@@ -45,7 +45,7 @@ void percpu_free_rwsem(struct percpu_rw_ + } + EXPORT_SYMBOL_GPL(percpu_free_rwsem); + +-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) ++static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem) + { + __this_cpu_inc(*sem->read_count); + +@@ -73,11 +73,18 @@ bool __percpu_down_read(struct percpu_rw + if (likely(!smp_load_acquire(&sem->readers_block))) + return true; + +- /* +- * Per the above comment; we still have preemption disabled and +- * will thus decrement on the same CPU as we incremented. +- */ +- __percpu_up_read(sem); ++ __this_cpu_dec(*sem->read_count); ++ ++ /* Prod writer to re-evaluate readers_active_check() */ ++ rcuwait_wake_up(&sem->writer); ++ ++ return false; ++} ++ ++bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) ++{ ++ if (__percpu_down_read_trylock(sem)) ++ return true; + + if (try) + return false; diff --git a/kernel/patches-5.4.x-rt/0060-0005-locking-percpu-rwsem-Remove-the-embedded-rwsem.patch b/kernel/patches-5.4.x-rt/0060-0005-locking-percpu-rwsem-Remove-the-embedded-rwsem.patch new file mode 100644 index 000000000..1a8ef0e35 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0060-0005-locking-percpu-rwsem-Remove-the-embedded-rwsem.patch @@ -0,0 +1,433 @@ +From: Peter Zijlstra +Date: Fri, 31 Jan 2020 16:07:08 +0100 +Subject: [PATCH 5/7] locking/percpu-rwsem: Remove the embedded rwsem + +The filesystem freezer uses percpu-rwsem in a way that is effectively +write_non_owner() and achieves this with a few horrible hacks that +rely on the rwsem (!percpu) implementation. + +When PREEMPT_RT replaces the rwsem implementation with a PI aware +variant this comes apart. + +Remove the embedded rwsem and implement it using a waitqueue and an +atomic_t. + + - make readers_block an atomic, and use it, with the waitqueue + for a blocking test-and-set write-side. + + - have the read-side wait for the 'lock' state to clear. + +Have the waiters use FIFO queueing and mark them (reader/writer) with +a new WQ_FLAG. Use a custom wake_function to wake either a single +writer or all readers until a writer. 
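
In other words, the write side stops being "take the embedded rwsem" and becomes "win the atomic block flag, otherwise queue FIFO behind it". The following is a condensed editor's sketch of the resulting write path using the names introduced by this patch (lockdep annotations and most comments dropped, demo_* wrappers hypothetical; it is not meant to build outside kernel/locking/percpu-rwsem.c, see the hunks below for the real code):

void demo_percpu_down_write(struct percpu_rw_semaphore *sem)
{
	rcu_sync_enter(&sem->rss);		/* push readers onto the slow path */

	if (!__percpu_down_write_trylock(sem))	/* atomic_xchg(&sem->block, 1) */
		percpu_rwsem_wait(sem, /* .reader = */ false);	/* FIFO wait */

	/* this writer owns sem->block; wait out the active readers */
	rcuwait_wait_event(&sem->writer, readers_active_check(sem));
}

void demo_percpu_up_write(struct percpu_rw_semaphore *sem)
{
	atomic_set_release(&sem->block, 0);	/* reopen the gate */

	/* wake queued readers until (and including) the first writer */
	__wake_up(&sem->waiters, TASK_NORMAL, 1, sem);

	rcu_sync_exit(&sem->rss);		/* reader fast path comes back later */
}

The read side keeps its per-CPU fast path; only contended readers ever touch sem->waiters, and percpu_rwsem_wake_function() is what enforces "readers until one writer" in FIFO order.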
+ +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Juri Lelli +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/percpu-rwsem.h | 19 +---- + include/linux/rwsem.h | 6 - + include/linux/wait.h | 1 + kernel/locking/percpu-rwsem.c | 153 ++++++++++++++++++++++++++++++------------ + kernel/locking/rwsem.c | 11 +-- + kernel/locking/rwsem.h | 12 --- + 6 files changed, 123 insertions(+), 79 deletions(-) + +--- a/include/linux/percpu-rwsem.h ++++ b/include/linux/percpu-rwsem.h +@@ -3,18 +3,18 @@ + #define _LINUX_PERCPU_RWSEM_H + + #include +-#include + #include + #include ++#include + #include + #include + + struct percpu_rw_semaphore { + struct rcu_sync rss; + unsigned int __percpu *read_count; +- struct rw_semaphore rw_sem; /* slowpath */ +- struct rcuwait writer; /* blocked writer */ +- int readers_block; ++ struct rcuwait writer; ++ wait_queue_head_t waiters; ++ atomic_t block; + #ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + #endif +@@ -31,8 +31,9 @@ static DEFINE_PER_CPU(unsigned int, __pe + is_static struct percpu_rw_semaphore name = { \ + .rss = __RCU_SYNC_INITIALIZER(name.rss), \ + .read_count = &__percpu_rwsem_rc_##name, \ +- .rw_sem = __RWSEM_INITIALIZER(name.rw_sem), \ + .writer = __RCUWAIT_INITIALIZER(name.writer), \ ++ .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters), \ ++ .block = ATOMIC_INIT(0), \ + __PERCPU_RWSEM_DEP_MAP_INIT(name) \ + } + +@@ -130,20 +131,12 @@ static inline void percpu_rwsem_release( + bool read, unsigned long ip) + { + lock_release(&sem->dep_map, 1, ip); +-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +- if (!read) +- atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN); +-#endif + } + + static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) + { + lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip); +-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +- if (!read) +- atomic_long_set(&sem->rw_sem.owner, (long)current); +-#endif + } + + #endif +--- a/include/linux/rwsem.h ++++ b/include/linux/rwsem.h +@@ -53,12 +53,6 @@ struct rw_semaphore { + #endif + }; + +-/* +- * Setting all bits of the owner field except bit 0 will indicate +- * that the rwsem is writer-owned with an unknown owner. 
+- */ +-#define RWSEM_OWNER_UNKNOWN (-2L) +- + /* In all implementations count != 0 means locked */ + static inline int rwsem_is_locked(struct rw_semaphore *sem) + { +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -20,6 +20,7 @@ int default_wake_function(struct wait_qu + #define WQ_FLAG_EXCLUSIVE 0x01 + #define WQ_FLAG_WOKEN 0x02 + #define WQ_FLAG_BOOKMARK 0x04 ++#define WQ_FLAG_CUSTOM 0x08 + + /* + * A single wait-queue entry structure: +--- a/kernel/locking/percpu-rwsem.c ++++ b/kernel/locking/percpu-rwsem.c +@@ -1,15 +1,14 @@ + // SPDX-License-Identifier: GPL-2.0-only + #include +-#include + #include ++#include + #include + #include + #include + #include ++#include + #include + +-#include "rwsem.h" +- + int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, + const char *name, struct lock_class_key *key) + { +@@ -17,11 +16,10 @@ int __percpu_init_rwsem(struct percpu_rw + if (unlikely(!sem->read_count)) + return -ENOMEM; + +- /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ + rcu_sync_init(&sem->rss); +- init_rwsem(&sem->rw_sem); + rcuwait_init(&sem->writer); +- sem->readers_block = 0; ++ init_waitqueue_head(&sem->waiters); ++ atomic_set(&sem->block, 0); + #ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +@@ -54,23 +52,23 @@ static bool __percpu_down_read_trylock(s + * the same CPU as the increment, avoiding the + * increment-on-one-CPU-and-decrement-on-another problem. + * +- * If the reader misses the writer's assignment of readers_block, then +- * the writer is guaranteed to see the reader's increment. ++ * If the reader misses the writer's assignment of sem->block, then the ++ * writer is guaranteed to see the reader's increment. + * + * Conversely, any readers that increment their sem->read_count after +- * the writer looks are guaranteed to see the readers_block value, +- * which in turn means that they are guaranteed to immediately +- * decrement their sem->read_count, so that it doesn't matter that the +- * writer missed them. ++ * the writer looks are guaranteed to see the sem->block value, which ++ * in turn means that they are guaranteed to immediately decrement ++ * their sem->read_count, so that it doesn't matter that the writer ++ * missed them. + */ + + smp_mb(); /* A matches D */ + + /* +- * If !readers_block the critical section starts here, matched by the ++ * If !sem->block the critical section starts here, matched by the + * release in percpu_up_write(). + */ +- if (likely(!smp_load_acquire(&sem->readers_block))) ++ if (likely(!atomic_read_acquire(&sem->block))) + return true; + + __this_cpu_dec(*sem->read_count); +@@ -81,6 +79,88 @@ static bool __percpu_down_read_trylock(s + return false; + } + ++static inline bool __percpu_down_write_trylock(struct percpu_rw_semaphore *sem) ++{ ++ if (atomic_read(&sem->block)) ++ return false; ++ ++ return atomic_xchg(&sem->block, 1) == 0; ++} ++ ++static bool __percpu_rwsem_trylock(struct percpu_rw_semaphore *sem, bool reader) ++{ ++ if (reader) { ++ bool ret; ++ ++ preempt_disable(); ++ ret = __percpu_down_read_trylock(sem); ++ preempt_enable(); ++ ++ return ret; ++ } ++ return __percpu_down_write_trylock(sem); ++} ++ ++/* ++ * The return value of wait_queue_entry::func means: ++ * ++ * <0 - error, wakeup is terminated and the error is returned ++ * 0 - no wakeup, a next waiter is tried ++ * >0 - woken, if EXCLUSIVE, counted towards @nr_exclusive. 
++ * ++ * We use EXCLUSIVE for both readers and writers to preserve FIFO order, ++ * and play games with the return value to allow waking multiple readers. ++ * ++ * Specifically, we wake readers until we've woken a single writer, or until a ++ * trylock fails. ++ */ ++static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, ++ unsigned int mode, int wake_flags, ++ void *key) ++{ ++ struct task_struct *p = get_task_struct(wq_entry->private); ++ bool reader = wq_entry->flags & WQ_FLAG_CUSTOM; ++ struct percpu_rw_semaphore *sem = key; ++ ++ /* concurrent against percpu_down_write(), can get stolen */ ++ if (!__percpu_rwsem_trylock(sem, reader)) ++ return 1; ++ ++ list_del_init(&wq_entry->entry); ++ smp_store_release(&wq_entry->private, NULL); ++ ++ wake_up_process(p); ++ put_task_struct(p); ++ ++ return !reader; /* wake (readers until) 1 writer */ ++} ++ ++static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) ++{ ++ DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); ++ bool wait; ++ ++ spin_lock_irq(&sem->waiters.lock); ++ /* ++ * Serialize against the wakeup in percpu_up_write(), if we fail ++ * the trylock, the wakeup must see us on the list. ++ */ ++ wait = !__percpu_rwsem_trylock(sem, reader); ++ if (wait) { ++ wq_entry.flags |= WQ_FLAG_EXCLUSIVE | reader * WQ_FLAG_CUSTOM; ++ __add_wait_queue_entry_tail(&sem->waiters, &wq_entry); ++ } ++ spin_unlock_irq(&sem->waiters.lock); ++ ++ while (wait) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (!smp_load_acquire(&wq_entry.private)) ++ break; ++ schedule(); ++ } ++ __set_current_state(TASK_RUNNING); ++} ++ + bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) + { + if (__percpu_down_read_trylock(sem)) +@@ -89,20 +169,10 @@ bool __percpu_down_read(struct percpu_rw + if (try) + return false; + +- /* +- * We either call schedule() in the wait, or we'll fall through +- * and reschedule on the preempt_enable() in percpu_down_read(). +- */ +- preempt_enable_no_resched(); +- +- /* +- * Avoid lockdep for the down/up_read() we already have them. +- */ +- __down_read(&sem->rw_sem); +- this_cpu_inc(*sem->read_count); +- __up_read(&sem->rw_sem); +- ++ preempt_enable(); ++ percpu_rwsem_wait(sem, /* .reader = */ true); + preempt_disable(); ++ + return true; + } + EXPORT_SYMBOL_GPL(__percpu_down_read); +@@ -117,7 +187,7 @@ void __percpu_up_read(struct percpu_rw_s + */ + __this_cpu_dec(*sem->read_count); + +- /* Prod writer to recheck readers_active */ ++ /* Prod writer to re-evaluate readers_active_check() */ + rcuwait_wake_up(&sem->writer); + } + EXPORT_SYMBOL_GPL(__percpu_up_read); +@@ -137,6 +207,8 @@ EXPORT_SYMBOL_GPL(__percpu_up_read); + * zero. If this sum is zero, then it is stable due to the fact that if any + * newly arriving readers increment a given counter, they will immediately + * decrement that same counter. ++ * ++ * Assumes sem->block is set. + */ + static bool readers_active_check(struct percpu_rw_semaphore *sem) + { +@@ -160,23 +232,22 @@ void percpu_down_write(struct percpu_rw_ + /* Notify readers to take the slow path. */ + rcu_sync_enter(&sem->rss); + +- __down_write(&sem->rw_sem); +- + /* +- * Notify new readers to block; up until now, and thus throughout the +- * longish rcu_sync_enter() above, new readers could still come in. ++ * Try set sem->block; this provides writer-writer exclusion. ++ * Having sem->block set makes new readers block. 
+ */ +- WRITE_ONCE(sem->readers_block, 1); ++ if (!__percpu_down_write_trylock(sem)) ++ percpu_rwsem_wait(sem, /* .reader = */ false); + +- smp_mb(); /* D matches A */ ++ /* smp_mb() implied by __percpu_down_write_trylock() on success -- D matches A */ + + /* +- * If they don't see our writer of readers_block, then we are +- * guaranteed to see their sem->read_count increment, and therefore +- * will wait for them. ++ * If they don't see our store of sem->block, then we are guaranteed to ++ * see their sem->read_count increment, and therefore will wait for ++ * them. + */ + +- /* Wait for all now active readers to complete. */ ++ /* Wait for all active readers to complete. */ + rcuwait_wait_event(&sem->writer, readers_active_check(sem)); + } + EXPORT_SYMBOL_GPL(percpu_down_write); +@@ -195,12 +266,12 @@ void percpu_up_write(struct percpu_rw_se + * Therefore we force it through the slow path which guarantees an + * acquire and thereby guarantees the critical section's consistency. + */ +- smp_store_release(&sem->readers_block, 0); ++ atomic_set_release(&sem->block, 0); + + /* +- * Release the write lock, this will allow readers back in the game. ++ * Prod any pending reader/writer to make progress. + */ +- __up_write(&sem->rw_sem); ++ __wake_up(&sem->waiters, TASK_NORMAL, 1, sem); + + /* + * Once this completes (at least one RCU-sched grace period hence) the +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -28,7 +28,6 @@ + #include + #include + +-#include "rwsem.h" + #include "lock_events.h" + + /* +@@ -660,8 +659,6 @@ static inline bool rwsem_can_spin_on_own + unsigned long flags; + bool ret = true; + +- BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); +- + if (need_resched()) { + lockevent_inc(rwsem_opt_fail); + return false; +@@ -1338,7 +1335,7 @@ static struct rw_semaphore *rwsem_downgr + /* + * lock for reading + */ +-inline void __down_read(struct rw_semaphore *sem) ++static inline void __down_read(struct rw_semaphore *sem) + { + if (!rwsem_read_trylock(sem)) { + rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); +@@ -1383,7 +1380,7 @@ static inline int __down_read_trylock(st + /* + * lock for writing + */ +-inline void __down_write(struct rw_semaphore *sem) ++static inline void __down_write(struct rw_semaphore *sem) + { + long tmp = RWSEM_UNLOCKED_VALUE; + +@@ -1426,7 +1423,7 @@ static inline int __down_write_trylock(s + /* + * unlock after reading + */ +-inline void __up_read(struct rw_semaphore *sem) ++static inline void __up_read(struct rw_semaphore *sem) + { + long tmp; + +@@ -1446,7 +1443,7 @@ inline void __up_read(struct rw_semaphor + /* + * unlock after writing + */ +-inline void __up_write(struct rw_semaphore *sem) ++static inline void __up_write(struct rw_semaphore *sem) + { + long tmp; + +--- a/kernel/locking/rwsem.h ++++ b/kernel/locking/rwsem.h +@@ -1,12 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +- +-#ifndef __INTERNAL_RWSEM_H +-#define __INTERNAL_RWSEM_H +-#include +- +-extern void __down_read(struct rw_semaphore *sem); +-extern void __up_read(struct rw_semaphore *sem); +-extern void __down_write(struct rw_semaphore *sem); +-extern void __up_write(struct rw_semaphore *sem); +- +-#endif /* __INTERNAL_RWSEM_H */ diff --git a/kernel/patches-5.4.x-rt/0061-0006-locking-percpu-rwsem-Fold-__percpu_up_read.patch b/kernel/patches-5.4.x-rt/0061-0006-locking-percpu-rwsem-Fold-__percpu_up_read.patch new file mode 100644 index 000000000..a89d99f6e --- /dev/null +++ b/kernel/patches-5.4.x-rt/0061-0006-locking-percpu-rwsem-Fold-__percpu_up_read.patch 
@@ -0,0 +1,85 @@ +From: Davidlohr Bueso +Date: Fri, 31 Jan 2020 16:07:09 +0100 +Subject: [PATCH 6/7] locking/percpu-rwsem: Fold __percpu_up_read() + +Now that __percpu_up_read() is only ever used from percpu_up_read() +merge them, it's a small function. + +Signed-off-by: Peter Zijlstra (Intel) +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/percpu-rwsem.h | 19 +++++++++++++++---- + kernel/exit.c | 1 + + kernel/locking/percpu-rwsem.c | 15 --------------- + 3 files changed, 16 insertions(+), 19 deletions(-) + +--- a/include/linux/percpu-rwsem.h ++++ b/include/linux/percpu-rwsem.h +@@ -43,7 +43,6 @@ is_static struct percpu_rw_semaphore nam + __DEFINE_PERCPU_RWSEM(name, static) + + extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool); +-extern void __percpu_up_read(struct percpu_rw_semaphore *); + + static inline void percpu_down_read(struct percpu_rw_semaphore *sem) + { +@@ -103,10 +102,22 @@ static inline void percpu_up_read(struct + /* + * Same as in percpu_down_read(). + */ +- if (likely(rcu_sync_is_idle(&sem->rss))) ++ if (likely(rcu_sync_is_idle(&sem->rss))) { + __this_cpu_dec(*sem->read_count); +- else +- __percpu_up_read(sem); /* Unconditional memory barrier */ ++ } else { ++ /* ++ * slowpath; reader will only ever wake a single blocked ++ * writer. ++ */ ++ smp_mb(); /* B matches C */ ++ /* ++ * In other words, if they see our decrement (presumably to ++ * aggregate zero, as that is the only time it matters) they ++ * will also see our critical section. ++ */ ++ __this_cpu_dec(*sem->read_count); ++ rcuwait_wake_up(&sem->writer); ++ } + preempt_enable(); + } + +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -258,6 +258,7 @@ void rcuwait_wake_up(struct rcuwait *w) + wake_up_process(task); + rcu_read_unlock(); + } ++EXPORT_SYMBOL_GPL(rcuwait_wake_up); + + /* + * Determine if a process group is "orphaned", according to the POSIX +--- a/kernel/locking/percpu-rwsem.c ++++ b/kernel/locking/percpu-rwsem.c +@@ -177,21 +177,6 @@ bool __percpu_down_read(struct percpu_rw + } + EXPORT_SYMBOL_GPL(__percpu_down_read); + +-void __percpu_up_read(struct percpu_rw_semaphore *sem) +-{ +- smp_mb(); /* B matches C */ +- /* +- * In other words, if they see our decrement (presumably to aggregate +- * zero, as that is the only time it matters) they will also see our +- * critical section. +- */ +- __this_cpu_dec(*sem->read_count); +- +- /* Prod writer to re-evaluate readers_active_check() */ +- rcuwait_wake_up(&sem->writer); +-} +-EXPORT_SYMBOL_GPL(__percpu_up_read); +- + #define per_cpu_sum(var) \ + ({ \ + typeof(var) __sum = 0; \ diff --git a/kernel/patches-5.4.x-rt/0062-0007-locking-percpu-rwsem-Add-might_sleep-for-writer-lock.patch b/kernel/patches-5.4.x-rt/0062-0007-locking-percpu-rwsem-Add-might_sleep-for-writer-lock.patch new file mode 100644 index 000000000..8f5e2a791 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0062-0007-locking-percpu-rwsem-Add-might_sleep-for-writer-lock.patch @@ -0,0 +1,26 @@ +From: Davidlohr Bueso +Date: Fri, 31 Jan 2020 16:07:10 +0100 +Subject: [PATCH 7/7] locking/percpu-rwsem: Add might_sleep() for writer + locking + +We are missing this annotation in percpu_down_write(). Correct +this. 
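
might_sleep() both documents the blocking contract and, with CONFIG_DEBUG_ATOMIC_SLEEP, warns as soon as percpu_down_write() is entered from atomic context, even when the lock is uncontended and nothing would actually have slept. A small illustration of the kind of caller this now catches (hypothetical demo_* names, deliberately buggy, not from the patch):

#include <linux/percpu-rwsem.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static DEFINE_STATIC_PERCPU_RWSEM(demo_rwsem);

static void demo_bad_caller(void)
{
	spin_lock(&demo_lock);			/* non-RT: preemption off, atomic ctx */

	percpu_down_write(&demo_rwsem);		/* might_sleep() now splats here,
						 * even on the uncontended path */
	percpu_up_write(&demo_rwsem);

	spin_unlock(&demo_lock);
}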
+ +Signed-off-by: Davidlohr Bueso +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20200108013305.7732-1-dave@stgolabs.net +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/locking/percpu-rwsem.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/kernel/locking/percpu-rwsem.c ++++ b/kernel/locking/percpu-rwsem.c +@@ -212,6 +212,7 @@ static bool readers_active_check(struct + + void percpu_down_write(struct percpu_rw_semaphore *sem) + { ++ might_sleep(); + rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); + + /* Notify readers to take the slow path. */ diff --git a/kernel/patches-5.4.x-rt/0063-fs-buffer-Make-BH_Uptodate_Lock-bit_spin_lock-a-regu.patch b/kernel/patches-5.4.x-rt/0063-fs-buffer-Make-BH_Uptodate_Lock-bit_spin_lock-a-regu.patch new file mode 100644 index 000000000..17b26e49f --- /dev/null +++ b/kernel/patches-5.4.x-rt/0063-fs-buffer-Make-BH_Uptodate_Lock-bit_spin_lock-a-regu.patch @@ -0,0 +1,192 @@ +From: Thomas Gleixner +Date: Fri, 15 Nov 2019 18:54:20 +0100 +Subject: [PATCH] fs/buffer: Make BH_Uptodate_Lock bit_spin_lock a regular + spinlock_t + +Bit spinlocks are problematic if PREEMPT_RT is enabled, because they +disable preemption, which is undesired for latency reasons and breaks when +regular spinlocks are taken within the bit_spinlock locked region because +regular spinlocks are converted to 'sleeping spinlocks' on RT. So RT +replaces the bit spinlocks with regular spinlocks to avoid this problem. +Bit spinlocks are also not covered by lock debugging, e.g. lockdep. + +Substitute the BH_Uptodate_Lock bit spinlock with a regular spinlock. + +Reviewed-by: Jan Kara +Signed-off-by: Thomas Gleixner +[bigeasy: remove the wrapper and use always spinlock_t and move it into + the padding hole] +Signed-off-by: Sebastian Andrzej Siewior +--- +v2…v3: rename uptodate_lock to b_uptodate_lock. + +v1…v2: Move the spinlock_t to the padding hole as per Jan Kara. pahole says +its total size remained unchanged, before + +| atomic_t b_count; /* 96 4 */ +| +| /* size: 104, cachelines: 2, members: 12 */ +| /* padding: 4 */ +| /* last cacheline: 40 bytes */ + +after + +| atomic_t b_count; /* 96 4 */ +| spinlock_t uptodate_lock; /* 100 4 */ +| +| /* size: 104, cachelines: 2, members: 13 */ +| /* last cacheline: 40 bytes */ + + fs/buffer.c | 19 +++++++------------ + fs/ext4/page-io.c | 8 +++----- + fs/ntfs/aops.c | 9 +++------ + include/linux/buffer_head.h | 6 +++--- + 4 files changed, 16 insertions(+), 26 deletions(-) + +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -275,8 +275,7 @@ static void end_buffer_async_read(struct + * decide that the page is now completely done. 
+ */ + first = page_buffers(page); +- local_irq_save(flags); +- bit_spin_lock(BH_Uptodate_Lock, &first->b_state); ++ spin_lock_irqsave(&first->b_uptodate_lock, flags); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; +@@ -289,8 +288,7 @@ static void end_buffer_async_read(struct + } + tmp = tmp->b_this_page; + } while (tmp != bh); +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + + /* + * If none of the buffers had errors and they are all +@@ -302,8 +300,7 @@ static void end_buffer_async_read(struct + return; + + still_busy: +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + return; + } + +@@ -331,8 +328,7 @@ void end_buffer_async_write(struct buffe + } + + first = page_buffers(page); +- local_irq_save(flags); +- bit_spin_lock(BH_Uptodate_Lock, &first->b_state); ++ spin_lock_irqsave(&first->b_uptodate_lock, flags); + + clear_buffer_async_write(bh); + unlock_buffer(bh); +@@ -344,14 +340,12 @@ void end_buffer_async_write(struct buffe + } + tmp = tmp->b_this_page; + } +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + end_page_writeback(page); + return; + + still_busy: +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + return; + } + EXPORT_SYMBOL(end_buffer_async_write); +@@ -3345,6 +3339,7 @@ struct buffer_head *alloc_buffer_head(gf + struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); + if (ret) { + INIT_LIST_HEAD(&ret->b_assoc_buffers); ++ spin_lock_init(&ret->b_uptodate_lock); + preempt_disable(); + __this_cpu_inc(bh_accounting.nr); + recalc_bh_state(); +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -87,11 +87,10 @@ static void ext4_finish_bio(struct bio * + } + bh = head = page_buffers(page); + /* +- * We check all buffers in the page under BH_Uptodate_Lock ++ * We check all buffers in the page under b_uptodate_lock + * to avoid races with other end io clearing async_write flags + */ +- local_irq_save(flags); +- bit_spin_lock(BH_Uptodate_Lock, &head->b_state); ++ spin_lock_irqsave(&head->b_uptodate_lock, flags); + do { + if (bh_offset(bh) < bio_start || + bh_offset(bh) + bh->b_size > bio_end) { +@@ -103,8 +102,7 @@ static void ext4_finish_bio(struct bio * + if (bio->bi_status) + buffer_io_error(bh); + } while ((bh = bh->b_this_page) != head); +- bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&head->b_uptodate_lock, flags); + if (!under_io) { + fscrypt_free_bounce_page(bounce_page); + end_page_writeback(page); +--- a/fs/ntfs/aops.c ++++ b/fs/ntfs/aops.c +@@ -92,8 +92,7 @@ static void ntfs_end_buffer_async_read(s + "0x%llx.", (unsigned long long)bh->b_blocknr); + } + first = page_buffers(page); +- local_irq_save(flags); +- bit_spin_lock(BH_Uptodate_Lock, &first->b_state); ++ spin_lock_irqsave(&first->b_uptodate_lock, flags); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; +@@ -108,8 +107,7 @@ static void ntfs_end_buffer_async_read(s + } + tmp = tmp->b_this_page; + } while (tmp != bh); +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + /* + * If none of the buffers had errors then we can set the page uptodate, + * but 
we first have to perform the post read mst fixups, if the +@@ -142,8 +140,7 @@ static void ntfs_end_buffer_async_read(s + unlock_page(page); + return; + still_busy: +- bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + return; + } + +--- a/include/linux/buffer_head.h ++++ b/include/linux/buffer_head.h +@@ -22,9 +22,6 @@ enum bh_state_bits { + BH_Dirty, /* Is dirty */ + BH_Lock, /* Is locked */ + BH_Req, /* Has been submitted for I/O */ +- BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise +- * IO completion of other buffers in the page +- */ + + BH_Mapped, /* Has a disk mapping */ + BH_New, /* Disk mapping was newly created by get_block */ +@@ -76,6 +73,9 @@ struct buffer_head { + struct address_space *b_assoc_map; /* mapping this buffer is + associated with */ + atomic_t b_count; /* users using this buffer_head */ ++ spinlock_t b_uptodate_lock; /* Used by the first bh in a page, to ++ * serialise IO completion of other ++ * buffers in the page */ + }; + + /* diff --git a/kernel/patches-5.4.x-rt/0064-thermal-x86_pkg_temp-make-pkg_temp_lock-a-raw-spinlo.patch b/kernel/patches-5.4.x-rt/0064-thermal-x86_pkg_temp-make-pkg_temp_lock-a-raw-spinlo.patch new file mode 100644 index 000000000..7f544dae5 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0064-thermal-x86_pkg_temp-make-pkg_temp_lock-a-raw-spinlo.patch @@ -0,0 +1,109 @@ +From: Clark Williams +Date: Mon, 15 Jul 2019 15:25:00 -0500 +Subject: [PATCH] thermal/x86_pkg_temp: Make pkg_temp_lock a raw_spinlock_t + +The spinlock pkg_temp_lock has the potential of being taken in atomic +context because it can be acquired from the thermal IRQ vector. +It's static and limited scope so go ahead and make it a raw spinlock. 
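
Background for the conversion: on PREEMPT_RT a spinlock_t is a sleeping lock and therefore must not be taken from the thermal interrupt path, while a raw_spinlock_t keeps the non-sleeping, preemption-disabling semantics on RT as well. A minimal sketch of the pattern (hypothetical demo_* names; this is not the driver code below):

#include <linux/interrupt.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_lock);	/* still spins on RT */
static unsigned long demo_events;

/* Hard-IRQ context: only a raw spinlock is legal here on RT. */
static irqreturn_t demo_irq(int irq, void *dev_id)
{
	raw_spin_lock(&demo_lock);
	demo_events++;
	raw_spin_unlock(&demo_lock);
	return IRQ_HANDLED;
}

/* Task context: disable local IRQs to avoid deadlocking with demo_irq(). */
static unsigned long demo_read_events(void)
{
	unsigned long flags, val;

	raw_spin_lock_irqsave(&demo_lock, flags);
	val = demo_events;
	raw_spin_unlock_irqrestore(&demo_lock, flags);
	return val;
}

The trade-off is that raw spinlock hold times add directly to RT latency, which is why the changelog stresses that pkg_temp_lock is static and of limited scope.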
+ +Signed-off-by: Clark Williams +Signed-off-by: Sebastian Andrzej Siewior +--- + drivers/thermal/intel/x86_pkg_temp_thermal.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +--- a/drivers/thermal/intel/x86_pkg_temp_thermal.c ++++ b/drivers/thermal/intel/x86_pkg_temp_thermal.c +@@ -63,7 +63,7 @@ static int max_id __read_mostly; + /* Array of zone pointers */ + static struct zone_device **zones; + /* Serializes interrupt notification, work and hotplug */ +-static DEFINE_SPINLOCK(pkg_temp_lock); ++static DEFINE_RAW_SPINLOCK(pkg_temp_lock); + /* Protects zone operation in the work function against hotplug removal */ + static DEFINE_MUTEX(thermal_zone_mutex); + +@@ -266,12 +266,12 @@ static void pkg_temp_thermal_threshold_w + u64 msr_val, wr_val; + + mutex_lock(&thermal_zone_mutex); +- spin_lock_irq(&pkg_temp_lock); ++ raw_spin_lock_irq(&pkg_temp_lock); + ++pkg_work_cnt; + + zonedev = pkg_temp_thermal_get_dev(cpu); + if (!zonedev) { +- spin_unlock_irq(&pkg_temp_lock); ++ raw_spin_unlock_irq(&pkg_temp_lock); + mutex_unlock(&thermal_zone_mutex); + return; + } +@@ -285,7 +285,7 @@ static void pkg_temp_thermal_threshold_w + } + + enable_pkg_thres_interrupt(); +- spin_unlock_irq(&pkg_temp_lock); ++ raw_spin_unlock_irq(&pkg_temp_lock); + + /* + * If tzone is not NULL, then thermal_zone_mutex will prevent the +@@ -310,7 +310,7 @@ static int pkg_thermal_notify(u64 msr_va + struct zone_device *zonedev; + unsigned long flags; + +- spin_lock_irqsave(&pkg_temp_lock, flags); ++ raw_spin_lock_irqsave(&pkg_temp_lock, flags); + ++pkg_interrupt_cnt; + + disable_pkg_thres_interrupt(); +@@ -322,7 +322,7 @@ static int pkg_thermal_notify(u64 msr_va + pkg_thermal_schedule_work(zonedev->cpu, &zonedev->work); + } + +- spin_unlock_irqrestore(&pkg_temp_lock, flags); ++ raw_spin_unlock_irqrestore(&pkg_temp_lock, flags); + return 0; + } + +@@ -368,9 +368,9 @@ static int pkg_temp_thermal_device_add(u + zonedev->msr_pkg_therm_high); + + cpumask_set_cpu(cpu, &zonedev->cpumask); +- spin_lock_irq(&pkg_temp_lock); ++ raw_spin_lock_irq(&pkg_temp_lock); + zones[id] = zonedev; +- spin_unlock_irq(&pkg_temp_lock); ++ raw_spin_unlock_irq(&pkg_temp_lock); + return 0; + } + +@@ -407,7 +407,7 @@ static int pkg_thermal_cpu_offline(unsig + } + + /* Protect against work and interrupts */ +- spin_lock_irq(&pkg_temp_lock); ++ raw_spin_lock_irq(&pkg_temp_lock); + + /* + * Check whether this cpu was the current target and store the new +@@ -439,9 +439,9 @@ static int pkg_thermal_cpu_offline(unsig + * To cancel the work we need to drop the lock, otherwise + * we might deadlock if the work needs to be flushed. 
+ */ +- spin_unlock_irq(&pkg_temp_lock); ++ raw_spin_unlock_irq(&pkg_temp_lock); + cancel_delayed_work_sync(&zonedev->work); +- spin_lock_irq(&pkg_temp_lock); ++ raw_spin_lock_irq(&pkg_temp_lock); + /* + * If this is not the last cpu in the package and the work + * did not run after we dropped the lock above, then we +@@ -452,7 +452,7 @@ static int pkg_thermal_cpu_offline(unsig + pkg_thermal_schedule_work(target, &zonedev->work); + } + +- spin_unlock_irq(&pkg_temp_lock); ++ raw_spin_unlock_irq(&pkg_temp_lock); + + /* Final cleanup if this is the last cpu */ + if (lastcpu) diff --git a/kernel/patches-5.4.x-rt/0065-perf-core-Add-SRCU-annotation-for-pmus-list-walk.patch b/kernel/patches-5.4.x-rt/0065-perf-core-Add-SRCU-annotation-for-pmus-list-walk.patch new file mode 100644 index 000000000..5632c3877 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0065-perf-core-Add-SRCU-annotation-for-pmus-list-walk.patch @@ -0,0 +1,30 @@ +From: Sebastian Andrzej Siewior +Date: Fri, 15 Nov 2019 18:04:07 +0100 +Subject: [PATCH] perf/core: Add SRCU annotation for pmus list walk + +Since commit + 28875945ba98d ("rcu: Add support for consolidated-RCU reader checking") + +there is an additional check to ensure that a RCU related lock is held +while the RCU list is iterated. +This section holds the SRCU reader lock instead. + +Add annotation to list_for_each_entry_rcu() that pmus_srcu must be +acquired during the list traversal. + +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/events/core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -10264,7 +10264,7 @@ static struct pmu *perf_init_event(struc + goto unlock; + } + +- list_for_each_entry_rcu(pmu, &pmus, entry) { ++ list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) { + ret = perf_try_init_event(pmu, event); + if (!ret) + goto unlock; diff --git a/kernel/patches-5.4.x-rt/0066-kmemleak-Turn-kmemleak_lock-and-object-lock-to-raw_s.patch b/kernel/patches-5.4.x-rt/0066-kmemleak-Turn-kmemleak_lock-and-object-lock-to-raw_s.patch new file mode 100644 index 000000000..23039044e --- /dev/null +++ b/kernel/patches-5.4.x-rt/0066-kmemleak-Turn-kmemleak_lock-and-object-lock-to-raw_s.patch @@ -0,0 +1,411 @@ +From: He Zhe +Date: Wed, 19 Dec 2018 16:30:57 +0100 +Subject: [PATCH] kmemleak: Turn kmemleak_lock and object->lock to + raw_spinlock_t + +kmemleak_lock as a rwlock on RT can possibly be acquired in atomic context +which does work on RT. +Since the kmemleak operation is performed in atomic context make it a +raw_spinlock_t so it can also be acquired on RT. This is used for +debugging and is not enabled by default in a production like environment +(where performance/latency matters) so it makes sense to make it a +raw_spinlock_t instead trying to get rid of the atomic context. +Turn also the kmemleak_object->lock into raw_spinlock_t which is +acquired (nested) while the kmemleak_lock is held. + +The time spent in "echo scan > kmemleak" slightly improved on 64core box +with this patch applied after boot. + +Acked-by: Catalin Marinas +Link: https://lkml.kernel.org/r/20181218150744.GB20197@arrakis.emea.arm.com +Link: https://lkml.kernel.org/r/1542877459-144382-1-git-send-email-zhe.he@windriver.com +Link: https://lkml.kernel.org/r/20190927082230.34152-1-yongxin.liu@windriver.com +Signed-off-by: He Zhe +Signed-off-by: Liu Haitao +Signed-off-by: Yongxin Liu +[bigeasy: Redo the description. 
Merge the individual bits: He Zhe did +the kmemleak_lock, Liu Haitao the ->lock and Yongxin Liu forwarded the +patch.] +Signed-off-by: Sebastian Andrzej Siewior +--- + mm/kmemleak.c | 112 +++++++++++++++++++++++++++++----------------------------- + 1 file changed, 56 insertions(+), 56 deletions(-) + +--- a/mm/kmemleak.c ++++ b/mm/kmemleak.c +@@ -13,7 +13,7 @@ + * + * The following locks and mutexes are used by kmemleak: + * +- * - kmemleak_lock (rwlock): protects the object_list modifications and ++ * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and + * accesses to the object_tree_root. The object_list is the main list + * holding the metadata (struct kmemleak_object) for the allocated memory + * blocks. The object_tree_root is a red black tree used to look-up +@@ -22,13 +22,13 @@ + * object_tree_root in the create_object() function called from the + * kmemleak_alloc() callback and removed in delete_object() called from the + * kmemleak_free() callback +- * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to +- * the metadata (e.g. count) are protected by this lock. Note that some +- * members of this structure may be protected by other means (atomic or +- * kmemleak_lock). This lock is also held when scanning the corresponding +- * memory block to avoid the kernel freeing it via the kmemleak_free() +- * callback. This is less heavyweight than holding a global lock like +- * kmemleak_lock during scanning ++ * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object. ++ * Accesses to the metadata (e.g. count) are protected by this lock. Note ++ * that some members of this structure may be protected by other means ++ * (atomic or kmemleak_lock). This lock is also held when scanning the ++ * corresponding memory block to avoid the kernel freeing it via the ++ * kmemleak_free() callback. This is less heavyweight than holding a global ++ * lock like kmemleak_lock during scanning. + * - scan_mutex (mutex): ensures that only one thread may scan the memory for + * unreferenced objects at a time. The gray_list contains the objects which + * are already referenced or marked as false positives and need to be +@@ -135,7 +135,7 @@ struct kmemleak_scan_area { + * (use_count) and freed using the RCU mechanism. 
+ */ + struct kmemleak_object { +- spinlock_t lock; ++ raw_spinlock_t lock; + unsigned int flags; /* object status flags */ + struct list_head object_list; + struct list_head gray_list; +@@ -191,8 +191,8 @@ static int mem_pool_free_count = ARRAY_S + static LIST_HEAD(mem_pool_free_list); + /* search tree for object boundaries */ + static struct rb_root object_tree_root = RB_ROOT; +-/* rw_lock protecting the access to object_list and object_tree_root */ +-static DEFINE_RWLOCK(kmemleak_lock); ++/* protecting the access to object_list and object_tree_root */ ++static DEFINE_RAW_SPINLOCK(kmemleak_lock); + + /* allocation caches for kmemleak internal data */ + static struct kmem_cache *object_cache; +@@ -426,7 +426,7 @@ static struct kmemleak_object *mem_pool_ + } + + /* slab allocation failed, try the memory pool */ +- write_lock_irqsave(&kmemleak_lock, flags); ++ raw_spin_lock_irqsave(&kmemleak_lock, flags); + object = list_first_entry_or_null(&mem_pool_free_list, + typeof(*object), object_list); + if (object) +@@ -435,7 +435,7 @@ static struct kmemleak_object *mem_pool_ + object = &mem_pool[--mem_pool_free_count]; + else + pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); +- write_unlock_irqrestore(&kmemleak_lock, flags); ++ raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + + return object; + } +@@ -453,9 +453,9 @@ static void mem_pool_free(struct kmemlea + } + + /* add the object to the memory pool free list */ +- write_lock_irqsave(&kmemleak_lock, flags); ++ raw_spin_lock_irqsave(&kmemleak_lock, flags); + list_add(&object->object_list, &mem_pool_free_list); +- write_unlock_irqrestore(&kmemleak_lock, flags); ++ raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + } + + /* +@@ -514,9 +514,9 @@ static struct kmemleak_object *find_and_ + struct kmemleak_object *object; + + rcu_read_lock(); +- read_lock_irqsave(&kmemleak_lock, flags); ++ raw_spin_lock_irqsave(&kmemleak_lock, flags); + object = lookup_object(ptr, alias); +- read_unlock_irqrestore(&kmemleak_lock, flags); ++ raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + + /* check whether the object is still available */ + if (object && !get_object(object)) +@@ -546,11 +546,11 @@ static struct kmemleak_object *find_and_ + unsigned long flags; + struct kmemleak_object *object; + +- write_lock_irqsave(&kmemleak_lock, flags); ++ raw_spin_lock_irqsave(&kmemleak_lock, flags); + object = lookup_object(ptr, alias); + if (object) + __remove_object(object); +- write_unlock_irqrestore(&kmemleak_lock, flags); ++ raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + + return object; + } +@@ -585,7 +585,7 @@ static struct kmemleak_object *create_ob + INIT_LIST_HEAD(&object->object_list); + INIT_LIST_HEAD(&object->gray_list); + INIT_HLIST_HEAD(&object->area_list); +- spin_lock_init(&object->lock); ++ raw_spin_lock_init(&object->lock); + atomic_set(&object->use_count, 1); + object->flags = OBJECT_ALLOCATED; + object->pointer = ptr; +@@ -617,7 +617,7 @@ static struct kmemleak_object *create_ob + /* kernel backtrace */ + object->trace_len = __save_stack_trace(object->trace); + +- write_lock_irqsave(&kmemleak_lock, flags); ++ raw_spin_lock_irqsave(&kmemleak_lock, flags); + + untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); + min_addr = min(min_addr, untagged_ptr); +@@ -649,7 +649,7 @@ static struct kmemleak_object *create_ob + + list_add_tail_rcu(&object->object_list, &object_list); + out: +- write_unlock_irqrestore(&kmemleak_lock, flags); ++ raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + return 
object; + } + +@@ -667,9 +667,9 @@ static void __delete_object(struct kmeml + * Locking here also ensures that the corresponding memory block + * cannot be freed when it is being scanned. + */ +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + object->flags &= ~OBJECT_ALLOCATED; +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + put_object(object); + } + +@@ -739,9 +739,9 @@ static void paint_it(struct kmemleak_obj + { + unsigned long flags; + +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + __paint_it(object, color); +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + } + + static void paint_ptr(unsigned long ptr, int color) +@@ -798,7 +798,7 @@ static void add_scan_area(unsigned long + if (scan_area_cache) + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); + +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + if (!area) { + pr_warn_once("Cannot allocate a scan area, scanning the full object\n"); + /* mark the object for full scan to avoid false positives */ +@@ -820,7 +820,7 @@ static void add_scan_area(unsigned long + + hlist_add_head(&area->node, &object->area_list); + out_unlock: +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + put_object(object); + } + +@@ -842,9 +842,9 @@ static void object_set_excess_ref(unsign + return; + } + +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + object->excess_ref = excess_ref; +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + put_object(object); + } + +@@ -864,9 +864,9 @@ static void object_no_scan(unsigned long + return; + } + +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + object->flags |= OBJECT_NO_SCAN; +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + put_object(object); + } + +@@ -1026,9 +1026,9 @@ void __ref kmemleak_update_trace(const v + return; + } + +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + object->trace_len = __save_stack_trace(object->trace); +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + } +@@ -1233,7 +1233,7 @@ static void scan_block(void *_start, voi + unsigned long flags; + unsigned long untagged_ptr; + +- read_lock_irqsave(&kmemleak_lock, flags); ++ raw_spin_lock_irqsave(&kmemleak_lock, flags); + for (ptr = start; ptr < end; ptr++) { + struct kmemleak_object *object; + unsigned long pointer; +@@ -1268,7 +1268,7 @@ static void scan_block(void *_start, voi + * previously acquired in scan_object(). These locks are + * enclosed by scan_mutex. 
+ */ +- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); ++ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + /* only pass surplus references (object already gray) */ + if (color_gray(object)) { + excess_ref = object->excess_ref; +@@ -1277,7 +1277,7 @@ static void scan_block(void *_start, voi + excess_ref = 0; + update_refs(object); + } +- spin_unlock(&object->lock); ++ raw_spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); +@@ -1286,12 +1286,12 @@ static void scan_block(void *_start, voi + if (object == scanned) + /* circular reference, ignore */ + continue; +- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); ++ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); +- spin_unlock(&object->lock); ++ raw_spin_unlock(&object->lock); + } + } +- read_unlock_irqrestore(&kmemleak_lock, flags); ++ raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + } + + /* +@@ -1324,7 +1324,7 @@ static void scan_object(struct kmemleak_ + * Once the object->lock is acquired, the corresponding memory block + * cannot be freed (the same lock is acquired in delete_object). + */ +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + if (object->flags & OBJECT_NO_SCAN) + goto out; + if (!(object->flags & OBJECT_ALLOCATED)) +@@ -1344,9 +1344,9 @@ static void scan_object(struct kmemleak_ + if (start >= end) + break; + +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + cond_resched(); +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + } while (object->flags & OBJECT_ALLOCATED); + } else + hlist_for_each_entry(area, &object->area_list, node) +@@ -1354,7 +1354,7 @@ static void scan_object(struct kmemleak_ + (void *)(area->start + area->size), + object); + out: +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + } + + /* +@@ -1407,7 +1407,7 @@ static void kmemleak_scan(void) + /* prepare the kmemleak_object's */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + #ifdef DEBUG + /* + * With a few exceptions there should be a maximum of +@@ -1424,7 +1424,7 @@ static void kmemleak_scan(void) + if (color_gray(object) && get_object(object)) + list_add_tail(&object->gray_list, &gray_list); + +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + +@@ -1492,14 +1492,14 @@ static void kmemleak_scan(void) + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + if (color_white(object) && (object->flags & OBJECT_ALLOCATED) + && update_checksum(object) && get_object(object)) { + /* color it gray temporarily */ + object->count = object->min_count; + list_add_tail(&object->gray_list, &gray_list); + } +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + +@@ -1519,7 +1519,7 @@ static void kmemleak_scan(void) + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + if (unreferenced_object(object) && + !(object->flags & OBJECT_REPORTED)) { + object->flags 
|= OBJECT_REPORTED; +@@ -1529,7 +1529,7 @@ static void kmemleak_scan(void) + + new_leaks++; + } +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + +@@ -1681,10 +1681,10 @@ static int kmemleak_seq_show(struct seq_ + struct kmemleak_object *object = v; + unsigned long flags; + +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) + print_unreferenced(seq, object); +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + return 0; + } + +@@ -1714,9 +1714,9 @@ static int dump_str_object_info(const ch + return -EINVAL; + } + +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + return 0; +@@ -1735,11 +1735,11 @@ static void kmemleak_clear(void) + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { +- spin_lock_irqsave(&object->lock, flags); ++ raw_spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) + __paint_it(object, KMEMLEAK_GREY); +- spin_unlock_irqrestore(&object->lock, flags); ++ raw_spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + diff --git a/kernel/patches-5.4.x-rt/0067-smp-Use-smp_cond_func_t-as-type-for-the-conditional-.patch b/kernel/patches-5.4.x-rt/0067-smp-Use-smp_cond_func_t-as-type-for-the-conditional-.patch new file mode 100644 index 000000000..293d2901d --- /dev/null +++ b/kernel/patches-5.4.x-rt/0067-smp-Use-smp_cond_func_t-as-type-for-the-conditional-.patch @@ -0,0 +1,99 @@ +From: Sebastian Andrzej Siewior +Date: Thu, 16 Jan 2020 12:00:31 +0100 +Subject: [PATCH] smp: Use smp_cond_func_t as type for the conditional + function + +Use a typdef for the conditional function instead defining it each time in +the function prototype. + +Signed-off-by: Sebastian Andrzej Siewior +--- + include/linux/smp.h | 14 +++++++------- + kernel/smp.c | 11 +++++------ + kernel/up.c | 11 +++++------ + 3 files changed, 17 insertions(+), 19 deletions(-) + +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -15,6 +15,7 @@ + #include + + typedef void (*smp_call_func_t)(void *info); ++typedef bool (*smp_cond_func_t)(int cpu, void *info); + struct __call_single_data { + struct llist_node llist; + smp_call_func_t func; +@@ -49,13 +50,12 @@ void on_each_cpu_mask(const struct cpuma + * cond_func returns a positive value. This may include the local + * processor. 
+ */ +-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), +- smp_call_func_t func, void *info, bool wait, +- gfp_t gfp_flags); +- +-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), +- smp_call_func_t func, void *info, bool wait, +- gfp_t gfp_flags, const struct cpumask *mask); ++void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, ++ void *info, bool wait, gfp_t gfp_flags); ++ ++void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, ++ void *info, bool wait, gfp_t gfp_flags, ++ const struct cpumask *mask); + + int smp_call_function_single_async(int cpu, call_single_data_t *csd); + +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -680,9 +680,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), +- smp_call_func_t func, void *info, bool wait, +- gfp_t gfp_flags, const struct cpumask *mask) ++void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, ++ void *info, bool wait, gfp_t gfp_flags, ++ const struct cpumask *mask) + { + cpumask_var_t cpus; + int cpu, ret; +@@ -714,9 +714,8 @@ void on_each_cpu_cond_mask(bool (*cond_f + } + EXPORT_SYMBOL(on_each_cpu_cond_mask); + +-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), +- smp_call_func_t func, void *info, bool wait, +- gfp_t gfp_flags) ++void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, ++ void *info, bool wait, gfp_t gfp_flags) + { + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, + cpu_online_mask); +--- a/kernel/up.c ++++ b/kernel/up.c +@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask); + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. + */ +-void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info), +- smp_call_func_t func, void *info, bool wait, +- gfp_t gfp_flags, const struct cpumask *mask) ++void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, ++ void *info, bool wait, gfp_t gfp_flags, ++ const struct cpumask *mask) + { + unsigned long flags; + +@@ -84,9 +84,8 @@ void on_each_cpu_cond_mask(bool (*cond_f + } + EXPORT_SYMBOL(on_each_cpu_cond_mask); + +-void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), +- smp_call_func_t func, void *info, bool wait, +- gfp_t gfp_flags) ++void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, ++ void *info, bool wait, gfp_t gfp_flags) + { + on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); + } diff --git a/kernel/patches-5.4.x-rt/0068-smp-Add-a-smp_cond_func_t-argument-to-smp_call_funct.patch b/kernel/patches-5.4.x-rt/0068-smp-Add-a-smp_cond_func_t-argument-to-smp_call_funct.patch new file mode 100644 index 000000000..0dc396cf4 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0068-smp-Add-a-smp_cond_func_t-argument-to-smp_call_funct.patch @@ -0,0 +1,139 @@ +From: Sebastian Andrzej Siewior +Date: Thu, 16 Jan 2020 12:14:38 +0100 +Subject: [PATCH] smp: Add a smp_cond_func_t argument to + smp_call_function_many() + +on_each_cpu_cond_mask() allocates a new CPU mask. The newly allocated +mask is a subset of the provided mask based on the conditional function. +This memory allocation could be avoided by extending +smp_call_function_many() with the conditional function and performing the +remote function call based on the mask and the conditional function. 
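As a rough illustration of the typedef pattern the smp patch above introduces (a standalone, userspace-style sketch rather than kernel code; cond_func_t, cpu_is_even and run_on_matching are invented names for this example only):

/*
 * Sketch: give the conditional callback a named function-pointer type
 * instead of repeating the full prototype in every declaration.
 */
#include <stdbool.h>
#include <stdio.h>

typedef bool (*cond_func_t)(int cpu, void *info);

/* Example predicate: only "CPUs" with an even id pass. */
static bool cpu_is_even(int cpu, void *info)
{
	(void)info;
	return (cpu % 2) == 0;
}

/* Callers can now accept cond_func_t instead of the spelled-out prototype. */
static void run_on_matching(cond_func_t cond, int ncpus, void *info)
{
	for (int cpu = 0; cpu < ncpus; cpu++)
		if (cond(cpu, info))
			printf("would send work to cpu %d\n", cpu);
}

int main(void)
{
	run_on_matching(cpu_is_even, 4, NULL);
	return 0;
}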
+ +Rename smp_call_function_many() to smp_call_function_many_cond() and add +the smp_cond_func_t argument. If smp_cond_func_t is provided then it is +used before invoking the function. +Provide smp_call_function_many() with cond_func set to NULL. +Let on_each_cpu_cond_mask() use smp_call_function_many_cond(). + +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/smp.c | 77 +++++++++++++++++++++++++++-------------------------------- + 1 file changed, 36 insertions(+), 41 deletions(-) + +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -395,22 +395,9 @@ int smp_call_function_any(const struct c + } + EXPORT_SYMBOL_GPL(smp_call_function_any); + +-/** +- * smp_call_function_many(): Run a function on a set of other CPUs. +- * @mask: The set of cpus to run on (only runs on online subset). +- * @func: The function to run. This must be fast and non-blocking. +- * @info: An arbitrary pointer to pass to the function. +- * @wait: If true, wait (atomically) until function has completed +- * on other CPUs. +- * +- * If @wait is true, then returns once @func has returned. +- * +- * You must not call this function with disabled interrupts or from a +- * hardware interrupt handler or from a bottom half handler. Preemption +- * must be disabled when calling this function. +- */ +-void smp_call_function_many(const struct cpumask *mask, +- smp_call_func_t func, void *info, bool wait) ++static void smp_call_function_many_cond(const struct cpumask *mask, ++ smp_call_func_t func, void *info, ++ bool wait, smp_cond_func_t cond_func) + { + struct call_function_data *cfd; + int cpu, next_cpu, this_cpu = smp_processor_id(); +@@ -448,7 +435,8 @@ void smp_call_function_many(const struct + + /* Fastpath: do that cpu by itself. */ + if (next_cpu >= nr_cpu_ids) { +- smp_call_function_single(cpu, func, info, wait); ++ if (!cond_func || cond_func(cpu, info)) ++ smp_call_function_single(cpu, func, info, wait); + return; + } + +@@ -465,6 +453,9 @@ void smp_call_function_many(const struct + for_each_cpu(cpu, cfd->cpumask) { + call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); + ++ if (cond_func && !cond_func(cpu, info)) ++ continue; ++ + csd_lock(csd); + if (wait) + csd->flags |= CSD_FLAG_SYNCHRONOUS; +@@ -486,6 +477,26 @@ void smp_call_function_many(const struct + } + } + } ++ ++/** ++ * smp_call_function_many(): Run a function on a set of other CPUs. ++ * @mask: The set of cpus to run on (only runs on online subset). ++ * @func: The function to run. This must be fast and non-blocking. ++ * @info: An arbitrary pointer to pass to the function. ++ * @wait: If true, wait (atomically) until function has completed ++ * on other CPUs. ++ * ++ * If @wait is true, then returns once @func has returned. ++ * ++ * You must not call this function with disabled interrupts or from a ++ * hardware interrupt handler or from a bottom half handler. Preemption ++ * must be disabled when calling this function. 
++ */ ++void smp_call_function_many(const struct cpumask *mask, ++ smp_call_func_t func, void *info, bool wait) ++{ ++ smp_call_function_many_cond(mask, func, info, wait, NULL); ++} + EXPORT_SYMBOL(smp_call_function_many); + + /** +@@ -684,33 +695,17 @@ void on_each_cpu_cond_mask(smp_cond_func + void *info, bool wait, gfp_t gfp_flags, + const struct cpumask *mask) + { +- cpumask_var_t cpus; +- int cpu, ret; ++ int cpu = get_cpu(); + +- might_sleep_if(gfpflags_allow_blocking(gfp_flags)); ++ smp_call_function_many_cond(mask, func, info, wait, cond_func); ++ if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) { ++ unsigned long flags; + +- if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { +- preempt_disable(); +- for_each_cpu(cpu, mask) +- if (cond_func(cpu, info)) +- __cpumask_set_cpu(cpu, cpus); +- on_each_cpu_mask(cpus, func, info, wait); +- preempt_enable(); +- free_cpumask_var(cpus); +- } else { +- /* +- * No free cpumask, bother. No matter, we'll +- * just have to IPI them one by one. +- */ +- preempt_disable(); +- for_each_cpu(cpu, mask) +- if (cond_func(cpu, info)) { +- ret = smp_call_function_single(cpu, func, +- info, wait); +- WARN_ON_ONCE(ret); +- } +- preempt_enable(); ++ local_irq_save(flags); ++ func(info); ++ local_irq_restore(flags); + } ++ put_cpu(); + } + EXPORT_SYMBOL(on_each_cpu_cond_mask); + diff --git a/kernel/patches-5.4.x-rt/0069-smp-Remove-allocation-mask-from-on_each_cpu_cond.patch b/kernel/patches-5.4.x-rt/0069-smp-Remove-allocation-mask-from-on_each_cpu_cond.patch new file mode 100644 index 000000000..1a08853dd --- /dev/null +++ b/kernel/patches-5.4.x-rt/0069-smp-Remove-allocation-mask-from-on_each_cpu_cond.patch @@ -0,0 +1,127 @@ +From: Sebastian Andrzej Siewior +Date: Thu, 16 Jan 2020 13:13:41 +0100 +Subject: [PATCH] smp: Remove allocation mask from on_each_cpu_cond.*() + +The allocation mask is no longer used by on_each_cpu_cond() and +on_each_cpu_cond_mask() and ca be removed. + +Signed-off-by: Sebastian Andrzej Siewior +--- + arch/x86/mm/tlb.c | 2 +- + fs/buffer.c | 2 +- + include/linux/smp.h | 5 ++--- + kernel/smp.c | 13 +++---------- + kernel/up.c | 7 +++---- + mm/slub.c | 2 +- + 6 files changed, 11 insertions(+), 20 deletions(-) + +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -708,7 +708,7 @@ void native_flush_tlb_others(const struc + (void *)info, 1); + else + on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote, +- (void *)info, 1, GFP_ATOMIC, cpumask); ++ (void *)info, 1, cpumask); + } + + /* +--- a/fs/buffer.c ++++ b/fs/buffer.c +@@ -1387,7 +1387,7 @@ static bool has_bh_in_lru(int cpu, void + + void invalidate_bh_lrus(void) + { +- on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL); ++ on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1); + } + EXPORT_SYMBOL_GPL(invalidate_bh_lrus); + +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -51,11 +51,10 @@ void on_each_cpu_mask(const struct cpuma + * processor. 
+ */ + void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, +- void *info, bool wait, gfp_t gfp_flags); ++ void *info, bool wait); + + void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, +- void *info, bool wait, gfp_t gfp_flags, +- const struct cpumask *mask); ++ void *info, bool wait, const struct cpumask *mask); + + int smp_call_function_single_async(int cpu, call_single_data_t *csd); + +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -679,11 +679,6 @@ EXPORT_SYMBOL(on_each_cpu_mask); + * @info: An arbitrary pointer to pass to both functions. + * @wait: If true, wait (atomically) until function has + * completed on other CPUs. +- * @gfp_flags: GFP flags to use when allocating the cpumask +- * used internally by the function. +- * +- * The function might sleep if the GFP flags indicates a non +- * atomic allocation is allowed. + * + * Preemption is disabled to protect against CPUs going offline but not online. + * CPUs going online during the call will not be seen or sent an IPI. +@@ -692,8 +687,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); + * from a hardware interrupt handler or from a bottom half handler. + */ + void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, +- void *info, bool wait, gfp_t gfp_flags, +- const struct cpumask *mask) ++ void *info, bool wait, const struct cpumask *mask) + { + int cpu = get_cpu(); + +@@ -710,10 +704,9 @@ void on_each_cpu_cond_mask(smp_cond_func + EXPORT_SYMBOL(on_each_cpu_cond_mask); + + void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, +- void *info, bool wait, gfp_t gfp_flags) ++ void *info, bool wait) + { +- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, +- cpu_online_mask); ++ on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); + } + EXPORT_SYMBOL(on_each_cpu_cond); + +--- a/kernel/up.c ++++ b/kernel/up.c +@@ -69,8 +69,7 @@ EXPORT_SYMBOL(on_each_cpu_mask); + * same condtions in UP and SMP. + */ + void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, +- void *info, bool wait, gfp_t gfp_flags, +- const struct cpumask *mask) ++ void *info, bool wait, const struct cpumask *mask) + { + unsigned long flags; + +@@ -85,9 +84,9 @@ void on_each_cpu_cond_mask(smp_cond_func + EXPORT_SYMBOL(on_each_cpu_cond_mask); + + void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, +- void *info, bool wait, gfp_t gfp_flags) ++ void *info, bool wait) + { +- on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL); ++ on_each_cpu_cond_mask(cond_func, func, info, wait, NULL); + } + EXPORT_SYMBOL(on_each_cpu_cond); + +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -2338,7 +2338,7 @@ static bool has_cpu_slab(int cpu, void * + + static void flush_all(struct kmem_cache *s) + { +- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); ++ on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); + } + + /* diff --git a/kernel/patches-5.4.x-rt/0070-drm-vmwgfx-Drop-preempt_disable-in-vmw_fifo_ping_hos.patch b/kernel/patches-5.4.x-rt/0070-drm-vmwgfx-Drop-preempt_disable-in-vmw_fifo_ping_hos.patch new file mode 100644 index 000000000..089cd86b9 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0070-drm-vmwgfx-Drop-preempt_disable-in-vmw_fifo_ping_hos.patch @@ -0,0 +1,35 @@ +From: Sebastian Andrzej Siewior +Date: Fri, 21 Feb 2020 18:57:11 +0100 +Subject: [PATCH] drm/vmwgfx: Drop preempt_disable() in + vmw_fifo_ping_host() + +vmw_fifo_ping_host() disables preemption around a test and a register +write via vmw_write(). 
The write function acquires a spinlock_t typed +lock which is not allowed in a preempt_disable()ed section on +PREEMPT_RT. This has been reported in the bugzilla. + +It has been explained by Thomas Hellstrom that this preempt_disable()ed +section is not required for correctness. + +Remove the preempt_disable() section. + +Link: https://bugzilla.kernel.org/show_bug.cgi?id=206591 +Link: https://lkml.kernel.org/r/0b5e1c65d89951de993deab06d1d197b40fd67aa.camel@vmware.com +Signed-off-by: Sebastian Andrzej Siewior +--- + drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -169,10 +169,8 @@ void vmw_fifo_ping_host(struct vmw_priva + { + u32 *fifo_mem = dev_priv->mmio_virt; + +- preempt_disable(); + if (cmpxchg(fifo_mem + SVGA_FIFO_BUSY, 0, 1) == 0) + vmw_write(dev_priv, SVGA_REG_SYNC, reason); +- preempt_enable(); + } + + void vmw_fifo_release(struct vmw_private *dev_priv, struct vmw_fifo_state *fifo) diff --git a/kernel/patches-5.4.x-rt/0071-mm-compaction-Really-limit-compact_unevictable_allow.patch b/kernel/patches-5.4.x-rt/0071-mm-compaction-Really-limit-compact_unevictable_allow.patch new file mode 100644 index 000000000..b34687eac --- /dev/null +++ b/kernel/patches-5.4.x-rt/0071-mm-compaction-Really-limit-compact_unevictable_allow.patch @@ -0,0 +1,32 @@ +From: Sebastian Andrzej Siewior +Date: Tue, 3 Mar 2020 13:43:25 +0100 +Subject: [PATCH] =?UTF-8?q?mm/compaction:=20Really=20limit=20compact=5Fune?= + =?UTF-8?q?victable=5Fallowed=20to=200=E2=80=A61?= +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The proc file `compact_unevictable_allowed' should allow 0 and 1 only, +the `extra*' attribues have been set properly but without +proc_dointvec_minmax() as the `proc_handler' the limit will not be +enforced. + +Use proc_dointvec_minmax() as the `proc_handler' to enfoce the valid +specified range. + +Signed-off-by: Sebastian Andrzej Siewior +--- + kernel/sysctl.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -1493,7 +1493,7 @@ static struct ctl_table vm_table[] = { + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, +- .proc_handler = proc_dointvec, ++ .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, diff --git a/kernel/patches-5.4.x-rt/0072-mm-compaction-Disable-compact_unevictable_allowed-on.patch b/kernel/patches-5.4.x-rt/0072-mm-compaction-Disable-compact_unevictable_allowed-on.patch new file mode 100644 index 000000000..838166248 --- /dev/null +++ b/kernel/patches-5.4.x-rt/0072-mm-compaction-Disable-compact_unevictable_allowed-on.patch @@ -0,0 +1,102 @@ +From: Sebastian Andrzej Siewior +Date: Fri, 8 Nov 2019 12:55:47 +0100 +Subject: [PATCH] mm/compaction: Disable compact_unevictable_allowed on RT + +Since commit + 5bbe3547aa3ba ("mm: allow compaction of unevictable pages") + +it is allowed to examine mlocked pages and compact them by default. +On -RT even minor pagefaults are problematic because it may take a few +100us to resolve them and until then the task is blocked. + +Make compact_unevictable_allowed = 0 default and issue a warning on RT +if it is changed. 
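A standalone sketch (not kernel code) of the combined behaviour the two compaction patches describe: the tunable accepts only 0 or 1, defaults to 0 on PREEMPT_RT, and warns once if it is changed there. PREEMPT_RT below is a stand-in macro and set_compact_unevictable() is an invented helper:

#include <stdio.h>

#define PREEMPT_RT 1	/* stand-in: pretend this is an RT build */

#if PREEMPT_RT
static int compact_unevictable_allowed = 0;	/* RT default: avoid minor faults */
#else
static int compact_unevictable_allowed = 1;
#endif

static int set_compact_unevictable(int val)
{
	static int warned;

	if (val < 0 || val > 1)
		return -1;		/* enforce the 0..1 range */
	if (PREEMPT_RT && val != compact_unevictable_allowed && !warned) {
		fprintf(stderr, "warning: changing an RT-sensitive tunable\n");
		warned = 1;
	}
	compact_unevictable_allowed = val;
	return 0;
}

int main(void)
{
	printf("default: %d\n", compact_unevictable_allowed);
	set_compact_unevictable(2);	/* out of range, rejected */
	set_compact_unevictable(1);	/* accepted, warns once on RT */
	printf("now: %d\n", compact_unevictable_allowed);
	return 0;
}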
+ +Link: https://lore.kernel.org/linux-mm/20190710144138.qyn4tuttdq6h7kqx@linutronix.de/ +Acked-by: Mel Gorman +Acked-by: Vlastimil Babka +Signed-off-by: Sebastian Andrzej Siewior +--- + Documentation/admin-guide/sysctl/vm.rst | 3 +++ + kernel/sysctl.c | 29 ++++++++++++++++++++++++++++- + mm/compaction.c | 4 ++++ + 3 files changed, 35 insertions(+), 1 deletion(-) + +--- a/Documentation/admin-guide/sysctl/vm.rst ++++ b/Documentation/admin-guide/sysctl/vm.rst +@@ -128,6 +128,9 @@ allowed to examine the unevictable lru ( + This should be used on systems where stalls for minor page faults are an + acceptable trade for large contiguous free memory. Set to 0 to prevent + compaction from moving pages that are unevictable. Default value is 1. ++On CONFIG_PREEMPT_RT the default value is 0 in order to avoid a page fault, due ++to compaction, which would block the task from becomming active until the fault ++is resolved. + + + dirty_background_bytes +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -212,6 +212,11 @@ static int proc_do_cad_pid(struct ctl_ta + void __user *buffer, size_t *lenp, loff_t *ppos); + static int proc_taint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); ++#ifdef CONFIG_COMPACTION ++static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, ++ int write, void __user *buffer, ++ size_t *lenp, loff_t *ppos); ++#endif + #endif + + #ifdef CONFIG_PRINTK +@@ -1493,7 +1498,7 @@ static struct ctl_table vm_table[] = { + .data = &sysctl_compact_unevictable_allowed, + .maxlen = sizeof(int), + .mode = 0644, +- .proc_handler = proc_dointvec_minmax, ++ .proc_handler = proc_dointvec_minmax_warn_RT_change, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +@@ -2581,6 +2586,28 @@ int proc_dointvec(struct ctl_table *tabl + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); + } + ++#ifdef CONFIG_COMPACTION ++static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, ++ int write, void __user *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ int ret, old; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write) ++ return proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ ++ old = *(int *)table->data; ++ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ if (ret) ++ return ret; ++ if (old != *(int *)table->data) ++ pr_warn_once("sysctl attribute %s changed by %s[%d]\n", ++ table->procname, current->comm, ++ task_pid_nr(current)); ++ return ret; ++} ++#endif ++ + /** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -1590,7 +1590,11 @@ typedef enum { + * Allow userspace to control policy on scanning the unevictable LRU for + * compactable pages. + */ ++#ifdef CONFIG_PREEMPT_RT ++int sysctl_compact_unevictable_allowed __read_mostly = 0; ++#else + int sysctl_compact_unevictable_allowed __read_mostly = 1; ++#endif + + static inline void + update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) diff --git a/kernel/patches-5.4.x-rt/0073-Use-CONFIG_PREEMPTION.patch b/kernel/patches-5.4.x-rt/0073-Use-CONFIG_PREEMPTION.patch new file mode 100644 index 000000000..9226e5a3e --- /dev/null +++ b/kernel/patches-5.4.x-rt/0073-Use-CONFIG_PREEMPTION.patch @@ -0,0 +1,1521 @@ +From: Sebastian Andrzej Siewior +Date: Fri, 26 Jul 2019 11:30:49 +0200 +Subject: [PATCH] Use CONFIG_PREEMPTION + +Thisi is an all-in-one patch of the current `PREEMPTION' branch. 
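A minimal standalone sketch of what the rename means in practice, assuming the usual Kconfig relationship (CONFIG_PREEMPTION is selected by both CONFIG_PREEMPT and CONFIG_PREEMPT_RT); the defines below merely fake that Kconfig output for illustration:

#include <stdio.h>

/* Fake Kconfig output for an RT build: PREEMPT_RT selects PREEMPTION. */
#define CONFIG_PREEMPT_RT 1
#define CONFIG_PREEMPTION 1

int main(void)
{
#ifdef CONFIG_PREEMPTION
	/* Taken by CONFIG_PREEMPT=y and CONFIG_PREEMPT_RT=y builds alike. */
	puts("preemptible kernel: use the preempt-safe code path");
#else
	/* Only fully non-preemptible builds end up here. */
	puts("non-preemptible kernel: the cheap path is fine");
#endif
	return 0;
}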
+ +Signed-off-by: Sebastian Andrzej Siewior +--- + Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html | 8 - + Documentation/RCU/Design/Requirements/Requirements.html | 24 ++-- + Documentation/RCU/checklist.txt | 4 + Documentation/RCU/rcubarrier.txt | 8 - + Documentation/RCU/stallwarn.txt | 4 + Documentation/RCU/whatisRCU.txt | 7 - + Documentation/trace/ftrace-uses.rst | 2 + arch/arc/kernel/entry.S | 6 - + arch/arm/include/asm/switch_to.h | 2 + arch/arm/kernel/entry-armv.S | 4 + arch/arm/kernel/traps.c | 2 + arch/arm/mm/cache-v7.S | 4 + arch/arm/mm/cache-v7m.S | 4 + arch/arm64/Kconfig | 52 +++++----- + arch/arm64/crypto/sha256-glue.c | 2 + arch/arm64/include/asm/assembler.h | 6 - + arch/arm64/include/asm/preempt.h | 4 + arch/arm64/kernel/entry.S | 2 + arch/arm64/kernel/traps.c | 3 + arch/c6x/kernel/entry.S | 8 - + arch/csky/kernel/entry.S | 4 + arch/h8300/kernel/entry.S | 6 - + arch/hexagon/kernel/vm_entry.S | 6 - + arch/ia64/kernel/entry.S | 12 +- + arch/ia64/kernel/kprobes.c | 2 + arch/m68k/coldfire/entry.S | 2 + arch/microblaze/kernel/entry.S | 2 + arch/mips/include/asm/asmmacro.h | 4 + arch/mips/kernel/entry.S | 6 - + arch/nds32/Kconfig | 2 + arch/nds32/kernel/ex-exit.S | 4 + arch/nios2/kernel/entry.S | 2 + arch/parisc/Kconfig | 2 + arch/parisc/kernel/entry.S | 10 - + arch/powerpc/Kconfig | 2 + arch/powerpc/kernel/entry_32.S | 4 + arch/powerpc/kernel/entry_64.S | 4 + arch/powerpc/kernel/traps.c | 7 + + arch/riscv/kernel/entry.S | 4 + arch/s390/Kconfig | 2 + arch/s390/include/asm/preempt.h | 4 + arch/s390/kernel/dumpstack.c | 2 + arch/s390/kernel/entry.S | 2 + arch/sh/Kconfig | 2 + arch/sh/kernel/cpu/sh5/entry.S | 4 + arch/sh/kernel/entry-common.S | 4 + arch/sparc/Kconfig | 2 + arch/sparc/kernel/rtrap_64.S | 2 + arch/xtensa/kernel/entry.S | 2 + arch/xtensa/kernel/traps.c | 7 - + drivers/gpu/drm/Kconfig | 2 + drivers/media/platform/Kconfig | 2 + drivers/video/backlight/Kconfig | 4 + drivers/xen/preempt.c | 4 + fs/btrfs/volumes.h | 2 + fs/stack.c | 6 - + include/linux/fs.h | 4 + include/linux/genhd.h | 6 - + include/linux/rcupdate.h | 4 + include/xen/xen-ops.h | 4 + kernel/Kconfig.locks | 12 +- + kernel/rcu/Kconfig | 4 + kernel/rcu/rcutorture.c | 2 + kernel/rcu/srcutiny.c | 2 + kernel/rcu/tree.c | 4 + kernel/rcu/tree_exp.h | 2 + kernel/rcu/tree_plugin.h | 4 + kernel/trace/trace.c | 2 + kernel/workqueue.c | 2 + lib/Kconfig.debug | 2 + mm/memory.c | 2 + mm/slub.c | 12 +- + net/core/dev.c | 2 + 73 files changed, 191 insertions(+), 173 deletions(-) + +--- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html ++++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html +@@ -56,8 +56,8 @@ sections. + RCU-preempt Expedited Grace Periods + +

+-CONFIG_PREEMPT=y kernels implement RCU-preempt. +-The overall flow of the handling of a given CPU by an RCU-preempt ++CONFIG_PREEMPT=y and CONFIG_PREEMPT_RT=y kernels implement ++RCU-preempt. The overall flow of the handling of a given CPU by an RCU-preempt + expedited grace period is shown in the following diagram: + +

ExpRCUFlow.svg +@@ -140,8 +140,8 @@ or offline, among other things. + RCU-sched Expedited Grace Periods + +

+-CONFIG_PREEMPT=n kernels implement RCU-sched. +-The overall flow of the handling of a given CPU by an RCU-sched ++CONFIG_PREEMPT=n and CONFIG_PREEMPT_RT=n kernels implement ++RCU-sched. The overall flow of the handling of a given CPU by an RCU-sched + expedited grace period is shown in the following diagram: + +

ExpSchedFlow.svg +--- a/Documentation/RCU/Design/Requirements/Requirements.html ++++ b/Documentation/RCU/Design/Requirements/Requirements.html +@@ -106,7 +106,7 @@ big RCU read-side critical section. + Production-quality implementations of rcu_read_lock() and + rcu_read_unlock() are extremely lightweight, and in + fact have exactly zero overhead in Linux kernels built for production +-use with CONFIG_PREEMPT=n. ++use with CONFIG_PREEMPTION=n. + +

+ This guarantee allows ordering to be enforced with extremely low +@@ -1499,7 +1499,7 @@ costs have plummeted. + However, as I learned from Matt Mackall's + bloatwatch + efforts, memory footprint is critically important on single-CPU systems with +-non-preemptible (CONFIG_PREEMPT=n) kernels, and thus ++non-preemptible (CONFIG_PREEMPTION=n) kernels, and thus + tiny RCU + was born. + Josh Triplett has since taken over the small-memory banner with his +@@ -1887,7 +1887,7 @@ constructs, there are limitations. +

+ Implementations of RCU for which rcu_read_lock() + and rcu_read_unlock() generate no code, such as +-Linux-kernel RCU when CONFIG_PREEMPT=n, can be ++Linux-kernel RCU when CONFIG_PREEMPTION=n, can be + nested arbitrarily deeply. + After all, there is no overhead. + Except that if all these instances of rcu_read_lock() +@@ -2229,7 +2229,7 @@ be a no-op. +

+ However, once the scheduler has spawned its first kthread, this early + boot trick fails for synchronize_rcu() (as well as for +-synchronize_rcu_expedited()) in CONFIG_PREEMPT=y ++synchronize_rcu_expedited()) in CONFIG_PREEMPTION=y + kernels. + The reason is that an RCU read-side critical section might be preempted, + which means that a subsequent synchronize_rcu() really does have +@@ -2568,7 +2568,7 @@ The compiler must not be permitted to tr + +

+ If the compiler did make this transformation in a +-CONFIG_PREEMPT=n kernel build, and if get_user() did ++CONFIG_PREEMPTION=n kernel build, and if get_user() did + page fault, the result would be a quiescent state in the middle + of an RCU read-side critical section. + This misplaced quiescent state could result in line 4 being +@@ -2906,7 +2906,7 @@ in conjunction with the + The real-time-latency response requirements are such that the + traditional approach of disabling preemption across RCU + read-side critical sections is inappropriate. +-Kernels built with CONFIG_PREEMPT=y therefore ++Kernels built with CONFIG_PREEMPTION=y therefore + use an RCU implementation that allows RCU read-side critical + sections to be preempted. + This requirement made its presence known after users made it +@@ -3064,7 +3064,7 @@ includes + rcu_barrier_bh(), and + rcu_read_lock_bh_held(). + However, the update-side APIs are now simple wrappers for other RCU +-flavors, namely RCU-sched in CONFIG_PREEMPT=n kernels and RCU-preempt ++flavors, namely RCU-sched in CONFIG_PREEMPTION=n kernels and RCU-preempt + otherwise. + +

Sched Flavor (Historical)

+@@ -3088,12 +3088,12 @@ of an RCU read-side critical section can + Therefore, RCU-sched was created, which follows “classic” + RCU in that an RCU-sched grace period waits for for pre-existing + interrupt and NMI handlers. +-In kernels built with CONFIG_PREEMPT=n, the RCU and RCU-sched ++In kernels built with CONFIG_PREEMPTION=n, the RCU and RCU-sched + APIs have identical implementations, while kernels built with +-CONFIG_PREEMPT=y provide a separate implementation for each. ++CONFIG_PREEMPTION=y provide a separate implementation for each. + +

+-Note well that in CONFIG_PREEMPT=y kernels, ++Note well that in CONFIG_PREEMPTION=y kernels, + rcu_read_lock_sched() and rcu_read_unlock_sched() + disable and re-enable preemption, respectively. + This means that if there was a preemption attempt during the +@@ -3302,12 +3302,12 @@ The tasks-RCU API is quite compact, cons + call_rcu_tasks(), + synchronize_rcu_tasks(), and + rcu_barrier_tasks(). +-In CONFIG_PREEMPT=n kernels, trampolines cannot be preempted, ++In CONFIG_PREEMPTION=n kernels, trampolines cannot be preempted, + so these APIs map to + call_rcu(), + synchronize_rcu(), and + rcu_barrier(), respectively. +-In CONFIG_PREEMPT=y kernels, trampolines can be preempted, ++In CONFIG_PREEMPTION=y kernels, trampolines can be preempted, + and these three APIs are therefore implemented by separate functions + that check for voluntary context switches. + +--- a/Documentation/RCU/checklist.txt ++++ b/Documentation/RCU/checklist.txt +@@ -210,8 +210,8 @@ over a rather long period of time, but i + the rest of the system. + + 7. As of v4.20, a given kernel implements only one RCU flavor, +- which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y. +- If the updater uses call_rcu() or synchronize_rcu(), ++ which is RCU-sched for PREEMPTION=n and RCU-preempt for ++ PREEMPTION=y. If the updater uses call_rcu() or synchronize_rcu(), + then the corresponding readers my use rcu_read_lock() and + rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), + or any pair of primitives that disables and re-enables preemption, +--- a/Documentation/RCU/rcubarrier.txt ++++ b/Documentation/RCU/rcubarrier.txt +@@ -6,8 +6,8 @@ RCU (read-copy update) is a synchronizat + of as a replacement for read-writer locking (among other things), but with + very low-overhead readers that are immune to deadlock, priority inversion, + and unbounded latency. RCU read-side critical sections are delimited +-by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT +-kernels, generate no code whatsoever. ++by rcu_read_lock() and rcu_read_unlock(), which, in ++non-CONFIG_PREEMPTION kernels, generate no code whatsoever. + + This means that RCU writers are unaware of the presence of concurrent + readers, so that RCU updates to shared data must be undertaken quite +@@ -303,10 +303,10 @@ Answer: This cannot happen. The reason i + to smp_call_function() and further to smp_call_function_on_cpu(), + causing this latter to spin until the cross-CPU invocation of + rcu_barrier_func() has completed. This by itself would prevent +- a grace period from completing on non-CONFIG_PREEMPT kernels, ++ a grace period from completing on non-CONFIG_PREEMPTION kernels, + since each CPU must undergo a context switch (or other quiescent + state) before the grace period can complete. However, this is +- of no use in CONFIG_PREEMPT kernels. ++ of no use in CONFIG_PREEMPTION kernels. + + Therefore, on_each_cpu() disables preemption across its call + to smp_call_function() and also across the local call to +--- a/Documentation/RCU/stallwarn.txt ++++ b/Documentation/RCU/stallwarn.txt +@@ -20,7 +20,7 @@ o A CPU looping with preemption disabled + + o A CPU looping with bottom halves disabled. + +-o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel ++o For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel + without invoking schedule(). If the looping in the kernel is + really expected and desirable behavior, you might need to add + some calls to cond_resched(). 
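The stall-warning text above recommends adding cond_resched() calls to long kernel loops; as a loose userspace analogue only (sched_yield() stands in for cond_resched(), which is not available outside the kernel):

#include <sched.h>
#include <stdio.h>

int main(void)
{
	unsigned long sum = 0;

	for (unsigned long i = 0; i < 100000000UL; i++) {
		sum += i;
		if ((i & 0xfffff) == 0)	/* roughly every 1M iterations */
			sched_yield();	/* kernel loops would use cond_resched() */
	}
	printf("sum = %lu\n", sum);
	return 0;
}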
+@@ -39,7 +39,7 @@ o Anything that prevents RCU's grace-per + result in the "rcu_.*kthread starved for" console-log message, + which will include additional debugging information. + +-o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might ++o A CPU-bound real-time task in a CONFIG_PREEMPTION kernel, which might + happen to preempt a low-priority task in the middle of an RCU + read-side critical section. This is especially damaging if + that low-priority task is not permitted to run on any other CPU, +--- a/Documentation/RCU/whatisRCU.txt ++++ b/Documentation/RCU/whatisRCU.txt +@@ -648,9 +648,10 @@ Quick Quiz #1: Why is this argument naiv + + This section presents a "toy" RCU implementation that is based on + "classic RCU". It is also short on performance (but only for updates) and +-on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT +-kernels. The definitions of rcu_dereference() and rcu_assign_pointer() +-are the same as those shown in the preceding section, so they are omitted. ++on features such as hotplug CPU and the ability to run in ++CONFIG_PREEMPTION kernels. The definitions of rcu_dereference() and ++rcu_assign_pointer() are the same as those shown in the preceding ++section, so they are omitted. + + void rcu_read_lock(void) { } + +--- a/Documentation/trace/ftrace-uses.rst ++++ b/Documentation/trace/ftrace-uses.rst +@@ -146,7 +146,7 @@ FTRACE_OPS_FL_RECURSION_SAFE + itself or any nested functions that those functions call. + + If this flag is set, it is possible that the callback will also +- be called with preemption enabled (when CONFIG_PREEMPT is set), ++ be called with preemption enabled (when CONFIG_PREEMPTION is set), + but this is not guaranteed. + + FTRACE_OPS_FL_IPMODIFY +--- a/arch/arc/kernel/entry.S ++++ b/arch/arc/kernel/entry.S +@@ -337,11 +337,11 @@ ENTRY(ret_from_exception) + resume_kernel_mode: + + ; Disable Interrupts from this point on +- ; CONFIG_PREEMPT: This is a must for preempt_schedule_irq() +- ; !CONFIG_PREEMPT: To ensure restore_regs is intr safe ++ ; CONFIG_PREEMPTION: This is a must for preempt_schedule_irq() ++ ; !CONFIG_PREEMPTION: To ensure restore_regs is intr safe + IRQ_DISABLE r9 + +-#ifdef CONFIG_PREEMPT ++#ifdef CONFIG_PREEMPTION + + ; Can't preempt if preemption disabled + GET_CURR_THR_INFO_FROM_SP r10 +--- a/arch/arm/include/asm/switch_to.h ++++ b/arch/arm/include/asm/switch_to.h +@@ -10,7 +10,7 @@ + * to ensure that the maintenance completes in case we migrate to another + * CPU. 
+ */ +-#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) && defined(CONFIG_CPU_V7) ++#if defined(CONFIG_PREEMPTION) && defined(CONFIG_SMP) && defined(CONFIG_CPU_V7) + #define __complete_pending_tlbi() dsb(ish) + #else + #define __complete_pending_tlbi() +--- a/arch/arm/kernel/entry-armv.S ++++ b/arch/arm/kernel/entry-armv.S +@@ -211,7 +211,7 @@ ENDPROC(__dabt_svc) + svc_entry + irq_handler + +-#ifdef CONFIG_PREEMPT ++#ifdef CONFIG_PREEMPTION + ldr r8, [tsk, #TI_PREEMPT] @ get preempt count + ldr r0, [tsk, #TI_FLAGS] @ get flags + teq r8, #0 @ if preempt count != 0 +@@ -226,7 +226,7 @@ ENDPROC(__irq_svc) + + .ltorg + +-#ifdef CONFIG_PREEMPT ++#ifdef CONFIG_PREEMPTION + svc_preempt: + mov r8, lr + 1: bl preempt_schedule_irq @ irq en/disable is done inside +--- a/arch/arm/kernel/traps.c ++++ b/arch/arm/kernel/traps.c +@@ -248,6 +248,8 @@ void show_stack(struct task_struct *tsk, + + #ifdef CONFIG_PREEMPT + #define S_PREEMPT " PREEMPT" ++#elif defined(CONFIG_PREEMPT_RT) ++#define S_PREEMPT " PREEMPT_RT" + #else + #define S_PREEMPT "" + #endif +--- a/arch/arm/mm/cache-v7.S ++++ b/arch/arm/mm/cache-v7.S +@@ -135,13 +135,13 @@ ENTRY(v7_flush_dcache_all) + and r1, r1, #7 @ mask of the bits for current cache only + cmp r1, #2 @ see what cache we have at this level + blt skip @ skip if no cache, or just i-cache +-#ifdef CONFIG_PREEMPT ++#ifdef CONFIG_PREEMPTION + save_and_disable_irqs_notrace r9 @ make cssr&csidr read atomic + #endif + mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr + isb @ isb to sych the new cssr&csidr + mrc p15, 1, r1, c0, c0, 0 @ read the new csidr +-#ifdef CONFIG_PREEMPT ++#ifdef CONFIG_PREEMPTION + restore_irqs_notrace r9 + #endif + and r2, r1, #7 @ extract the length of the cache lines +--- a/arch/arm/mm/cache-v7m.S ++++ b/arch/arm/mm/cache-v7m.S +@@ -183,13 +183,13 @@ ENTRY(v7m_flush_dcache_all) + and r1, r1, #7 @ mask of the bits for current cache only + cmp r1, #2 @ see what cache we have at this level + blt skip @ skip if no cache, or just i-cache +-#ifdef CONFIG_PREEMPT ++#ifdef CONFIG_PREEMPTION + save_and_disable_irqs_notrace r9 @ make cssr&csidr read atomic + #endif + write_csselr r10, r1 @ set current cache level + isb @ isb to sych the new cssr&csidr + read_ccsidr r1 @ read the new csidr +-#ifdef CONFIG_PREEMPT ++#ifdef CONFIG_PREEMPTION + restore_irqs_notrace r9 + #endif + and r2, r1, #7 @ extract the length of the cache lines +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -35,32 +35,32 @@ config ARM64 + select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT + select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAVE_NMI_SAFE_CMPXCHG +- select ARCH_INLINE_READ_LOCK if !PREEMPT +- select ARCH_INLINE_READ_LOCK_BH if !PREEMPT +- select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPT +- select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPT +- select ARCH_INLINE_READ_UNLOCK if !PREEMPT +- select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPT +- select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPT +- select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPT +- select ARCH_INLINE_WRITE_LOCK if !PREEMPT +- select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPT +- select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPT +- select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPT +- select ARCH_INLINE_WRITE_UNLOCK if !PREEMPT +- select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPT +- select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPT +- select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPT +- select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPT +- select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPT +- select 
ARCH_INLINE_SPIN_LOCK if !PREEMPT +- select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPT +- select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPT +- select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPT +- select ARCH_INLINE_SPIN_UNLOCK if !PREEMPT +- select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT +- select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT +- select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT ++ select ARCH_INLINE_READ_LOCK if !PREEMPTION ++ select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION ++ select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPTION ++ select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPTION ++ select ARCH_INLINE_READ_UNLOCK if !PREEMPTION ++ select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPTION ++ select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPTION ++ select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPTION ++ select ARCH_INLINE_WRITE_LOCK if !PREEMPTION ++ select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPTION ++ select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPTION ++ select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPTION ++ select ARCH_INLINE_WRITE_UNLOCK if !PREEMPTION ++ select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPTION ++ select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPTION ++ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPTION ++ select ARCH_INLINE_SPIN_TRYLOCK if !PREEMPTION ++ select ARCH_INLINE_SPIN_TRYLOCK_BH if !PREEMPTION ++ select ARCH_INLINE_SPIN_LOCK if !PREEMPTION ++ select ARCH_INLINE_SPIN_LOCK_BH if !PREEMPTION ++ select ARCH_INLINE_SPIN_LOCK_IRQ if !PREEMPTION ++ select ARCH_INLINE_SPIN_LOCK_IRQSAVE if !PREEMPTION ++ select ARCH_INLINE_SPIN_UNLOCK if !PREEMPTION ++ select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPTION ++ select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION ++ select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION + select ARCH_KEEP_MEMBLOCK + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_QUEUED_RWLOCKS +--- a/arch/arm64/crypto/sha256-glue.c ++++ b/arch/arm64/crypto/sha256-glue.c +@@ -97,7 +97,7 @@ static int sha256_update_neon(struct sha + * input when running on a preemptible kernel, but process the + * data block by block instead. + */ +- if (IS_ENABLED(CONFIG_PREEMPT) && ++ if (IS_ENABLED(CONFIG_PREEMPTION) && + chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE) + chunk = SHA256_BLOCK_SIZE - + sctx->count % SHA256_BLOCK_SIZE; +--- a/arch/arm64/include/asm/assembler.h ++++ b/arch/arm64/include/asm/assembler.h +@@ -699,8 +699,8 @@ USER(\label, ic ivau, \tmp2) // invali + * where