Skip to content

Commit c003bd1

Browse files
Ming Leiintel-lab-lkp
Ming Lei
authored andcommitted
block: model freeze & enter queue as rwsem for supporting lockdep
Recently we got several deadlock report[1][2][3] caused by blk_mq_freeze_queue and blk_enter_queue(). Turns out the two are just like one rwsem, so model them as rwsem for supporting lockdep: 1) model blk_mq_freeze_queue() as down_write_trylock() - it is exclusive lock, so dependency with blk_enter_queue() is covered - it is trylock because blk_mq_freeze_queue() are allowed to run concurrently 2) model blk_enter_queue() as down_read() - it is shared lock, so concurrent blk_enter_queue() are allowed - it is read lock, so dependency with blk_mq_freeze_queue() is modeled - blk_queue_exit() is often called from other contexts(such as irq), and it can't be annotated as rwsem_release(), so simply do it in blk_enter_queue(), this way still covered cases as many as possible NVMe is the only subsystem which may call blk_mq_freeze_queue() and blk_mq_unfreeze_queue() from different context, so it is the only exception for the modeling. Add one tagset flag to exclude it from the lockdep support. With lockdep support, such kind of reports may be reported asap and needn't wait until the real deadlock is triggered. For example, the following lockdep report can be triggered in the report[3]. [ 45.701432] ====================================================== [ 45.702621] WARNING: possible circular locking dependency detected [ 45.703829] 6.12.0-rc1_uring_dev+ torvalds#188 Not tainted [ 45.704806] ------------------------------------------------------ [ 45.705903] bash/1323 is trying to acquire lock: [ 45.706153] ffff88813e075870 (&q->limits_lock){+.+.}-{3:3}, at: queue_wc_store+0x11c/0x380 [ 45.706602] but task is already holding lock: [ 45.706927] ffff88813e075730 (&q->sysfs_lock){+.+.}-{3:3}, at: queue_attr_store+0xd9/0x170 [ 45.707391] which lock already depends on the new lock. [ 45.707838] the existing dependency chain (in reverse order) is: [ 45.708238] -> #2 (&q->sysfs_lock){+.+.}-{3:3}: [ 45.708558] __mutex_lock+0x177/0x10d0 [ 45.708797] queue_attr_store+0xd9/0x170 [ 45.709039] kernfs_fop_write_iter+0x39f/0x5a0 [ 45.709304] vfs_write+0x5d3/0xe80 [ 45.709514] ksys_write+0xfb/0x1d0 [ 45.709723] do_syscall_64+0x95/0x180 [ 45.709946] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 45.710256] -> #1 (q->q_usage_counter){++++}-{0:0}: [ 45.710589] blk_try_enter_queue+0x32/0x340 [ 45.710842] blk_queue_enter+0x97/0x4c0 [ 45.711080] blk_mq_alloc_request+0x347/0x900 [ 45.711340] scsi_execute_cmd+0x183/0xc20 [ 45.711584] read_capacity_16+0x1ce/0xbb0 [ 45.711823] sd_revalidate_disk.isra.0+0xf78/0x8b40 [ 45.712119] sd_probe+0x813/0xf40 [ 45.712320] really_probe+0x1e0/0x8a0 [ 45.712542] __driver_probe_device+0x18c/0x370 [ 45.712801] driver_probe_device+0x4a/0x120 [ 45.713053] __device_attach_driver+0x162/0x270 [ 45.713327] bus_for_each_drv+0x115/0x1a0 [ 45.713572] __device_attach_async_helper+0x1a0/0x240 [ 45.713872] async_run_entry_fn+0x97/0x4f0 [ 45.714127] process_one_work+0x85d/0x1430 [ 45.714377] worker_thread+0x5be/0xff0 [ 45.714610] kthread+0x293/0x350 [ 45.714811] ret_from_fork+0x31/0x70 [ 45.715032] ret_from_fork_asm+0x1a/0x30 [ 45.715667] -> #0 (&q->limits_lock){+.+.}-{3:3}: [ 45.716819] __lock_acquire+0x310a/0x6060 [ 45.717476] lock_acquire.part.0+0x122/0x360 [ 45.718133] __mutex_lock+0x177/0x10d0 [ 45.718759] queue_wc_store+0x11c/0x380 [ 45.719384] queue_attr_store+0xff/0x170 [ 45.720007] kernfs_fop_write_iter+0x39f/0x5a0 [ 45.720647] vfs_write+0x5d3/0xe80 [ 45.721252] ksys_write+0xfb/0x1d0 [ 45.721847] do_syscall_64+0x95/0x180 [ 45.722433] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 45.723085] other info that might help us debug this: [ 45.724532] Chain exists of: &q->limits_lock --> q->q_usage_counter --> &q->sysfs_lock [ 45.726122] Possible unsafe locking scenario: [ 45.727114] CPU0 CPU1 [ 45.727687] ---- ---- [ 45.728270] lock(&q->sysfs_lock); [ 45.728792] lock(q->q_usage_counter); [ 45.729466] lock(&q->sysfs_lock); [ 45.730119] lock(&q->limits_lock); [ 45.730655] *** DEADLOCK *** [ 45.731983] 5 locks held by bash/1323: [ 45.732524] #0: ffff88811a4a0450 (sb_writers#4){.+.+}-{0:0}, at: ksys_write+0xfb/0x1d0 [ 45.733290] #1: ffff88811fa35890 (&of->mutex#2){+.+.}-{3:3}, at: kernfs_fop_write_iter+0x25d/0x5a0 [ 45.734113] #2: ffff888118e9fc20 (kn->active#133){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x281/0x5a0 [ 45.734939] #3: ffff88813e0751e0 (q->q_usage_counter){++++}-{0:0}, at: blk_mq_freeze_queue+0x12/0x20 [ 45.735797] #4: ffff88813e075730 (&q->sysfs_lock){+.+.}-{3:3}, at: queue_attr_store+0xd9/0x170 [ 45.736626] stack backtrace: [ 45.737598] CPU: 9 UID: 0 PID: 1323 Comm: bash Not tainted 6.12.0-rc1_uring_dev+ torvalds#188 [ 45.738388] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-1.fc39 04/01/2014 [ 45.739222] Call Trace: [ 45.739762] <TASK> [ 45.740309] dump_stack_lvl+0x84/0xd0 [ 45.740962] print_circular_bug.cold+0x1e4/0x278 [ 45.741628] check_noncircular+0x331/0x410 [ 45.742289] ? __pfx_check_noncircular+0x10/0x10 [ 45.742933] ? lock_release+0x588/0xcc0 [ 45.743536] ? lockdep_lock+0xbe/0x1c0 [ 45.744156] ? __pfx_lockdep_lock+0x10/0x10 [ 45.744785] ? is_bpf_text_address+0x21/0x100 [ 45.745442] __lock_acquire+0x310a/0x6060 [ 45.746088] ? __pfx___lock_acquire+0x10/0x10 [ 45.746735] ? __pfx___bfs+0x10/0x10 [ 45.747325] lock_acquire.part.0+0x122/0x360 [ 45.747947] ? queue_wc_store+0x11c/0x380 [ 45.748564] ? __pfx_lock_acquire.part.0+0x10/0x10 [ 45.749254] ? trace_lock_acquire+0x12f/0x1a0 [ 45.749903] ? queue_wc_store+0x11c/0x380 [ 45.750545] ? lock_acquire+0x2f/0xb0 [ 45.751168] ? queue_wc_store+0x11c/0x380 [ 45.751777] __mutex_lock+0x177/0x10d0 [ 45.752379] ? queue_wc_store+0x11c/0x380 [ 45.752988] ? queue_wc_store+0x11c/0x380 [ 45.753586] ? __pfx___mutex_lock+0x10/0x10 [ 45.754202] ? __pfx___lock_acquire+0x10/0x10 [ 45.754823] ? lock_acquire.part.0+0x122/0x360 [ 45.755456] ? queue_wc_store+0x11c/0x380 [ 45.756063] queue_wc_store+0x11c/0x380 [ 45.756653] ? __pfx_lock_acquired+0x10/0x10 [ 45.757305] ? lock_acquire+0x2f/0xb0 [ 45.757904] ? trace_contention_end+0xd4/0x110 [ 45.758524] ? __pfx_queue_wc_store+0x10/0x10 [ 45.759172] ? queue_attr_store+0xd9/0x170 [ 45.759899] ? __pfx_autoremove_wake_function+0x10/0x10 [ 45.760587] queue_attr_store+0xff/0x170 [ 45.761213] ? sysfs_kf_write+0x41/0x170 [ 45.761823] ? __pfx_sysfs_kf_write+0x10/0x10 [ 45.762461] kernfs_fop_write_iter+0x39f/0x5a0 [ 45.763109] vfs_write+0x5d3/0xe80 [ 45.763690] ? lockdep_hardirqs_on_prepare+0x274/0x3f0 [ 45.764368] ? __pfx_vfs_write+0x10/0x10 [ 45.764964] ksys_write+0xfb/0x1d0 [ 45.765544] ? __pfx_ksys_write+0x10/0x10 [ 45.766160] ? ksys_dup3+0xce/0x2b0 [ 45.766710] do_syscall_64+0x95/0x180 [ 45.767267] ? filp_close+0x1d/0x30 [ 45.767801] ? do_dup2+0x27c/0x4f0 [ 45.768334] ? syscall_exit_to_user_mode+0x97/0x290 [ 45.768930] ? lockdep_hardirqs_on_prepare+0x274/0x3f0 [ 45.769538] ? syscall_exit_to_user_mode+0x97/0x290 [ 45.770136] ? do_syscall_64+0xa1/0x180 [ 45.770665] ? clear_bhb_loop+0x25/0x80 [ 45.771206] ? clear_bhb_loop+0x25/0x80 [ 45.771735] ? clear_bhb_loop+0x25/0x80 [ 45.772261] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 45.772843] RIP: 0033:0x7f3b17be9984 [ 45.773363] Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d c5 06 0e 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 [ 45.775131] RSP: 002b:00007fffaa975ba8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ 45.775909] RAX: ffffffffffffffda RBX: 0000000000000005 RCX: 00007f3b17be9984 [ 45.776672] RDX: 0000000000000005 RSI: 0000556af3758790 RDI: 0000000000000001 [ 45.777454] RBP: 00007fffaa975bd0 R08: 0000000000000073 R09: 00000000ffffffff [ 45.778251] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000005 [ 45.779039] R13: 0000556af3758790 R14: 00007f3b17cc35c0 R15: 00007f3b17cc0f00 [ 45.779822] </TASK> [1] occasional block layer hang when setting 'echo noop > /sys/block/sda/queue/scheduler' https://bugzilla.kernel.org/show_bug.cgi?id=219166 [2] del_gendisk() vs blk_queue_enter() race condition https://lore.kernel.org/linux-block/[email protected]/ [3] queue_freeze & queue_enter deadlock in scsi https://lore.kernel.org/linux-block/ZxG38G9BuFdBpBHZ@fedora/T/#u Cc: Christoph Hellwig <[email protected]> Signed-off-by: Ming Lei <[email protected]>
1 parent f82eab0 commit c003bd1

File tree

8 files changed

+43
-3
lines changed

8 files changed

+43
-3
lines changed

block/blk-core.c

+3
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,7 @@ static void blk_timeout_work(struct work_struct *work)
384384

385385
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
386386
{
387+
static struct lock_class_key __q_usage_counter_key;
387388
struct request_queue *q;
388389
int error;
389390

@@ -441,6 +442,8 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
441442
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
442443
if (error)
443444
goto fail_stats;
445+
lockdep_init_map(&q->q_usage_counter_map, "q->q_usage_counter",
446+
&__q_usage_counter_key, 0);
444447

445448
q->nr_requests = BLKDEV_DEFAULT_RQ;
446449

block/blk-mq-debugfs.c

+1
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ static const char *const hctx_flag_name[] = {
188188
HCTX_FLAG_NAME(BLOCKING),
189189
HCTX_FLAG_NAME(NO_SCHED),
190190
HCTX_FLAG_NAME(NO_SCHED_BY_DEFAULT),
191+
HCTX_FLAG_NAME(SKIP_FREEZE_LOCKDEP),
191192
};
192193
#undef HCTX_FLAG_NAME
193194

block/blk-mq.c

+15
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,10 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
122122

123123
void blk_freeze_queue_start(struct request_queue *q)
124124
{
125+
int sub_class;
126+
125127
mutex_lock(&q->mq_freeze_lock);
128+
sub_class = q->mq_freeze_depth;
126129
if (++q->mq_freeze_depth == 1) {
127130
percpu_ref_kill(&q->q_usage_counter);
128131
mutex_unlock(&q->mq_freeze_lock);
@@ -131,6 +134,12 @@ void blk_freeze_queue_start(struct request_queue *q)
131134
} else {
132135
mutex_unlock(&q->mq_freeze_lock);
133136
}
137+
/*
138+
* model as down_write_trylock() so that two concurrent freeze queue
139+
* can be allowed
140+
*/
141+
if (blk_queue_freeze_lockdep(q))
142+
rwsem_acquire(&q->q_usage_counter_map, sub_class, 1, _RET_IP_);
134143
}
135144
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
136145

@@ -188,6 +197,9 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
188197
wake_up_all(&q->mq_freeze_wq);
189198
}
190199
mutex_unlock(&q->mq_freeze_lock);
200+
201+
if (blk_queue_freeze_lockdep(q))
202+
rwsem_release(&q->q_usage_counter_map, _RET_IP_);
191203
}
192204

193205
void blk_mq_unfreeze_queue(struct request_queue *q)
@@ -4241,6 +4253,9 @@ void blk_mq_destroy_queue(struct request_queue *q)
42414253
blk_queue_start_drain(q);
42424254
blk_mq_freeze_queue_wait(q);
42434255

4256+
/* counter pair of acquire in blk_queue_start_drain */
4257+
if (blk_queue_freeze_lockdep(q))
4258+
rwsem_release(&q->q_usage_counter_map, _RET_IP_);
42444259
blk_sync_queue(q);
42454260
blk_mq_cancel_work_sync(q);
42464261
blk_mq_exit_queue(q);

block/blk.h

+9
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <linux/bio-integrity.h>
66
#include <linux/blk-crypto.h>
7+
#include <linux/lockdep.h>
78
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
89
#include <linux/sched/sysctl.h>
910
#include <linux/timekeeping.h>
@@ -43,6 +44,8 @@ void bio_await_chain(struct bio *bio);
4344

4445
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
4546
{
47+
/* model as down_read() for lockdep */
48+
rwsem_acquire_read(&q->q_usage_counter_map, 0, 0, _RET_IP_);
4649
rcu_read_lock();
4750
if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
4851
goto fail;
@@ -56,12 +59,18 @@ static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
5659
goto fail_put;
5760

5861
rcu_read_unlock();
62+
/*
63+
* queue exit often happen in other context, so we simply annotate
64+
* release here, still lots of cases can be covered
65+
*/
66+
rwsem_release(&q->q_usage_counter_map, _RET_IP_);
5967
return true;
6068

6169
fail_put:
6270
blk_queue_exit(q);
6371
fail:
6472
rcu_read_unlock();
73+
rwsem_release(&q->q_usage_counter_map, _RET_IP_);
6574
return false;
6675
}
6776

block/genhd.c

+3
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,9 @@ void del_gendisk(struct gendisk *disk)
742742
blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
743743
__blk_mq_unfreeze_queue(q, true);
744744
} else {
745+
/* counter pair of acquire in blk_queue_start_drain */
746+
if (blk_queue_freeze_lockdep(q))
747+
rwsem_release(&q->q_usage_counter_map, _RET_IP_);
745748
if (queue_is_mq(q))
746749
blk_mq_exit_queue(q);
747750
}

drivers/nvme/host/core.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -4528,7 +4528,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
45284528
/* Reserved for fabric connect and keep alive */
45294529
set->reserved_tags = 2;
45304530
set->numa_node = ctrl->numa_node;
4531-
set->flags = BLK_MQ_F_NO_SCHED;
4531+
set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_SKIP_FREEZE_LOCKDEP;
45324532
if (ctrl->ops->flags & NVME_F_BLOCKING)
45334533
set->flags |= BLK_MQ_F_BLOCKING;
45344534
set->cmd_size = cmd_size;
@@ -4598,7 +4598,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
45984598
/* Reserved for fabric connect */
45994599
set->reserved_tags = 1;
46004600
set->numa_node = ctrl->numa_node;
4601-
set->flags = BLK_MQ_F_SHOULD_MERGE;
4601+
set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SKIP_FREEZE_LOCKDEP;
46024602
if (ctrl->ops->flags & NVME_F_BLOCKING)
46034603
set->flags |= BLK_MQ_F_BLOCKING;
46044604
set->cmd_size = cmd_size;

include/linux/blk-mq.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -687,7 +687,10 @@ enum {
687687
* or shared hwqs instead of 'mq-deadline'.
688688
*/
689689
BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6,
690-
BLK_MQ_F_ALLOC_POLICY_START_BIT = 7,
690+
691+
BLK_MQ_F_SKIP_FREEZE_LOCKDEP = 1 << 7,
692+
693+
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
691694
BLK_MQ_F_ALLOC_POLICY_BITS = 1,
692695
};
693696
#define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \

include/linux/blkdev.h

+6
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <linux/uuid.h>
2626
#include <linux/xarray.h>
2727
#include <linux/file.h>
28+
#include <linux/lockdep.h>
2829

2930
struct module;
3031
struct request_queue;
@@ -474,6 +475,9 @@ struct request_queue {
474475
struct xarray hctx_table;
475476

476477
struct percpu_ref q_usage_counter;
478+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
479+
struct lockdep_map q_usage_counter_map;
480+
#endif
477481

478482
struct request *last_merge;
479483

@@ -640,6 +644,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
640644
#define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
641645
#define blk_queue_skip_tagset_quiesce(q) \
642646
((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE)
647+
#define blk_queue_freeze_lockdep(q) \
648+
!(q->tag_set->flags & BLK_MQ_F_SKIP_FREEZE_LOCKDEP)
643649

644650
extern void blk_set_pm_only(struct request_queue *q);
645651
extern void blk_clear_pm_only(struct request_queue *q);

0 commit comments

Comments
 (0)