Skip to content

Commit c2cb422

Browse files
biger410 authored and gregkh committed
scsi: megaraid_sas: Fix deadlock on firmware crashdump
commit 0b0747d upstream.

The following processes run into a deadlock. CPU 41 was waiting for CPU 29 to handle a CSD request while holding spinlock "crashdump_lock", but CPU 29 was hung by that spinlock with IRQs disabled.

PID: 17360 TASK: ffff95c1090c5c40 CPU: 41 COMMAND: "mrdiagd"
!# 0 [ffffb80edbf37b58] __read_once_size at ffffffff9b871a40 include/linux/compiler.h:185:0
!# 1 [ffffb80edbf37b58] atomic_read at ffffffff9b871a40 arch/x86/include/asm/atomic.h:27:0
!# 2 [ffffb80edbf37b58] dump_stack at ffffffff9b871a40 lib/dump_stack.c:54:0
 # 3 [ffffb80edbf37b78] csd_lock_wait_toolong at ffffffff9b131ad5 kernel/smp.c:364:0
 # 4 [ffffb80edbf37b78] __csd_lock_wait at ffffffff9b131ad5 kernel/smp.c:384:0
 # 5 [ffffb80edbf37bf8] csd_lock_wait at ffffffff9b13267a kernel/smp.c:394:0
 # 6 [ffffb80edbf37bf8] smp_call_function_many at ffffffff9b13267a kernel/smp.c:843:0
 # 7 [ffffb80edbf37c50] smp_call_function at ffffffff9b13279d kernel/smp.c:867:0
 # 8 [ffffb80edbf37c50] on_each_cpu at ffffffff9b13279d kernel/smp.c:976:0
 # 9 [ffffb80edbf37c78] flush_tlb_kernel_range at ffffffff9b085c4b arch/x86/mm/tlb.c:742:0
 #10 [ffffb80edbf37cb8] __purge_vmap_area_lazy at ffffffff9b23a1e0 mm/vmalloc.c:701:0
 #11 [ffffb80edbf37ce0] try_purge_vmap_area_lazy at ffffffff9b23a2cc mm/vmalloc.c:722:0
 #12 [ffffb80edbf37ce0] free_vmap_area_noflush at ffffffff9b23a2cc mm/vmalloc.c:754:0
 #13 [ffffb80edbf37cf8] free_unmap_vmap_area at ffffffff9b23bb3b mm/vmalloc.c:764:0
 #14 [ffffb80edbf37cf8] remove_vm_area at ffffffff9b23bb3b mm/vmalloc.c:1509:0
 #15 [ffffb80edbf37d18] __vunmap at ffffffff9b23bb8a mm/vmalloc.c:1537:0
 #16 [ffffb80edbf37d40] vfree at ffffffff9b23bc85 mm/vmalloc.c:1612:0
 #17 [ffffb80edbf37d58] megasas_free_host_crash_buffer [megaraid_sas] at ffffffffc020b7f2 drivers/scsi/megaraid/megaraid_sas_fusion.c:3932:0
 #18 [ffffb80edbf37d80] fw_crash_state_store [megaraid_sas] at ffffffffc01f804d drivers/scsi/megaraid/megaraid_sas_base.c:3291:0
 #19 [ffffb80edbf37dc0] dev_attr_store at ffffffff9b56dd7b drivers/base/core.c:758:0
 #20 [ffffb80edbf37dd0] sysfs_kf_write at ffffffff9b326acf fs/sysfs/file.c:144:0
 #21 [ffffb80edbf37de0] kernfs_fop_write at ffffffff9b325fd4 fs/kernfs/file.c:316:0
 #22 [ffffb80edbf37e20] __vfs_write at ffffffff9b29418a fs/read_write.c:480:0
 #23 [ffffb80edbf37ea8] vfs_write at ffffffff9b294462 fs/read_write.c:544:0
 #24 [ffffb80edbf37ee8] SYSC_write at ffffffff9b2946ec fs/read_write.c:590:0
 #25 [ffffb80edbf37ee8] SyS_write at ffffffff9b2946ec fs/read_write.c:582:0
 #26 [ffffb80edbf37f30] do_syscall_64 at ffffffff9b003ca9 arch/x86/entry/common.c:298:0
 #27 [ffffb80edbf37f58] entry_SYSCALL_64 at ffffffff9ba001b1 arch/x86/entry/entry_64.S:238:0

PID: 17355 TASK: ffff95c1090c3d80 CPU: 29 COMMAND: "mrdiagd"
!# 0 [ffffb80f2d3c7d30] __read_once_size at ffffffff9b0f2ab0 include/linux/compiler.h:185:0
!# 1 [ffffb80f2d3c7d30] native_queued_spin_lock_slowpath at ffffffff9b0f2ab0 kernel/locking/qspinlock.c:368:0
 # 2 [ffffb80f2d3c7d58] pv_queued_spin_lock_slowpath at ffffffff9b0f244b arch/x86/include/asm/paravirt.h:674:0
 # 3 [ffffb80f2d3c7d58] queued_spin_lock_slowpath at ffffffff9b0f244b arch/x86/include/asm/qspinlock.h:53:0
 # 4 [ffffb80f2d3c7d68] queued_spin_lock at ffffffff9b8961a6 include/asm-generic/qspinlock.h:90:0
 # 5 [ffffb80f2d3c7d68] do_raw_spin_lock_flags at ffffffff9b8961a6 include/linux/spinlock.h:173:0
 # 6 [ffffb80f2d3c7d68] __raw_spin_lock_irqsave at ffffffff9b8961a6 include/linux/spinlock_api_smp.h:122:0
 # 7 [ffffb80f2d3c7d68] _raw_spin_lock_irqsave at ffffffff9b8961a6 kernel/locking/spinlock.c:160:0
 # 8 [ffffb80f2d3c7d88] fw_crash_buffer_store [megaraid_sas] at ffffffffc01f8129 drivers/scsi/megaraid/megaraid_sas_base.c:3205:0
 # 9 [ffffb80f2d3c7dc0] dev_attr_store at ffffffff9b56dd7b drivers/base/core.c:758:0
 #10 [ffffb80f2d3c7dd0] sysfs_kf_write at ffffffff9b326acf fs/sysfs/file.c:144:0
 #11 [ffffb80f2d3c7de0] kernfs_fop_write at ffffffff9b325fd4 fs/kernfs/file.c:316:0
 #12 [ffffb80f2d3c7e20] __vfs_write at ffffffff9b29418a fs/read_write.c:480:0
 #13 [ffffb80f2d3c7ea8] vfs_write at ffffffff9b294462 fs/read_write.c:544:0
 #14 [ffffb80f2d3c7ee8] SYSC_write at ffffffff9b2946ec fs/read_write.c:590:0
 #15 [ffffb80f2d3c7ee8] SyS_write at ffffffff9b2946ec fs/read_write.c:582:0
 #16 [ffffb80f2d3c7f30] do_syscall_64 at ffffffff9b003ca9 arch/x86/entry/common.c:298:0
 #17 [ffffb80f2d3c7f58] entry_SYSCALL_64 at ffffffff9ba001b1 arch/x86/entry/entry_64.S:238:0

The lock is used to synchronize different sysfs operations; it doesn't protect any resource that will be touched by an interrupt. Consequently it's not required to disable IRQs. Replace the spinlock with a mutex to fix the deadlock.

Signed-off-by: Junxiao Bi <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Reviewed-by: Mike Christie <[email protected]>
Cc: [email protected]
Signed-off-by: Martin K. Petersen <[email protected]>
Signed-off-by: Greg Kroah-Hartman <[email protected]>
1 parent 890e1e5 commit c2cb422

File tree

2 files changed

+10
-13
lines changed

2 files changed

+10
-13
lines changed

drivers/scsi/megaraid/megaraid_sas.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -2332,7 +2332,7 @@ struct megasas_instance {
23322332
u32 support_morethan256jbod; /* FW support for more than 256 PD/JBOD */
23332333
bool use_seqnum_jbod_fp; /* Added for PD sequence */
23342334
bool smp_affinity_enable;
2335-
spinlock_t crashdump_lock;
2335+
struct mutex crashdump_lock;
23362336

23372337
struct megasas_register_set __iomem *reg_set;
23382338
u32 __iomem *reply_post_host_index_addr[MR_MAX_MSIX_REG_ARRAY];

drivers/scsi/megaraid/megaraid_sas_base.c

+9-12
Original file line number | Diff line number | Diff line change
@@ -3272,14 +3272,13 @@ fw_crash_buffer_store(struct device *cdev,
32723272
struct megasas_instance *instance =
32733273
(struct megasas_instance *) shost->hostdata;
32743274
int val = 0;
3275-
unsigned long flags;
32763275

32773276
if (kstrtoint(buf, 0, &val) != 0)
32783277
return -EINVAL;
32793278

3280-
spin_lock_irqsave(&instance->crashdump_lock, flags);
3279+
mutex_lock(&instance->crashdump_lock);
32813280
instance->fw_crash_buffer_offset = val;
3282-
spin_unlock_irqrestore(&instance->crashdump_lock, flags);
3281+
mutex_unlock(&instance->crashdump_lock);
32833282
return strlen(buf);
32843283
}
32853284

@@ -3294,24 +3293,23 @@ fw_crash_buffer_show(struct device *cdev,
32943293
unsigned long dmachunk = CRASH_DMA_BUF_SIZE;
32953294
unsigned long chunk_left_bytes;
32963295
unsigned long src_addr;
3297-
unsigned long flags;
32983296
u32 buff_offset;
32993297

3300-
spin_lock_irqsave(&instance->crashdump_lock, flags);
3298+
mutex_lock(&instance->crashdump_lock);
33013299
buff_offset = instance->fw_crash_buffer_offset;
33023300
if (!instance->crash_dump_buf ||
33033301
!((instance->fw_crash_state == AVAILABLE) ||
33043302
(instance->fw_crash_state == COPYING))) {
33053303
dev_err(&instance->pdev->dev,
33063304
"Firmware crash dump is not available\n");
3307-
spin_unlock_irqrestore(&instance->crashdump_lock, flags);
3305+
mutex_unlock(&instance->crashdump_lock);
33083306
return -EINVAL;
33093307
}
33103308

33113309
if (buff_offset > (instance->fw_crash_buffer_size * dmachunk)) {
33123310
dev_err(&instance->pdev->dev,
33133311
"Firmware crash dump offset is out of range\n");
3314-
spin_unlock_irqrestore(&instance->crashdump_lock, flags);
3312+
mutex_unlock(&instance->crashdump_lock);
33153313
return 0;
33163314
}
33173315

@@ -3323,7 +3321,7 @@ fw_crash_buffer_show(struct device *cdev,
33233321
src_addr = (unsigned long)instance->crash_buf[buff_offset / dmachunk] +
33243322
(buff_offset % dmachunk);
33253323
memcpy(buf, (void *)src_addr, size);
3326-
spin_unlock_irqrestore(&instance->crashdump_lock, flags);
3324+
mutex_unlock(&instance->crashdump_lock);
33273325

33283326
return size;
33293327
}
@@ -3348,7 +3346,6 @@ fw_crash_state_store(struct device *cdev,
33483346
struct megasas_instance *instance =
33493347
(struct megasas_instance *) shost->hostdata;
33503348
int val = 0;
3351-
unsigned long flags;
33523349

33533350
if (kstrtoint(buf, 0, &val) != 0)
33543351
return -EINVAL;
@@ -3362,9 +3359,9 @@ fw_crash_state_store(struct device *cdev,
33623359
instance->fw_crash_state = val;
33633360

33643361
if ((val == COPIED) || (val == COPY_ERROR)) {
3365-
spin_lock_irqsave(&instance->crashdump_lock, flags);
3362+
mutex_lock(&instance->crashdump_lock);
33663363
megasas_free_host_crash_buffer(instance);
3367-
spin_unlock_irqrestore(&instance->crashdump_lock, flags);
3364+
mutex_unlock(&instance->crashdump_lock);
33683365
if (val == COPY_ERROR)
33693366
dev_info(&instance->pdev->dev, "application failed to "
33703367
"copy Firmware crash dump\n");
@@ -7423,7 +7420,7 @@ static inline void megasas_init_ctrl_params(struct megasas_instance *instance)
74237420
init_waitqueue_head(&instance->int_cmd_wait_q);
74247421
init_waitqueue_head(&instance->abort_cmd_wait_q);
74257422

7426-
spin_lock_init(&instance->crashdump_lock);
7423+
mutex_init(&instance->crashdump_lock);
74277424
spin_lock_init(&instance->mfi_pool_lock);
74287425
spin_lock_init(&instance->hba_lock);
74297426
spin_lock_init(&instance->stream_lock);

0 commit comments

Comments
 (0)