Skip to content

Commit 52f481a

Browse files
Adrian Huang authored and akpm00 committed
mm/vmalloc: combine all TLB flush operations of KASAN shadow virtual address into one operation
When compiling kernel source 'make -j $(nproc)' with the up-and-running KASAN-enabled kernel on a 256-core machine, the following soft lockup is shown: watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760] CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 torvalds#95 Workqueue: events drain_vmap_area_work RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0 Code: 38 c8 7c 08 84 c9 0f 85 49 08 00 00 8b 45 08 a8 01 74 2e 48 89 f1 49 89 f7 48 c1 e9 03 41 83 e7 07 4c 01 e9 41 83 c7 03 f3 90 <0f> b6 01 41 38 c7 7c 08 84 c0 0f 85 d4 06 00 00 8b 45 08 a8 01 75 RSP: 0018:ffffc9000cb3fb60 EFLAGS: 00000202 RAX: 0000000000000011 RBX: ffff8883bc4469c0 RCX: ffffed10776e9949 RDX: 0000000000000002 RSI: ffff8883bb74ca48 RDI: ffffffff8434dc50 RBP: ffff8883bb74ca40 R08: ffff888103585dc0 R09: ffff8884533a1800 R10: 0000000000000004 R11: ffffffffffffffff R12: ffffed1077888d39 R13: dffffc0000000000 R14: ffffed1077888d38 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff8883bc400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005577b5c8d158 CR3: 0000000004850000 CR4: 0000000000350ef0 Call Trace: <IRQ> ? watchdog_timer_fn+0x2cd/0x390 ? __pfx_watchdog_timer_fn+0x10/0x10 ? __hrtimer_run_queues+0x300/0x6d0 ? sched_clock_cpu+0x69/0x4e0 ? __pfx___hrtimer_run_queues+0x10/0x10 ? srso_return_thunk+0x5/0x5f ? ktime_get_update_offsets_now+0x7f/0x2a0 ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? hrtimer_interrupt+0x2ca/0x760 ? __sysvec_apic_timer_interrupt+0x8c/0x2b0 ? sysvec_apic_timer_interrupt+0x6a/0x90 </IRQ> <TASK> ? asm_sysvec_apic_timer_interrupt+0x16/0x20 ? smp_call_function_many_cond+0x1d8/0xbb0 ? __pfx_do_kernel_range_flush+0x10/0x10 on_each_cpu_cond_mask+0x20/0x40 flush_tlb_kernel_range+0x19b/0x250 ? srso_return_thunk+0x5/0x5f ? kasan_release_vmalloc+0xa7/0xc0 purge_vmap_node+0x357/0x820 ? 
__pfx_purge_vmap_node+0x10/0x10 __purge_vmap_area_lazy+0x5b8/0xa10 drain_vmap_area_work+0x21/0x30 process_one_work+0x661/0x10b0 worker_thread+0x844/0x10e0 ? srso_return_thunk+0x5/0x5f ? __kthread_parkme+0x82/0x140 ? __pfx_worker_thread+0x10/0x10 kthread+0x2a5/0x370 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 </TASK> Debugging Analysis: 1. The following ftrace log shows that the lockup CPU spends too much time iterating vmap_nodes and flushing TLB when purging vm_area structures. (Some info is trimmed). kworker: funcgraph_entry: | drain_vmap_area_work() { kworker: funcgraph_entry: | mutex_lock() { kworker: funcgraph_entry: 1.092 us | __cond_resched(); kworker: funcgraph_exit: 3.306 us | } ... ... kworker: funcgraph_entry: | flush_tlb_kernel_range() { ... ... kworker: funcgraph_exit: # 7533.649 us | } ... ... kworker: funcgraph_entry: 2.344 us | mutex_unlock(); kworker: funcgraph_exit: $ 23871554 us | } The drain_vmap_area_work() spends over 23 seconds. There are 2805 flush_tlb_kernel_range() calls in the ftrace log. * One is called in __purge_vmap_area_lazy(). * Others are called by purge_vmap_node->kasan_release_vmalloc. purge_vmap_node() iteratively releases kasan vmalloc allocations and flushes TLB for each vmap_area. - [Rough calculation] Each flush_tlb_kernel_range() runs about 7.5ms. -- 2804 * 7.5ms = 21.03 seconds. -- That's why a soft lock is triggered. 2. Extending the soft lockup time can work around the issue (For example, # echo 60 > /proc/sys/kernel/watchdog_thresh). This confirms the above-mentioned speculation: drain_vmap_area_work() spends too much time. If we combine all TLB flush operations of the KASAN shadow virtual address into one operation in the call path 'purge_vmap_node()->kasan_release_vmalloc()', the running time of drain_vmap_area_work() can be saved greatly. The idea is from the flush_tlb_kernel_range() call in __purge_vmap_area_lazy(). And, the soft lockup won't be triggered. 
Here is the test result based on 6.10: [6.10 wo/ the patch] 1. ftrace latency profiling (record a trace if the latency > 20s). echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function echo function_graph > /sys/kernel/debug/tracing/current_tracer echo 1 > /sys/kernel/debug/tracing/tracing_on 2. Run `make -j $(nproc)` to compile the kernel source 3. Once the soft lockup is reproduced, check the ftrace log: cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 76) $ 50412985 us | } /* __purge_vmap_area_lazy */ 76) $ 50412997 us | } /* drain_vmap_area_work */ 76) $ 29165911 us | } /* __purge_vmap_area_lazy */ 76) $ 29165926 us | } /* drain_vmap_area_work */ 91) $ 53629423 us | } /* __purge_vmap_area_lazy */ 91) $ 53629434 us | } /* drain_vmap_area_work */ 91) $ 28121014 us | } /* __purge_vmap_area_lazy */ 91) $ 28121026 us | } /* drain_vmap_area_work */ [6.10 w/ the patch] 1. Repeat step 1-2 in "[6.10 wo/ the patch]" 2. The soft lockup is not triggered and ftrace log is empty. cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 3. Setting 'tracing_thresh' to 10/5 seconds does not get any ftrace log. 4. Setting 'tracing_thresh' to 1 second gets ftrace log. cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 23) $ 1074942 us | } /* __purge_vmap_area_lazy */ 23) $ 1074950 us | } /* drain_vmap_area_work */ The worst execution time of drain_vmap_area_work() is about 1 second. 
Link: https://lore.kernel.org/lkml/[email protected]/ Link: https://lkml.kernel.org/r/[email protected] Fixes: 282631c ("mm: vmalloc: remove global purge_vmap_area_root rb-tree") Signed-off-by: Adrian Huang <[email protected]> Co-developed-by: Uladzislau Rezki (Sony) <[email protected]> Signed-off-by: Uladzislau Rezki (Sony) <[email protected]> Tested-by: Jiwei Sun <[email protected]> Reviewed-by: Baoquan He <[email protected]> Cc: Alexander Potapenko <[email protected]> Cc: Andrey Konovalov <[email protected]> Cc: Andrey Ryabinin <[email protected]> Cc: Christoph Hellwig <[email protected]> Cc: Dmitry Vyukov <[email protected]> Cc: Vincenzo Frascino <[email protected]> Cc: <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent b7dc4ae commit 52f481a

File tree

3 files changed

+45
-15
lines changed

3 files changed

+45
-15
lines changed

include/linux/kasan.h

+9-3
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t;
2929
#define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u)
3030
#define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u)
3131

32+
#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply existing page range */
33+
#define KASAN_VMALLOC_TLB_FLUSH 0x2 /* TLB flush */
34+
3235
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
3336

3437
#include <linux/pgtable.h>
@@ -564,7 +567,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
564567
int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
565568
void kasan_release_vmalloc(unsigned long start, unsigned long end,
566569
unsigned long free_region_start,
567-
unsigned long free_region_end);
570+
unsigned long free_region_end,
571+
unsigned long flags);
568572

569573
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
570574

@@ -579,7 +583,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
579583
static inline void kasan_release_vmalloc(unsigned long start,
580584
unsigned long end,
581585
unsigned long free_region_start,
582-
unsigned long free_region_end) { }
586+
unsigned long free_region_end,
587+
unsigned long flags) { }
583588

584589
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
585590

@@ -614,7 +619,8 @@ static inline int kasan_populate_vmalloc(unsigned long start,
614619
static inline void kasan_release_vmalloc(unsigned long start,
615620
unsigned long end,
616621
unsigned long free_region_start,
617-
unsigned long free_region_end) { }
622+
unsigned long free_region_end,
623+
unsigned long flags) { }
618624

619625
static inline void *kasan_unpoison_vmalloc(const void *start,
620626
unsigned long size,

mm/kasan/shadow.c

+10-4
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
489489
*/
490490
void kasan_release_vmalloc(unsigned long start, unsigned long end,
491491
unsigned long free_region_start,
492-
unsigned long free_region_end)
492+
unsigned long free_region_end,
493+
unsigned long flags)
493494
{
494495
void *shadow_start, *shadow_end;
495496
unsigned long region_start, region_end;
@@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
522523
__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
523524
return;
524525
}
525-
apply_to_existing_page_range(&init_mm,
526+
527+
528+
if (flags & KASAN_VMALLOC_PAGE_RANGE)
529+
apply_to_existing_page_range(&init_mm,
526530
(unsigned long)shadow_start,
527531
size, kasan_depopulate_vmalloc_pte,
528532
NULL);
529-
flush_tlb_kernel_range((unsigned long)shadow_start,
530-
(unsigned long)shadow_end);
533+
534+
if (flags & KASAN_VMALLOC_TLB_FLUSH)
535+
flush_tlb_kernel_range((unsigned long)shadow_start,
536+
(unsigned long)shadow_end);
531537
}
532538
}
533539

mm/vmalloc.c

+26-8
Original file line numberDiff line numberDiff line change
@@ -2182,6 +2182,25 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay)
21822182
reclaim_list_global(&decay_list);
21832183
}
21842184

2185+
static void
2186+
kasan_release_vmalloc_node(struct vmap_node *vn)
2187+
{
2188+
struct vmap_area *va;
2189+
unsigned long start, end;
2190+
2191+
start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
2192+
end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
2193+
2194+
list_for_each_entry(va, &vn->purge_list, list) {
2195+
if (is_vmalloc_or_module_addr((void *) va->va_start))
2196+
kasan_release_vmalloc(va->va_start, va->va_end,
2197+
va->va_start, va->va_end,
2198+
KASAN_VMALLOC_PAGE_RANGE);
2199+
}
2200+
2201+
kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
2202+
}
2203+
21852204
static void purge_vmap_node(struct work_struct *work)
21862205
{
21872206
struct vmap_node *vn = container_of(work,
@@ -2190,20 +2209,17 @@ static void purge_vmap_node(struct work_struct *work)
21902209
struct vmap_area *va, *n_va;
21912210
LIST_HEAD(local_list);
21922211

2212+
if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
2213+
kasan_release_vmalloc_node(vn);
2214+
21932215
vn->nr_purged = 0;
21942216

21952217
list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
21962218
unsigned long nr = va_size(va) >> PAGE_SHIFT;
2197-
unsigned long orig_start = va->va_start;
2198-
unsigned long orig_end = va->va_end;
21992219
unsigned int vn_id = decode_vn_id(va->flags);
22002220

22012221
list_del_init(&va->list);
22022222

2203-
if (is_vmalloc_or_module_addr((void *)orig_start))
2204-
kasan_release_vmalloc(orig_start, orig_end,
2205-
va->va_start, va->va_end);
2206-
22072223
nr_purged_pages += nr;
22082224
vn->nr_purged++;
22092225

@@ -4784,7 +4800,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
47844800
&free_vmap_area_list);
47854801
if (va)
47864802
kasan_release_vmalloc(orig_start, orig_end,
4787-
va->va_start, va->va_end);
4803+
va->va_start, va->va_end,
4804+
KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
47884805
vas[area] = NULL;
47894806
}
47904807

@@ -4834,7 +4851,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
48344851
&free_vmap_area_list);
48354852
if (va)
48364853
kasan_release_vmalloc(orig_start, orig_end,
4837-
va->va_start, va->va_end);
4854+
va->va_start, va->va_end,
4855+
KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
48384856
vas[area] = NULL;
48394857
kfree(vms[area]);
48404858
}

0 commit comments

Comments
 (0)