7 月 31 日下午,我们观看了《2022 全国大学生系统能力大赛操作系统设计赛第九场技术培训会》。会上,许多高水平的同学就他们的成果作了展示。和我们相同选题的北京航空航天大学 高世伟同学也在其中展示。据他介绍,他们组计划转储到 SQLite 数据库,同时加入前端数据展示界面。我想起自己曾经和马玉昆老师询问过关于如何展示的内容,但那时由于对于持有锁分析等更重要的内容的进度缓慢,前端展示这一问题便被搁置。就这一问题我咨询了队友叶景熙同学,他表示可以调研一下。三天后,他计划使用 Qt 读取文件等的方式来实现这样的功能,我们在 8 月 4 日的会议上与马玉昆导师就此事进行了交流,马老师表示对此方面可能不是很了解。
此次技术培训会也我认识到自己的不足,自己费了很大劲还不完全弄明白的内容,别人轻轻松松就能搞懂。自己应该放弃不切实际的幻想,更加脚踏实地地做事。
基数树(Radix tree)是内核中的一种数据结构,可以将值映射为整型关键字,使用 RADIX_TREE
宏进行定义及初始化,调用 radix_tree_insert
、radix_tree_delete
可以插入或删除某个结点,而 radix_tree_lookup
可以查找基数树;对于重复元素,会返回 –EEXIST。
这些都比较好理解,不过删除整棵基数树就有些没那么直白了。由于基数树在插入元素的过程中也在增长,其内部也动态分配了内存,我找到的方案是遍历并删除元素。
原来自己并没有想过用 gdb 调试,但频频出现的内核挂起或死锁等问题却让我感到举步维艰,于是我阅读了这篇文章:
(gdb) hbreak do_init_module
Hardware assisted breakpoint 1 at 0xffffffff8125d46d: file kernel/module.c, line 3485.
(gdb) target remote 127.0.0.1:1234
Remote debugging using 127.0.0.1:1234
0xffffffff8206870a in native_safe_halt ()
at ./arch/x86/include/asm/irqflags.h:57
57 asm volatile("sti; hlt": : :"memory");
(gdb) c
Continuing.
[Switching to Thread 1.3]
Thread 3 hit Breakpoint 1, do_init_module (mod=mod@entry=0xffffffffa0a2ab80)
at kernel/module.c:3485
3485 {
(gdb) print mod->sect_attrs->attrs[1]->name
$1 = 0xffff888107ea5e88 ".text"
......
(gdb) print mod->sect_attrs->attrs[12]->name
$15 = 0xffff888107ea5d70 ".data"
......
(gdb) print mod->sect_attrs->attrs[16]->name
$19 = 0xffff888107ea5848 ".bss"
(gdb) print /x mod->sect_attrs->attrs[1]->address
$20 = 0xffffffffa0a28000
(gdb) print /x mod->sect_attrs->attrs[12]->address
$21 = 0xffffffffa0a2a000
(gdb) print /x mod->sect_attrs->attrs[16]->address
$22 = 0xffffffffa0a2af80
(gdb) add-symbol-file /home/qcwl/LinuxLearn/LockTest/lock_test.ko 0xffffffffa0a28000 -s .data 0xffffffffa0a2a000 -s .bss 0xffffffffa0a2af80
add symbol table from file "/home/qcwl/LinuxLearn/LockTest/lock_test.ko" at
.text_addr = 0xffffffffa0a28000
.data_addr = 0xffffffffa0a2a000
.bss_addr = 0xffffffffa0a2af80
(y or n) y
Reading symbols from /home/qcwl/LinuxLearn/LockTest/lock_test.ko...done.
此时我们已经获得了内核模块被加载的地址,因此可以让 GDB 加载其符号。如下所示,不加载调试信息的情况下是看不到具体的代码行数的。
.......
(gdb) c
Continuing.
[Switching to Thread 1.4]
Thread 4 hit Breakpoint 2, add_into_hashtable (lock_addr=18446744071615679776,
lock_type=lock_type@entry=0)
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:40
40 static void add_into_hashtable(unsigned long lock_addr, int lock_type) {
(gdb) n
44 pr_info("0\n");
(gdb) n
45 hash_for_each_possible(lock_table, current_lock_info, node, lock_addr) {
(gdb) n
50 if (current_lock_info->lock_address == lock_addr) {
(gdb) n
53 pr_info("1\n");
(gdb) n
56 current_lock_info->called_num++;
(gdb) n
59 list_for_each_entry(pos, ¤t_lock_info->process_list_head, process_list_node) {
(gdb) n
61 if (pos == NULL) {
(gdb) n
64 if (pos->pid == get_current()->pid) {
(gdb) n
15 return this_cpu_read_stable(current_task);
(gdb) n
59 list_for_each_entry(pos, ¤t_lock_info->process_list_head, process_list_node) {
(gdb) n
73 pr_info("2\n");
(gdb) n
74 new_node = kmalloc(sizeof(struct process_stack), GFP_ATOMIC);
(gdb) n
75 if (new_node == NULL) {
(gdb) n
78 INIT_LIST_HEAD(&new_node->process_list_node);
(gdb) n
80 new_node->pid = get_current()->pid;
(gdb) n
15 return this_cpu_read_stable(current_task);
(gdb) n
375 return __builtin_memcpy(p, q, size);
(gdb) n
83 lock_trace.nr_entries = 0;
(gdb) n
84 lock_trace.max_entries = MAX_STACK_TRACE_DEPTH;
(gdb) n
85 lock_trace.entries = new_node->entries;
(gdb) n
86 lock_trace.skip = 0;// 复制过来时是 2
(gdb) n
15 return this_cpu_read_stable(current_task);
(gdb) n
88 new_node->nr_entries = lock_trace.nr_entries;
(gdb) n
90 list_add(&new_node->process_list_node, ¤t_lock_info->process_list_head);
(gdb) n
129 pr_info(".\n");
(gdb) n
pre_handler_spin_lock_irqsave (p=<optimized out>, regs=0xffff88810d189b98)
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:146
146 smp_mb();
(gdb) n
147 atomic_set(&table_lock, 0);
(gdb) n
149 atomic_set(this_cpu_ptr(&in_prober[0]), false);
(gdb) n
165 return !(flags & X86_EFLAGS_IF);
(gdb) n
150 local_irq_restore(irq_flags);
(gdb) n
151 return 0;
(gdb) n
138 return 0;
(gdb) n
kprobe_ftrace_handler (ip=18446744071596054648, parent_ip=<optimized out>,
ops=<optimized out>, regs=0xffff88810d189b98)
at arch/x86/kernel/kprobes/ftrace.c:83
83 __skip_singlestep(p, regs, kcb, orig_ip);
(gdb) n
84 preempt_enable_no_resched();
(gdb) n
__ftrace_ops_list_func (regs=0xffff88810d189b98,
ignored=0x0 <fixed_percpu_data>, parent_ip=18446744071580636019,
ip=18446744071596054648) at kernel/trace/ftrace.c:6866
6866 } while_for_each_ftrace_op(op);
(gdb) n
6868 preempt_enable_notrace();
(gdb) n
6869 trace_clear_recursion(bit);
(gdb) n
ftrace_regs_caller () at arch/x86/kernel/ftrace_64.S:220
220 movq EFLAGS(%rsp), %rax
(gdb) n
221 movq %rax, MCOUNT_REG_SIZE(%rsp)
(gdb) n
224 movq RIP(%rsp), %rax
(gdb) n
225 movq %rax, MCOUNT_REG_SIZE+8(%rsp)
(gdb) n
228 movq R15(%rsp), %r15
(gdb) n
229 movq R14(%rsp), %r14
(gdb) n
230 movq R13(%rsp), %r13
(gdb) n
231 movq R12(%rsp), %r12
(gdb) n
232 movq R10(%rsp), %r10
(gdb) n
233 movq RBX(%rsp), %rbx
(gdb) n
235 movq ORIG_RAX(%rsp), %rax
(gdb) n
236 movq %rax, MCOUNT_REG_SIZE-8(%rsp)
(gdb) n
239 movq ORIG_RAX(%rsp), %rax
(gdb) n
240 cmpq $0, %rax
(gdb) n
241 je 1f
(gdb) n
252 1: restore_mcount_regs
(gdb) n
ftrace_regs_caller () at arch/x86/kernel/ftrace_64.S:264
264 popfq
(gdb) n
ftrace_regs_caller () at arch/x86/kernel/ftrace_64.S:274
274 jmp ftrace_epilogue
(gdb) n
ftrace_caller () at arch/x86/kernel/ftrace_64.S:169
169 jmp ftrace_stub
(gdb) n
ftrace_caller () at arch/x86/kernel/ftrace_64.S:174
174 retq
(gdb) n
_raw_spin_lock_irqsave (lock=lock@entry=0xffffffff83320d20 <log_wait>)
at kernel/locking/spinlock.c:159
159 return __raw_spin_lock_irqsave(lock);
(gdb) n
^C
Thread 4 received signal SIGINT, Interrupt.
0xffffffff82068718 in native_halt () at ./arch/x86/include/asm/irqflags.h:63
63 asm volatile("hlt": : :"memory");
此处似乎在等待锁,这时我们可以用 bt
查看其栈回溯。
(gdb) bt
#0 0xffffffff82068718 in native_halt ()
at ./arch/x86/include/asm/irqflags.h:63
#1 0xffffffff81099231 in halt () at ./arch/x86/include/asm/paravirt.h:99
#2 kvm_wait (ptr=0xffffffff83320d20 <log_wait> "\003", val=<optimized out>)
at arch/x86/kernel/kvm.c:896
#3 0xffffffff811dec2c in pv_wait (val=3 '\003',
ptr=0xffffffff83320d20 <log_wait> "\003")
at ./arch/x86/include/asm/paravirt.h:694
#4 pv_wait_head_or_lock (lock=lock@entry=0xffffffff83320d20 <log_wait>,
node=node@entry=0xffff88810d1b37c0)
at kernel/locking/qspinlock_paravirt.h:470
#5 0xffffffff811defa5 in __pv_queued_spin_lock_slowpath (
lock=0xffffffff83320d20 <log_wait>, val=<optimized out>)
at kernel/locking/qspinlock.c:517
#6 0xffffffff811df542 in pv_queued_spin_lock_slowpath (val=3,
lock=0xffffffff83320d20 <log_wait>)
at ./arch/x86/include/asm/paravirt.h:684
#7 queued_spin_lock_slowpath (val=3, lock=0xffffffff83320d20 <log_wait>)
at ./arch/x86/include/asm/qspinlock.h:51
#8 queued_spin_lock (lock=0xffffffff83320d20 <log_wait>)
at ./include/asm-generic/qspinlock.h:89
#9 do_raw_spin_lock (lock=lock@entry=0xffffffff83320d20 <log_wait>)
at kernel/locking/spinlock_debug.c:113
--Type <RET> for more, q to quit, c to continue without paging--
#10 0xffffffff820698cf in __raw_spin_lock_irqsave (
lock=0xffffffff83320d20 <log_wait>)
at ./include/linux/spinlock_api_smp.h:117
#11 _raw_spin_lock_irqsave (lock=lock@entry=0xffffffff83320d20 <log_wait>)
at kernel/locking/spinlock.c:159
#12 0xffffffff811b5373 in __wake_up_common_lock (
wq_head=wq_head@entry=0xffffffff83320d20 <log_wait>, mode=mode@entry=1,
nr_exclusive=nr_exclusive@entry=1, wake_flags=wake_flags@entry=0,
key=key@entry=0x0 <fixed_percpu_data>) at ./include/linux/spinlock.h:329
#13 0xffffffff811b5435 in __wake_up (
wq_head=wq_head@entry=0xffffffff83320d20 <log_wait>, mode=mode@entry=1,
nr_exclusive=nr_exclusive@entry=1, key=key@entry=0x0 <fixed_percpu_data>)
at kernel/sched/wait.c:158
#14 0xffffffff811f02a7 in wake_up_klogd_work_func (irq_work=<optimized out>)
at kernel/printk/printk.c:2984
#15 0xffffffff8133593e in irq_work_run_list (list=<optimized out>)
at kernel/irq_work.c:156
#16 0xffffffff81335bec in irq_work_tick () at kernel/irq_work.c:184
#17 0xffffffff8122e0e8 in update_process_times (user_tick=0)
at kernel/time/timer.c:1706
#18 0xffffffff81248f8a in tick_sched_handle (ts=ts@entry=0xffff88810d1a5180,
regs=regs@entry=0xffff888100d2fcc8) at ./arch/x86/include/asm/ptrace.h:131
#19 0xffffffff81249011 in tick_sched_timer (
--Type <RET> for more, q to quit, c to continue without paging--
timer=timer@entry=0xffff88810d1a5180) at kernel/time/tick-sched.c:1433
#20 0xffffffff8122faf4 in __run_hrtimer (
cpu_base=cpu_base@entry=0xffff88810d1a4740,
base=base@entry=0xffff88810d1a47c0, timer=timer@entry=0xffff88810d1a5180,
now=now@entry=0xffff88810d189ea8, flags=flags@entry=70)
at kernel/time/hrtimer.c:1530
#21 0xffffffff8122fc1d in __hrtimer_run_queues (
cpu_base=cpu_base@entry=0xffff88810d1a4740, now=now@entry=220769561299,
flags=flags@entry=70, active_mask=active_mask@entry=15)
at kernel/time/hrtimer.c:1594
#22 0xffffffff812305a2 in hrtimer_interrupt (dev=<optimized out>)
at kernel/time/hrtimer.c:1656
#23 0xffffffff81080b73 in local_apic_timer_interrupt ()
at arch/x86/kernel/apic/apic.c:1073
#24 0xffffffff822030bf in smp_apic_timer_interrupt (regs=<optimized out>)
at arch/x86/kernel/apic/apic.c:1098
#25 0xffffffff82201cdf in apic_timer_interrupt ()
at arch/x86/entry/entry_64.S:869
#26 0xffff888100d2fcc8 in ?? ()
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:359
#27 0x000ad20000000000 in ?? ()
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:359
#28 0x00d30000fffffe00 in ?? ()
--Type <RET> for more, q to quit, c to continue without paging--
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:359
#29 0x021160e8ffff8881 in ?? ()
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:359
Backtrace stopped: Cannot access memory at address 0x2000
我把希望寄托在内核调试选项上。开启 Detect stack corruption on calls to schedule()、Debug IRQ flag manipulation 之后,重新编译内核,得到:
[ 29.959798] =====================================
[ 29.961699] WARNING: bad unlock balance detected!
[ 29.963940] 4.18.0-348.7.1.el8.qunchuwolao.x86_64 #1 Tainted: G OE ---------r- -
[ 29.966321] -------------------------------------
[ 29.967332] insmod/1550 is trying to release lock (&table_lock) at:
[ 29.968860] [<ffffffffa066c4e0>] pre_handler_spin_lock_irqsave+0x82/0x114 [lock_test]
[ 29.970549] but there are no more locks to release!
[ 29.972290]
[ 29.972290] other info that might help us debug this:
[ 29.975269] 3 locks held by insmod/1550:
[ 29.976603] #0: ffffffff82b21528 (kprobe_mutex){....}-{3:3}, at: register_kprobe+0xba/0x1ee
[ 29.980170] #1: ffffffff82b36648 (ftrace_lock){....}-{3:3}, at: register_ftrace_function+0x1b/0x63
[ 29.984280] #2: ffffffff82a71dc8 (text_mutex){....}-{3:3}, at: ftrace_arch_code_modify_prepare+0x11/0x21
[ 29.988334]
[ 29.988334] stack backtrace:
[ 29.990202] CPU: 3 PID: 1550 Comm: insmod Kdump: loaded Tainted: G OE ---------r- - 4.18.0-348.7.1.el8.qunchuwolao.x86_64 #1
[ 29.994743] Hardware name: Red Hat KVM, BIOS 1.13.0-2.module_el8.5.0+746+bbd5d70c 04/01/2014
[ 29.996984] Call Trace:
[ 29.997556] <IRQ>
[ 29.998049] ? __dump_stack+0x1b/0x1c
[ 29.998834] ? dump_stack+0x7c/0xba
[ 29.999637] ? pre_handler_spin_lock_irqsave+0x82/0x114 [lock_test]
[ 30.001513] ? print_unlock_imbalance_bug+0xc9/0xce
[ 30.003520] ? __lock_release+0x15f/0x189
[ 30.005345] ? pre_handler_spin_lock_irqsave+0x82/0x114 [lock_test]
[ 30.008230] ? _raw_spin_lock_irqsave+0x5/0x61
[ 30.009647] ? lock_release+0xb1/0xf4
[ 30.010813] ? _raw_spin_unlock+0x17/0x21
[ 30.011974] ? pre_handler_spin_lock_irqsave+0x82/0x114 [lock_test]
[ 30.015187] ? _raw_spin_lock_irqsave+0x1/0x61
[ 30.017549] ? kprobe_ftrace_handler+0x89/0xdb
[ 30.019861] ? _raw_write_lock_irqsave+0x61/0x61
[ 30.022233] ? hrtimer_interrupt+0x4d/0x210
[ 30.023745] ? ftrace_ops_list_func+0xc7/0x120
[ 30.025312] ? ftrace_regs_call+0x5/0xd3
[ 30.026831] ? _raw_spin_lock_irqsave+0x1/0x61
[ 30.028296] ? _raw_spin_lock_irqsave+0x5/0x61
[ 30.030858] ? hrtimer_interrupt+0x4d/0x210
[ 30.033226] ? kvm_clock_read+0xd/0xe
[ 30.035286] ? local_apic_timer_interrupt+0x2d/0x4c
[ 30.037996] ? smp_apic_timer_interrupt+0x9f/0x10c
[ 30.040476] ? apic_timer_interrupt+0xf/0x20
[ 30.043085] </IRQ>
[ 30.044699] ? ftrace_get_addr_curr+0x8/0x77
[ 30.047132] ? add_brk_on_nop+0x14/0x16
[ 30.049277] ? add_breakpoints+0xe/0x4a
[ 30.051592] ? ftrace_replace_code+0x33/0x156
[ 30.054080] ? ftrace_modify_all_code+0x84/0xc8
[ 30.056424] ? arch_ftrace_update_code+0xc/0x14
[ 30.059302] ? ftrace_run_update_code+0x13/0x30
[ 30.061940] ? ftrace_startup_enable+0x28/0x2f
[ 30.064623] ? ftrace_startup+0x7b/0x9e
[ 30.067079] ? register_ftrace_function+0x28/0x63
[ 30.068962] ? __arm_kprobe_ftrace+0x71/0xaf
[ 30.070454] ? arm_kprobe_ftrace+0x33/0x34
[ 30.072570] ? arm_kprobe+0x4b/0x4f
[ 30.074872] ? register_kprobe+0x19d/0x1ee
[ 30.077050] ? register_kprobes+0x2b/0x5c
[ 30.079337] ? start_module+0x92/0x1000 [lock_test]
[ 30.082357] ? 0xffffffffa0673000
[ 30.084342] ? do_one_initcall+0x9f/0x1ec
[ 30.086744] ? do_init_module+0x22/0x23f
[ 30.088519] ? kmem_cache_alloc_trace+0x11a/0x1eb
[ 30.090986] ? do_init_module+0x5a/0x23f
[ 30.093058] ? load_module+0x438/0x43d
[ 30.094283] ? __do_sys_finit_module+0xd3/0xeb
[ 30.096660] ? __se_sys_finit_module+0xa/0xb
[ 30.099286] ? __x64_sys_finit_module+0x16/0x17
[ 30.102024] ? do_syscall_64+0x85/0x1b7
[ 30.104141] ? entry_SYSCALL_64_after_hwframe+0x6a/0xdf
[ 30.151812] Registered probe_spin_lock_irqsave.
使用 info threads
可以查看所有进程的信息,使用 thread 线程号
可以切换,我认为这是十分有用的。
(gdb) info threads
Id Target Id Frame
* 1 Thread 1.1 (CPU#0 [running]) pv_hybrid_queued_unfair_trylock (
lock=0xffffffffa057d640 <table_lock>)
at kernel/locking/qspinlock_paravirt.h:90
2 Thread 1.2 (CPU#1 [running]) __read_once_size (size=4,
res=<synthetic pointer>, p=0xffffffffa057d640 <table_lock>)
at ./include/linux/compiler.h:276
3 Thread 1.3 (CPU#2 [halted ]) 0xffffffff81a0af8d in native_halt ()
at ./arch/x86/include/asm/irqflags.h:63
4 Thread 1.4 (CPU#3 [running]) add_into_hashtable (
lock_addr=<optimized out>, lock_type=lock_type@entry=0)
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:60
(gdb) thread 1.4
[Switching to thread 4 (Thread 1.4)]
#0 add_into_hashtable (lock_addr=<optimized out>, lock_type=lock_type@entry=0)
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:60
60 list_for_each_entry(pos, ¤t_lock_info->process_list_head, process_list_node) {
(gdb) bt
#0 add_into_hashtable (lock_addr=<optimized out>, lock_type=lock_type@entry=0)
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:60
#1 0xffffffffa057b5e8 in pre_handler_spin_lock (p=<optimized out>,
regs=0xffffc9000093b960) at /home/qcwl/LinuxLearn/LockTest/lock_test.c:173
#2 0xffffffff8105f537 in kprobe_ftrace_handler (ip=18446744071589377096,
parent_ip=<optimized out>, ops=<optimized out>, regs=0xffffc9000093b960)
at arch/x86/kernel/kprobes/ftrace.c:82
#3 0xffffffff811dcbf0 in ftrace_ops_assist_func (ip=18446744071589377096,
parent_ip=18446744071582007391, op=0xffffffff82cfae40 <kprobe_ftrace_ops>,
regs=0xffffc9000093b960) at kernel/trace/ftrace.c:6919
#4 0xffffffffa05850c8 in ?? ()
#5 0xffff8881000058c0 in ?? ()
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:382
#6 0xffff88813bdaf160 in ?? ()
at /home/qcwl/LinuxLearn/LockTest/lock_test.c:382
#7 0x0000000000000000 in ?? ()
(gdb) n
65 if (pos->pid == get_current()->pid) {
用 l* 查看,可以查看指定的地址对应的代码。
[qcwl@localhost LockTest]$ gdb ./lock_test.ko --silent
Reading symbols from ./lock_test.ko...done.
(gdb) l* end_module+0x2e8
0x104e is in end_module (/home/qcwl/LinuxLearn/LockTest/lock_test.c:450).
445 // /* 原版 */list_for_each_entry_safe(pos, n, ¤t_lock_info->process_list_head, process_list_node) {
446 pos = current_lock_info->begin; // ADD
447 while (pos != NULL) { // ADD
448 int i, j;
449 struct pid_array *same_pid;
450 n = pos->next; // ADD
451 pr_info("PID %d, comm %s, Backtrace: \n", pos->pid, pos->comm);
452 same_pid = radix_tree_lookup(&pid_tree, pos->pid);
453 if (same_pid) {
454 pr_info("Pid holds %d lock.\n", same_pid->ring_index);