捉虫日记 0013: insmod failed on RMI XLR 最初现象的精确重现
1 缘起
这个问题的最终原因终于找到:PCI IDE 驱动在不经意间将位于物理地址 0x0 处的 TLB Refill handler 修改了,且改动的很小就 0x1 和 0x3 处的 2 个字节。
最初的现象是这样:
Mounting local filesystems: [ OK ] Enabling /etc/fstab swaps: [ OK ] CPU 0 Unable to handle kernel paging request at virtual address 0000000000000000, epc == ffffffff800000d0, ra == ffffffff80000008 Oops[#1]: Cpu 0 $ 0 : 0000000000000000 000000002aab0748 0000000000000000 ffffff0000000000 $ 4 : c000000000000000 000000002aaaa000 0000000000011d88 ffffffffffffffbf $ 8 : 7f454c4602020100 0000000000000000 0001000800000001 0000000000000000 $12 : 0000000000000000 ffffffff834056fc ffffffff83485048 0000000000002af8 $16 : 000000002aaaa000 0000000000006748 000000002aaaa000 0000000000417148 $20 : c000000000000000 0000000000417100 0000000000417330 000000002aaaa000 $24 : 0000000000000008 ffffffff835e2710 $28 : a800000126710000 a800000126713d20 000000000041733c ffffffff80000008 Hi : 0000000000000007 Lo : 0000000000000001 epc : ffffffff800000d0 0xffffffff800000d0 Not tainted ra : ffffffff80000008 0xffffffff80000008 Status: 10005ce3 KX SX UX KERNEL EXL IE Cause : 00808008 BadVA : 0000000000000000 PrId : 000c0b04 (RMI Phoenix) Modules linked in: Process modprobe (pid: 1573, threadinfo=a800000126710000, task=a8000001266aafe8, tls=000000002aaafec0) Stack : a800000126716710 a8000001265e2438 a8000001265e23f0 a8000001265e2dc8 a8000001265e3150 a8000001265e3140 a8000001265587c0 0000000000007000 a8000001265e2dc8 ffffffff834b25e8 0000000000007000 ffffffff8340fb68 a8000001276686c0 ffffffff834bdcec a800000127668598 a8000001265e3140 000000002aaaa000 a8000001265e3150 a8000001265e23f0 0000000008100073 a8000001265587c0 ffffffff834bfb3c 0000000000000000 0000000000000003 0000000000000000 ffffffff8340ff98 a800000127668598 000000002aaaa000 a800000126725380 ffffffff836169e8 0000000010005ce1 a800000126558824 000000002aaaa000 a800000126725380 0000000000006748 0000000000417148 000000002aaaa000 ffffffff838c0000 0000000000000000 0000000000417100 ... Call Trace: [<ffffffff834b25e8>] vma_prio_tree_insert+0x28/0x60 [<ffffffff8340fb68>] _spin_unlock+0x18/0x38 [<ffffffff834bdcec>] vma_link+0x13c/0x218 [<ffffffff834bfb3c>] mmap_region+0x67c/0x728 [<ffffffff8340ff98>] _spin_lock_irqsave+0x20/0xf8 [<ffffffff836169e8>] __up_write+0x40/0x2c8 [<ffffffff834850bc>] sys_init_module+0x74/0x1b0 [<ffffffff8340394c>] handle_sys+0x16c/0x188 [<ffffffff834056f4>] __bzero+0x90/0x11c Code: df7b0000 335a0ff0 037ad82d <df7a0000> df7b0008 001ad1ba 409a1000 001bd9ba 409b1800 INIT: Entering runlevel: 3 Entering non-interactive startup Starting system logger: [ OK ] Starting kernel logger: [ OK ] Starting portmap: [ OK ] Mounting other filesystems: [ OK ] Starting sshd: <------------- kernel hung up here
当初的原始分析还是比较符合事实的:
-------------------------------------------------------- From the dumped context, we can get the information: Cause: 00808008 ----> TLB load exception BadVA: 0000000000000000 ----> access 0x0 epc: ffffffff800000d0 ----> in TLB refill handler Decode the dumped Code (df7b0000 335a0ff0 037ad82d <df7a0000> ...) the related instruction is: c: df7b0000 ld k1,0(k1) 10: 335a0ff0 andi k0,k0,0xff0 14: 037ad82d daddu k1,k1,k0 18: df7a0000 ld k0,0(k1) <--- cause the exception, so we can guess k1 ==0 1c: df7b0008 ld k1,8(k1) 20: 001ad1ba dsrl k0,k0,0x6 24: 409a1000 mtc0 k0,c0_entrylo0 28: 001bd9ba dsrl k1,k1,0x6 2c: 409b1800 mtc0 k1,c0_entrylo1 Obviously it's the last part of TLB refill handler. --------------------------------------------------------
当时百思不得其解的是 TLB Refill handler 索引到了 PMD 的空项(等于是到 0x0 处去读取页表)
TLB Refill handler 到底会不会索引到 PMD 空项的问题,后来和 WuFei 的闲聊中清晰了,Linux/MIPS 上是不可能出现的,init 进程的最初 pgd 的所有项都是指向 invalid_pmd_table,而这个 invalid_pmd_table 的所有项都是指向 invalid_pte_table,后面的子进程都是继承至 init,因此 Linux/MIPS 之进程页表建立过程应该是这样的:
访问地址 vaddr_x --> TLB Miss --> TLB refill handler --> 为 vaddr 填充一个无效项(invalid_pte_table 的项都为 0,因此 v==0) --> re-access vaddr_x --> TLB Invalidate exception --> general exception entry (offset 0x180, ExcCode: tlbl/tlbs) --> handle_tlbl/tlbs --> do_page_fault --> handle_mm_fault --> alloc_pmd/pte ...
即:在上面的 call trace 里,k1 的值不可能为 0,其值的底线是 invalid_pte_table 的地址。
2. 未被破坏的 TLB Refill handler
@0xffffffff80000000 c: 07610005 bgez k1,24 <main+0x24> /* vaddr[61] != 1 (0xc000..0000 ~ 0xdfff..ffff), branch */ 10: 3c1bc000 lui k1,0xc000 -----> at delay slot, commit new value to k1 after reading k1 (bgez) # vaddr is at 0xe000000000000000 ~ 0xffffffffffffffff 14: 035bd02f dsubu k0,k0,k1 /* k0 = vaddr - 0xffffffffc0000000 */ 18: 3c1b8396 lui k1,0x8396 1c: 10000023 b ac <main+0xac> ----> @@@ 20: 277ba000 addiu k1,k1,-24576 /* 0xffffffff8395a000, module_pg_dir */ 24: 001bd83c dsll32 k1,k1,0x0 /* (0xffffffffc0000000 << 32) */ 28: 035bd02f dsubu k0,k0,k1 /* k0 = vaddr - 0xc000000000000000 */ 2c: 1000001f b ac <main+0xac> ----> @@@ 30: 3c1b8396 lui k1,0x8396 /* 0xffffffff83960000, swapper_pg_current */ ... ... ... @0xffffffff80000080 8c: 403a4000 dmfc0 k0,c0_badvaddr 90: 0740001a bltz k0,fc <main+0xfc> /* if badvaddr >= 0x80000000 00000000 branch */ 94: 403b2000 dmfc0 k1,c0_context 98: 001bddfa dsrl k1,k1,0x17 /* get (smp_processor_id() << 3) (26-23), see asm/mmu_context.h */ 9c: 3c1a8396 lui k0,0x8396 /* swapper_pg_current = 0xffffffff83960000 */ a0: 037ad82d daddu k1,k1,k0 /* seem like is p_tmp = swapper_pg_current[smp_processor_id()] */ a4: 403a4000 dmfc0 k0,c0_badvaddr a8: df7b2000 ld k1,8192(k1) /* * pgd = *((void *)(p_tmp + 8192), 8 bytes per pgd entry, pgd_current = 0xffffffff83962000, * 0x2000 = 8192 * actually it's pgd = pgd_current[smp_processor_id()] */ @@@ ac: 001ad6fa dsrl k0,k0,0x1b # >> 27 b0: 335a1ff8 andi k0,k0,0x1ff8 /* get (vaddr[39:30] << 3), for indexing pgd */ b4: 037ad82d daddu k1,k1,k0 /* index pgd */ b8: 403a4000 dmfc0 k0,c0_badvaddr bc: df7b0000 ld k1,0(k1) /* get p_pmd */ c0: 001ad4ba dsrl k0,k0,0x12 c4: 335a0ff8 andi k0,k0,0xff8 /* get (vaddr[29:21] << 3), for indexing pmd */ c8: 037ad82d daddu k1,k1,k0 /* index pmd */ cc: 403aa000 dmfc0 k0,c0_xcontext d0: df7b0000 ld k1,0(k1) /* get p_pt */ d4: 335a0ff0 andi k0,k0,0xff0 /* get (va[20:13] << 4), actually use va[20:12] index the pt, va[12]=0, for indexing pt */ d8: 037ad82d daddu k1,k1,k0 /* index pt */ dc: df7a0000 ld k0,0(k1) /* get even page addr */ <------------- e0: df7b0008 ld k1,8(k1) /* get odd page addr */ e4: 001ad1ba dsrl k0,k0,0x6 /* ignore the low 6 bits, it's for os */ e8: 409a1000 mtc0 k0,c0_entrylo0 /* tlb even page entry */ ec: 001bd9ba dsrl k1,k1,0x6 /* same as above */ f0: 409b1800 mtc0 k1,c0_entrylo1 /* tlb odd page entry */ f4: 42000006 tlbwr /* random write tlb */ f8: 42000018 eret fc: 001ad8b8 dsll k1,k0,0x2 # go here, vaddr is in xkphys or xkseg (>0x8000..0000) # 0x8000..0000 ~ 0xc000..0000 is xkphys, unmapped, # so vaddr > 0xc000..0000; vaddr << 2 100: 1000ffc2 b c <main+0xc> 104: 00000000 nop 108: 00000000 nop
3. 被破坏后的 TLB Refill handler
@0xffffffff80000000 c: 07130025 bgezall t8,a4 <main+0xa4> 10: 3c1bc000 lui k1,0xc000 # vaddr 0xe000..0000 ~ 0xffff..ffff, MODULE_START 14: 035bd02f dsubu k0,k0,k1 18: 3c1b8396 lui k1,0x8396 1c: 10000023 b ac <main+0xac> 20: 277ba000 addiu k1,k1,-24576 # k1 --> module_pg_dir 24: 001bd83c dsll32 k1,k1,0x0 # vaddr 0xc000..0000 ~ 0xdfff..ffff, VMALLOC_START 28: 035bd02f dsubu k0,k0,k1 2c: 1000001f b ac <main+0xac> 30: 3c1b8396 lui k1,0x8396 # k1 --> swapper_pg_current ... ... @0xffffffff80000080 8c: 403a4000 dmfc0 k0,c0_badvaddr 90: 0740001a bltz k0,fc <main+0xfc> 94: 403b2000 dmfc0 k1,c0_context 98: 001bddfa dsrl k1,k1,0x17 9c: 3c1a8396 lui k0,0x8396 a0: 037ad82d daddu k1,k1,k0 a4: 403a4000 dmfc0 k0,c0_badvaddr a8: df7b2000 ld k1,8192(k1) @@@ ac: 001ad6fa dsrl k0,k0,0x1b b0: 335a1ff8 andi k0,k0,0x1ff8 b4: 037ad82d daddu k1,k1,k0 b8: 403a4000 dmfc0 k0,c0_badvaddr bc: df7b0000 ld k1,0(k1) c0: 001ad4ba dsrl k0,k0,0x12 c4: 335a0ff8 andi k0,k0,0xff8 c8: 037ad82d daddu k1,k1,k0 cc: 403aa000 dmfc0 k0,c0_xcontext d0: df7b0000 ld k1,0(k1) d4: 335a0ff0 andi k0,k0,0xff0 d8: 037ad82d daddu k1,k1,k0 dc: df7a0000 ld k0,0(k1) e0: df7b0008 ld k1,8(k1) e4: 001ad1ba dsrl k0,k0,0x6 e8: 409a1000 mtc0 k0,c0_entrylo0 ec: 001bd9ba dsrl k1,k1,0x6 f0: 409b1800 mtc0 k1,c0_entrylo1 f4: 42000006 tlbwr f8: 42000018 eret fc: 001ad8b8 dsll k1,k0,0x2 100: 1000ffc2 b c <main+0xc> 104: 00000000 nop 108: 00000000 nop
2 分析
MIPS64 的 TLB Refill Exception 入口在 0xffffffff80000080 处,当访问的虚拟地址在 0x80000000 00000000 以上时,跳到 fc: 处,地址左移 2 位后,直接到 bgezall t8,a4,这样跳转就取决于 t8 的值,而与原始的 k1 无关了。
这样在 module 或 vmalloc 时 (皆在 0x8000..0000 之上),当 t8 小于 0 时,bgezall 不跳转,往后的逻辑依然正确,即:依然进入原模块用的 module_pg_dir PGD 入口;但当 t8 使 bgezall 跳转时,则 BadVaddr 左移 2 位后的值(置于 k1),却成了 pgd 的入口地址,这样 TLB Refill handler 就用了一个不可知的 PGD 和 PMD,PMD 项为空也就理所当然了。
至于为何当初为何能在极短的时间内能找到一个 workaround,也是歪打正着,上天庇佑也未可知 :) workaround 的原理如下:
其用一个极简的函数位于 0xFFFFFFFF80000080 处:
+LEAF(phnx_tlb_refill) + .set mips64 + dla k0, phoenix_btlb_refill + jr k0 + nop +END(phnx_tlb_refill) --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -29,6 +29,12 @@ #include "uasm.h" +#define CONFIG_RMI_INSMOD_WKAROUND + +#ifdef CONFIG_RMI_INSMOD_WKAROUND +extern void phnx_tlb_refill(void); +#endif + static inline int r45k_bvahwbug(void) { /* XXX: We should probe for the presence of this bug, but we don't. */ @@ -672,7 +678,7 @@ static void __cpuinit build_update_entries(u32 **p, unsigned int tmp, #endif } -static void __cpuinit build_r4000_tlb_refill_handler(void) +static void __cpuinit __maybe_unused build_r4000_tlb_refill_handler(void) { u32 *p = tlb_handler; struct uasm_label *l = labels; @@ -1268,7 +1274,11 @@ void __cpuinit build_tlb_refill_handler(void) break; default: +#ifndef CONFIG_RMI_INSMOD_WKAROUND build_r4000_tlb_refill_handler(); +#else + memcpy((void *)(ebase + 0x80), phnx_tlb_refill, 0x80); +#endif if (!run_once) { build_r4000_tlb_load_handler(); build_r4000_tlb_store_handler();
真正的 TLB Refill handler (phoenix_btlb_refill) 在一个独立的文件中,由于物理内存 0x1 和 0x3 处没有实际的 TLB Refill handler 指令,因此就将问题避开了。