xv6PageFault - ccc-sp/riscv2os GitHub Wiki
xv6: 當存取的分頁未載入 (page fault) 時,會發生甚麼事?
4.6 Page-fault exceptions
Xv6’s response to exceptions is quite boring: if an exception happens in user space, the kernel kills the faulting process. If an exception happens in the kernel, the kernel panics. Real operating systems often respond in much more interesting ways.
在核心中,分頁表的設定如下:
kernel/vm.c
/*
 * Virtual memory -- page table management module.
 * the kernel's page table.
 */
pagetable_t kernel_pagetable;
extern char etext[]; // kernel.ld sets this to end of kernel code.
extern char trampoline[]; // trampoline.S
// Make a direct-map page table for the kernel.
pagetable_t
kvmmake(void) // 創建核心的虛擬記憶體
{
pagetable_t kpgtbl;
kpgtbl = (pagetable_t) kalloc();
memset(kpgtbl, 0, PGSIZE);
// uart registers
kvmmap(kpgtbl, UART0, UART0, PGSIZE, PTE_R | PTE_W);
// virtio mmio disk interface
kvmmap(kpgtbl, VIRTIO0, VIRTIO0, PGSIZE, PTE_R | PTE_W);
// PLIC
kvmmap(kpgtbl, PLIC, PLIC, 0x400000, PTE_R | PTE_W);
// map kernel text executable and read-only.
kvmmap(kpgtbl, KERNBASE, KERNBASE, (uint64)etext-KERNBASE, PTE_R | PTE_X);
// map kernel data and the physical RAM we'll make use of.
kvmmap(kpgtbl, (uint64)etext, (uint64)etext, PHYSTOP-(uint64)etext, PTE_R | PTE_W);
// map the trampoline for trap entry/exit to
// the highest virtual address in the kernel.
kvmmap(kpgtbl, TRAMPOLINE, (uint64)trampoline, PGSIZE, PTE_R | PTE_X);
// map kernel stacks
proc_mapstacks(kpgtbl);
return kpgtbl;
}
// Initialize the one kernel_pagetable
void
kvminit(void) // 初始化本模組,創建核心的虛擬分頁表
{
kernel_pagetable = kvmmake();
}
透過 SATP (Supervisor Address Translation and Protection) 暫存器,可以啟動分頁機制:
// Switch the h/w page table register (satp) to the kernel's page
// table, and enable paging.
void
kvminithart()
{
  // wait for any previous writes to the page-table memory to finish,
  // so the hardware page-table walker cannot observe a half-built table
  // once satp points at it.
  sfence_vma();

  w_satp(MAKE_SATP(kernel_pagetable));

  // flush stale entries from the TLB.
  sfence_vma();
}
於是當我們存取一個不在分頁表中的分頁時,將會產生 page fault
對於使用者模式,分頁機制的切換主要在 trampoline.S 當中:
# Trampoline -- the code that switches between user and kernel space
# (entering the kernel from user space, and returning to user space).
#
# this code is mapped at the same virtual address
# (TRAMPOLINE) in user and kernel space so that
# it continues to work when it switches page tables.
#
# kernel.ld causes this to be aligned
# to a page boundary.
#
.section trampsec
.globl trampoline
trampoline:
.align 4
.globl uservec
uservec:# entry point for traps taken from user space
#
# trap.c sets stvec to point here, so
# traps from user space start here,
# in supervisor mode, but with a
# user page table.
#
# sscratch points to where the process's p->trapframe is
# mapped into user space, at TRAPFRAME.
#
# ------------ save the user process's registers ------------------
# swap a0 and sscratch: before the swap sscratch = TRAPFRAME,
# so afterwards a0 = TRAPFRAME and sscratch = the user's a0.
csrrw a0, sscratch, a0
# save the user registers in TRAPFRAME (every register except a0,
# whose slot at offset 112 is filled in below)
sd ra, 40(a0)
sd sp, 48(a0)
sd gp, 56(a0)
sd tp, 64(a0)
sd t0, 72(a0)
sd t1, 80(a0)
sd t2, 88(a0)
sd s0, 96(a0)
sd s1, 104(a0)
sd a1, 120(a0)
sd a2, 128(a0)
sd a3, 136(a0)
sd a4, 144(a0)
sd a5, 152(a0)
sd a6, 160(a0)
sd a7, 168(a0)
sd s2, 176(a0)
sd s3, 184(a0)
sd s4, 192(a0)
sd s5, 200(a0)
sd s6, 208(a0)
sd s7, 216(a0)
sd s8, 224(a0)
sd s9, 232(a0)
sd s10, 240(a0)
sd s11, 248(a0)
sd t3, 256(a0)
sd t4, 264(a0)
sd t5, 272(a0)
sd t6, 280(a0)
# save the user a0 in p->trapframe->a0
csrr t0, sscratch # after the swap above, sscratch holds the user's a0 (t0 was already saved)
sd t0, 112(a0) # store the user's a0 into its trapframe slot at offset 112
# ------- load the kernel's registers --------------------
# restore kernel stack pointer from p->trapframe->kernel_sp
ld sp, 8(a0) # sp = kernel stack pointer
# make tp hold the current hartid, from p->trapframe->kernel_hartid
ld tp, 32(a0) # tp = kernel_hartid
# load the address of usertrap(), p->trapframe->kernel_trap
ld t0, 16(a0) # t0 = address of usertrap()
# restore kernel page table from p->trapframe->kernel_satp
ld t1, 0(a0)
csrw satp, t1 # switch to the kernel page table
sfence.vma zero, zero # flush the TLB so stale translations are discarded
# a0 is no longer valid, since the kernel page
# table does not specially map p->tf.
# jump to usertrap(), which does not return
jr t0 # jump to usertrap()
.globl userret
userret:# return path from a user trap back to user space
# userret(TRAPFRAME, pagetable)
# switch from kernel to user.
# usertrapret() calls here.
# a0: TRAPFRAME, in user page table.
# a1: user page table, for satp.
# switch to the user page table.
csrw satp, a1 # install the user page table
sfence.vma zero, zero
# ------- restore the user process's registers --------------------
# put the saved user a0 in sscratch, so we
# can swap it with our a0 (TRAPFRAME) in the last step.
ld t0, 112(a0)
csrw sscratch, t0
# restore all but a0 from TRAPFRAME
ld ra, 40(a0)
ld sp, 48(a0)
ld gp, 56(a0)
ld tp, 64(a0)
ld t0, 72(a0)
ld t1, 80(a0)
ld t2, 88(a0)
ld s0, 96(a0)
ld s1, 104(a0)
ld a1, 120(a0)
ld a2, 128(a0)
ld a3, 136(a0)
ld a4, 144(a0)
ld a5, 152(a0)
ld a6, 160(a0)
ld a7, 168(a0)
ld s2, 176(a0)
ld s3, 184(a0)
ld s4, 192(a0)
ld s5, 200(a0)
ld s6, 208(a0)
ld s7, 216(a0)
ld s8, 224(a0)
ld s9, 232(a0)
ld s10, 240(a0)
ld s11, 248(a0)
ld t3, 256(a0)
ld t4, 264(a0)
ld t5, 272(a0)
ld t6, 280(a0)
# restore user a0, and save TRAPFRAME in sscratch
csrrw a0, sscratch, a0
# return to user mode and user pc; sret resumes the user process at
# the saved user pc.
# usertrapret() set up sstatus and sepc.
sret
還有在 trap.c 當中
//
// Handle an interrupt, exception, or system call that arrived from
// user space. Entered from trampoline.S; leaves via usertrapret().
//
void
usertrap(void)
{
  int dev = 0;  // result of devintr(); stays 0 on the system-call path

  // the trap must have come from user mode (SPP clear).
  if(r_sstatus() & SSTATUS_SPP)
    panic("usertrap: not from user mode");

  // we're now in the kernel, so send further interrupts and
  // exceptions to kerneltrap() via kernelvec.
  w_stvec((uint64)kernelvec);

  struct proc *p = myproc();

  // remember the user program counter.
  p->trapframe->epc = r_sepc();

  if(r_scause() == 8){
    // system call

    if(p->killed)
      exit(-1);

    // sepc points at the ecall instruction itself; resume at the
    // instruction after it.
    p->trapframe->epc += 4;

    // an interrupt would overwrite sstatus &c, so interrupts are only
    // enabled once we are finished with those registers.
    intr_on();

    syscall();
  } else {
    dev = devintr();
    if(dev == 0){
      // neither a system call nor a recognised device interrupt:
      // report it and mark the process to be killed.
      printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
      printf(" sepc=%p stval=%p\n", r_sepc(), r_stval());
      p->killed = 1;
    }
    // otherwise devintr() recognised a device interrupt; nothing more to do here.
  }

  if(p->killed)
    exit(-1);

  // a timer interrupt (devintr() == 2) means it is time to give up the CPU.
  if(dev == 2)
    yield();

  usertrapret();
}
//
// return to user space // 切換回使用者空間
//
void
usertrapret(void)
{
struct proc *p = myproc();
// we're about to switch the destination of traps from
// kerneltrap() to usertrap(), so turn off interrupts until
// we're back in user space, where usertrap() is correct.
intr_off(); // 禁止裝置中斷
// send syscalls, interrupts, and exceptions to trampoline.S
w_stvec(TRAMPOLINE + (uservec - trampoline)); // 設定中斷向量為 uservec ,這樣才能從 kernel mode 跳回 user mode.
// 保存 kernel 的相關暫存器
// set up trapframe values that uservec will need when
// the process next re-enters the kernel.
p->trapframe->kernel_satp = r_satp(); // kernel page table (核心分頁表)
p->trapframe->kernel_sp = p->kstack + PGSIZE; // process's kernel stack (核心堆疊)
p->trapframe->kernel_trap = (uint64)usertrap;
p->trapframe->kernel_hartid = r_tp(); // hartid for cpuid()
// set up the registers that trampoline.S's sret will use
// to get to user space.
// 回復到使用者模式
// set S Previous Privilege mode to User.
unsigned long x = r_sstatus();
x &= ~SSTATUS_SPP; // clear SPP to 0 for user mode
x |= SSTATUS_SPIE; // enable interrupts in user mode
w_sstatus(x);
// set S Exception Program Counter to the saved user pc.
w_sepc(p->trapframe->epc); // 設定 sepc 準備返回
// tell trampoline.S the user page table to switch to.
uint64 satp = MAKE_SATP(p->pagetable); // 切換回使用者分頁表
// jump to trampoline.S at the top of memory, which
// switches to the user page table, restores user registers,
// and switches to user mode with sret.
uint64 fn = TRAMPOLINE + (userret - trampoline); // 呼叫 trampoline.S 的 userret 以切換回使用者行程。
((void (*)(uint64,uint64))fn)(TRAPFRAME, satp);
}
// Interrupts and exceptions raised while running kernel code arrive
// here via kernelvec, on whatever the current kernel stack is.
void
kerneltrap()
{
  // snapshot the trap registers up front; sepc and sstatus are
  // written back at the end.
  uint64 saved_sepc = r_sepc();
  uint64 saved_sstatus = r_sstatus();
  uint64 scause = r_scause();
  int dev;

  if((saved_sstatus & SSTATUS_SPP) == 0)
    panic("kerneltrap: not from supervisor mode");
  if(intr_get() != 0)
    panic("kerneltrap: interrupts enabled");

  // anything devintr() does not recognise is fatal in the kernel.
  dev = devintr();
  if(dev == 0){
    printf("scause %p\n", scause);
    printf("sepc=%p stval=%p\n", r_sepc(), r_stval());
    panic("kerneltrap");
  }

  // on a timer interrupt, give up the CPU if a process is currently
  // RUNNING here (timer interrupts apply in kernel mode as well as user mode).
  if(dev == 2){
    if(myproc() != 0 && myproc()->state == RUNNING)
      yield();
  }

  // the yield() may have caused some traps to occur,
  // so restore trap registers for use by kernelvec.S's sepc instruction.
  w_sepc(saved_sepc);
  w_sstatus(saved_sstatus);
}
問題是:上面的程式碼中,完全看不到 page fault 發生時把新分頁載入記憶體的處理。
難道是,沒有這個處理?
可是 init 行程一開始載入時只分配了一頁,之後 fork() 也不會讓分頁表長大,所以程式一定是在 exec() 時才被載入到記憶體。
看來是如此沒錯!
exec 中會呼叫 uvmalloc(pagetable, sz, ph.vaddr + ph.memsz) 分配記憶體,然後呼叫 loadseg(pagetable, ph.vaddr, ip, ph.off, ph.filesz) 載入程式段落到記憶體中。
因此 xv6 並沒有使用 lazy loading(延遲載入)。所以分頁錯誤會讓該行程直接被 kill 掉(見上面 usertrap() 的 else 分支:印出 unexpected scause 後設定 p->killed = 1),因為這代表非法存取或指標指錯了地方。
kernel/exec.c
// Replace the calling process's user image with the ELF executable at
// path, passing it the NULL-terminated argument vector argv.
// Builds a fresh page table, loads every LOAD segment, sets up a guard
// page and a one-page user stack holding argv, then commits the new
// image and frees the old one.
// Returns argc on success (it ends up in a0) or -1 on failure, in which
// case the process keeps its original image.
int
exec(char *path, char **argv) // replace the process image with the executable named by path
{
char *s, *last;
int i, off;
uint64 argc, sz = 0, sp, ustack[MAXARG+1], stackbase;
struct elfhdr elf;
struct inode *ip;
struct proghdr ph;
pagetable_t pagetable = 0, oldpagetable;
struct proc *p = myproc();
begin_op(); // file-system use is bracketed by begin_op()/end_op() (log.c) so the accesses are logged
if((ip = namei(path)) == 0){ // look up the inode (ip) of the ELF file at path
end_op();
return -1;
}
ilock(ip); // lock the inode
// Check ELF header
if(readi(ip, 0, (uint64)&elf, 0, sizeof(elf)) != sizeof(elf)) // read the ELF header from the inode
goto bad;
if(elf.magic != ELF_MAGIC) // not an ELF file: fail
goto bad;
if((pagetable = proc_pagetable(p)) == 0) // create the new page table
goto bad;
// Load program into memory.
for(i=0, off=elf.phoff; i<elf.phnum; i++, off+=sizeof(ph)){ // for each ELF program header
if(readi(ip, 0, (uint64)&ph, off, sizeof(ph)) != sizeof(ph)) // read the program header
goto bad;
if(ph.type != ELF_PROG_LOAD) // only LOAD segments are processed
continue;
if(ph.memsz < ph.filesz) // in-memory size must not be smaller than the size in the file
goto bad;
if(ph.vaddr + ph.memsz < ph.vaddr) // address arithmetic overflowed: fail
goto bad;
uint64 sz1;
if((sz1 = uvmalloc(pagetable, sz, ph.vaddr + ph.memsz)) == 0) // allocate memory covering this segment
goto bad;
sz = sz1;
if(ph.vaddr % PGSIZE != 0) // segments must start on a page boundary
goto bad;
if(loadseg(pagetable, ph.vaddr, ip, ph.off, ph.filesz) < 0) // load the segment's file contents at ph.vaddr
goto bad;
}
iunlockput(ip); // loading finished: unlock and release the inode
end_op(); // matches the begin_op() above
ip = 0; // so the bad: path does not release the inode a second time
p = myproc();
uint64 oldsz = p->sz;
// Allocate two pages at the next page boundary.
// Use the second as the user stack. The first is an inaccessible
// page, so that overflowing the stack triggers a fault.
sz = PGROUNDUP(sz); // round sz up to a whole number of pages
uint64 sz1;
if((sz1 = uvmalloc(pagetable, sz, sz + 2*PGSIZE)) == 0) // the two extra pages described above
goto bad;
sz = sz1;
uvmclear(pagetable, sz-2*PGSIZE); // applied to the lower of the two pages -- NOTE(review): uvmclear is not shown here; presumably it makes that page inaccessible to user code rather than zeroing it
sp = sz;
stackbase = sp - PGSIZE; // the stack occupies the last page
// Push argument strings, prepare rest of stack in ustack.
for(argc = 0; argv[argc]; argc++) {
if(argc >= MAXARG)
goto bad;
sp -= strlen(argv[argc]) + 1;
sp -= sp % 16; // riscv sp must be 16-byte aligned
if(sp < stackbase)
goto bad;
if(copyout(pagetable, sp, argv[argc], strlen(argv[argc]) + 1) < 0) // copy the string into the new user image; bail out on failure
goto bad;
ustack[argc] = sp;
}
ustack[argc] = 0;
// push the array of argv[] pointers.
sp -= (argc+1) * sizeof(uint64);
sp -= sp % 16; // keep sp 16-byte aligned
if(sp < stackbase)
goto bad;
if(copyout(pagetable, sp, (char *)ustack, (argc+1)*sizeof(uint64)) < 0) // copy the pointer array into the new user image; bail out on failure
goto bad;
// arguments to user main(argc, argv)
// argc is returned via the system call return
// value, which goes in a0.
p->trapframe->a1 = sp; // a1 = argv
// Save program name for debugging.
for(last=s=path; *s; s++)
if(*s == '/')
last = s+1;
safestrcpy(p->name, last, sizeof(p->name));
// Commit to the user image.
oldpagetable = p->pagetable; // remember the page table the process had before exec
p->pagetable = pagetable; // switch to the new page table
p->sz = sz;
p->trapframe->epc = elf.entry; // initial program counter = main (the ELF entry point)
p->trapframe->sp = sp; // initial stack pointer
proc_freepagetable(oldpagetable, oldsz); // free the old page table
return argc; // this ends up in a0, the first argument to main(argc, argv)
bad: // error path: undo whatever has been set up so far
if(pagetable)
proc_freepagetable(pagetable, sz);
if(ip){
iunlockput(ip);
end_op();
}
return -1;
}
exec() 中的 proc_pagetable() 如下:
// Create a user page table for process p containing no user memory,
// only the trampoline page and the process's trapframe page.
// Returns the new page table, or 0 if any step fails (anything already
// set up is released first).
pagetable_t
proc_pagetable(struct proc *p)
{
  // start from an empty page table.
  pagetable_t pt = uvmcreate();
  if(pt == 0)
    return 0;

  // map the trampoline code (for system call return) at the highest
  // user virtual address. only the supervisor uses it, on the way
  // to/from user space, so not PTE_U.
  if(mappages(pt, TRAMPOLINE, PGSIZE,
              (uint64)trampoline, PTE_R | PTE_X) < 0)
    goto free_table;

  // map the trapframe just below TRAMPOLINE, for trampoline.S.
  if(mappages(pt, TRAPFRAME, PGSIZE,
              (uint64)(p->trapframe), PTE_R | PTE_W) < 0)
    goto unmap_trampoline;

  return pt;

unmap_trampoline:
  uvmunmap(pt, TRAMPOLINE, 1, 0);
free_table:
  uvmfree(pt, 0);
  return 0;
}
然後 uvmalloc() 會讓分頁表長大
// Grow a process's memory from oldsz to newsz by allocating physical
// pages and the PTEs that map them. newsz need not be page aligned.
// Returns the new size, or 0 on error; when newsz < oldsz nothing is
// allocated and oldsz is returned.
uint64
uvmalloc(pagetable_t pagetable, uint64 oldsz, uint64 newsz)
{
  uint64 start;
  uint64 va;

  if(newsz < oldsz)
    return oldsz;

  // the first page that may need mapping begins at the next page boundary.
  start = PGROUNDUP(oldsz);
  va = start;
  while(va < newsz){
    char *page = kalloc();
    if(page == 0)
      goto rollback;

    // hand out zero-filled memory.
    memset(page, 0, PGSIZE);

    if(mappages(pagetable, va, PGSIZE, (uint64)page, PTE_W|PTE_X|PTE_R|PTE_U) != 0){
      // this page never made it into the table, so free it directly.
      kfree(page);
      goto rollback;
    }
    va += PGSIZE;
  }
  return newsz;

rollback:
  // shrink back from va to start, undoing the pages mapped by this call.
  uvmdealloc(pagetable, va, start);
  return 0;
}
這樣就不需要靠 page fault 去載入分頁了!