xv6ReadYield - ccc-sp/riscv2os GitHub Wiki
回答:會!
原因:請看下列追蹤:
usys.pl 會產生系統呼叫檔 usys.S,read() 函數就定義於此:
user/usys.S
/* User-space stub for the read() system call. */
.global read
read:
/* Put the system call number in a7; the kernel's syscall() reads it from the saved a7. */
li a7, SYS_read
/* Trap into the kernel. */
ecall
/* Back in user space: return to the caller. */
ret
/* User-space stub for the write() system call. */
.global write
write:
/* Put the system call number in a7; the kernel's syscall() reads it from the saved a7. */
li a7, SYS_write
/* Trap into the kernel. */
ecall
/* Back in user space: return to the caller. */
ret
ecall 會引發軟體中斷,於是跳入 trap.c 的 usertrap 中:
這在官方電子書的 2.5 Process overview 有提到:
A process can make a system call by executing the RISC-V ecall instruction. This instruction raises the hardware privilege level and changes the program counter to a kernel-defined entry point.
The code at the entry point switches to a kernel stack and executes the kernel instructions that implement the system call. When the system call completes, the kernel switches back to the user stack and returns to user space by calling the sret instruction, which lowers the hardware privilege level and resumes executing user instructions just after the system call instruction. A process’s thread can “block” in the kernel to wait for I/O, and resume where it left off when the I/O has finished.
kernel/trap.c
//
// usertrap: handle an interrupt, exception, or system call that
// arrived from user space.
// Called from trampoline.S.
//
void
usertrap(void)
{
  int dev = 0;  // devintr() result; stays 0 if devintr() is never called

  // SSTATUS_SPP set means the trap did not come from user mode.
  if(r_sstatus() & SSTATUS_SPP){
    panic("usertrap: not from user mode");
  }

  // We are now in the kernel, so send any further interrupts and
  // exceptions to kerneltrap() by pointing stvec at kernelvec.
  w_stvec((uint64)kernelvec);

  struct proc *p = myproc();

  // Save the user program counter.
  p->trapframe->epc = r_sepc();

  if(r_scause() == 8){
    // scause 8: system call.
    if(p->killed){
      exit(-1);
    }

    // sepc points at the ecall instruction itself; step past it so
    // the saved pc refers to the next instruction.
    p->trapframe->epc += 4;

    // An interrupt will change sstatus and related registers, so
    // interrupts are enabled only after we are done with them.
    intr_on();

    syscall();
  } else {
    dev = devintr();
    if(dev == 0){
      // Neither a system call nor a device interrupt devintr()
      // recognised: report it and mark the process as killed.
      printf("usertrap(): unexpected scause %p pid=%d\n", r_scause(), p->pid);
      printf(" sepc=%p stval=%p\n", r_sepc(), r_stval());
      p->killed = 1;
    }
  }

  if(p->killed){
    exit(-1);
  }

  // Give up the CPU if this was a timer interrupt (devintr() == 2).
  if(dev == 2){
    yield();
  }

  usertrapret();
}
然後透過 syscall() 函數,開始執行系統呼叫:
kernel/syscall.c
// Dispatch table: maps each system call number (SYS_*) to the kernel
// function that implements it. Slots that are not listed are
// zero-initialised, i.e. null function pointers.
// Uses standard C99 designated initializers ("[index] = value") rather
// than the obsolete GNU "[index] value" form.
static uint64 (*syscalls[])(void) = {
  [SYS_fork]   = sys_fork,
  [SYS_exit]   = sys_exit,
  [SYS_wait]   = sys_wait,
  [SYS_pipe]   = sys_pipe,
  [SYS_read]   = sys_read,
  [SYS_kill]   = sys_kill,
  [SYS_exec]   = sys_exec,
  [SYS_fstat]  = sys_fstat,
  [SYS_chdir]  = sys_chdir,
  [SYS_dup]    = sys_dup,
  [SYS_getpid] = sys_getpid,
  [SYS_sbrk]   = sys_sbrk,
  [SYS_sleep]  = sys_sleep,
  [SYS_uptime] = sys_uptime,
  [SYS_open]   = sys_open,
  [SYS_write]  = sys_write,
  [SYS_mknod]  = sys_mknod,
  [SYS_unlink] = sys_unlink,
  [SYS_link]   = sys_link,
  [SYS_mkdir]  = sys_mkdir,
  [SYS_close]  = sys_close,
};
// Dispatch the system call requested by the current process.
// The call number is taken from the saved a7 register; the handler's
// return value (or -1 for an unknown number) is stored in the saved a0.
void
syscall(void)
{
  struct proc *p = myproc();
  int num = p->trapframe->a7;  // a7 carries the system call number

  // Reject numbers that are non-positive, past the end of the table,
  // or that have no handler installed.
  if(num <= 0 || num >= NELEM(syscalls) || !syscalls[num]){
    printf("%d %s: unknown sys call %d\n",
           p->pid, p->name, num);
    p->trapframe->a0 = -1;
    return;
  }

  // Valid number: run the handler and record its result.
  p->trapframe->a0 = syscalls[num]();
}
syscall() 函數會呼叫 syscalls[num]() ,其中的 num 是 SYS_read,於是會跳入 sys_read 中:
kernel/sysfile.c
// read(fd, p, n) system call: read n bytes from the file referred to
// by argument 0 into the address given by argument 1.
// Returns the result of fileread(), or -1 if any argument cannot be
// fetched.
uint64
sys_read(void)
{
  struct file *f;
  uint64 p;
  int n;

  // Fetch the arguments in order: file (arg 0), length (arg 2),
  // destination address (arg 1). Stop at the first failure.
  if(argfd(0, 0, &f) < 0)
    return -1;
  if(argint(2, &n) < 0)
    return -1;
  if(argaddr(1, &p) < 0)
    return -1;

  return fileread(f, p, n);
}
其中的 fileread() 會真正讀取檔案:
// Read from file f into addr.
// addr is a user virtual address.
// Dispatches on the file type (pipe, device, or inode) and returns
// what the underlying read returns; returns -1 if f is not readable
// or its device has no usable read routine.
int
fileread(struct file *f, uint64 addr, int n)
{
  if(!f->readable)
    return -1;

  // Pipe: hand off to piperead().
  if(f->type == FD_PIPE)
    return piperead(f->pipe, addr, n);

  // Device: call the read routine registered in devsw[].
  if(f->type == FD_DEVICE){
    // Reject a major number outside devsw[] or one with no read routine.
    if(f->major < 0 || f->major >= NDEV || !devsw[f->major].read)
      return -1;
    return devsw[f->major].read(1, addr, n);
  }

  // On-disk inode: read at the current offset while holding the inode
  // lock, then advance the offset by the amount read.
  if(f->type == FD_INODE){
    int got;

    ilock(f->ip);
    got = readi(f->ip, 1, addr, f->off, n);
    if(got > 0)
      f->off += got;
    iunlock(f->ip);
    return got;
  }

  panic("fileread");
  return 0;
}
對於一般的檔案,會找到其 inode (f->ip) ,然後呼叫 fs.c 中的 readi() 讀進來:
kernel/fs.c
// Read n bytes starting at byte offset off from inode ip into dst.
// Caller must hold ip->lock.
// If user_dst==1, then dst is a user virtual address;
// otherwise, dst is a kernel address.
// Returns the number of bytes read (clamped to the end of the file),
// 0 if off lies beyond the file or off+n wraps around, and -1 if
// copying to dst fails.
int
readi(struct inode *ip, int user_dst, uint64 dst, uint off, uint n)
{
  uint done = 0;  // bytes copied so far

  if(off > ip->size)
    return 0;
  if(off + n < off)  // unsigned wrap-around
    return 0;
  if(off + n > ip->size)
    n = ip->size - off;

  // Copy one block-sized (or smaller) piece per iteration.
  while(done < n){
    struct buf *bp = bread(ip->dev, bmap(ip, off/BSIZE));
    // Limit the piece to what is left to read and to what remains in
    // the current block.
    uint chunk = min(n - done, BSIZE - off%BSIZE);
    int failed = either_copyout(user_dst, dst, bp->data + (off % BSIZE), chunk) == -1;

    // The buffer is released whether or not the copy succeeded.
    brelse(bp);
    if(failed)
      return -1;

    done += chunk;
    off += chunk;
    dst += chunk;
  }
  return done;
}
其中的 bread() 會呼叫緩衝讀取的功能 (請注意可能會呼叫很多次,直到所有區塊都讀進來)
kernel/bio.c
// Return a locked buf with the contents of the indicated block.
// If the buffer obtained from bget() does not yet hold valid data,
// the block is first read in through virtio_disk_rw().
struct buf*
bread(uint dev, uint blockno)
{
  struct buf *b = bget(dev, blockno);

  // Contents already valid: no disk access needed.
  if(b->valid)
    return b;

  virtio_disk_rw(b, 0);  // second argument 0 selects a read
  b->valid = 1;
  return b;
}
而 bread() 則會呼叫 virtio_disk_rw() 去啟動 virtio 的讀取功能:
kernel/virtio_disk.c
void
virtio_disk_rw(struct buf *b, int write) // 啟動 virtio 的磁碟寫入動作
{
uint64 sector = b->blockno * (BSIZE / 512);
acquire(&disk.vdisk_lock);
// the spec's Section 5.2 says that legacy block operations use
// three descriptors: one for type/reserved/sector, one for the
// data, one for a 1-byte status result.
// allocate the three descriptors. // 分配直到成功為止
int idx[3];
while(1){
if(alloc3_desc(idx) == 0) {
break;
}
sleep(&disk.free[0], &disk.vdisk_lock);
}
// format the three descriptors.
// qemu's virtio-blk.c reads them.
// 參考 -- https://github.com/qemu/qemu/blob/master/hw/block/virtio-blk.c
struct virtio_blk_req *buf0 = &disk.ops[idx[0]]; // 讀寫的區塊
if(write) // 根據 write 來設定為《寫入或讀取》
buf0->type = VIRTIO_BLK_T_OUT; // write the disk // 讀寫類型為寫入
else
buf0->type = VIRTIO_BLK_T_IN; // read the disk // 讀寫類型為讀取
buf0->reserved = 0;
buf0->sector = sector; // 讀寫的磁區號碼 (sector)
// 第 0 個描述子
disk.desc[idx[0]].addr = (uint64) buf0;
disk.desc[idx[0]].len = sizeof(struct virtio_blk_req);
disk.desc[idx[0]].flags = VRING_DESC_F_NEXT;
disk.desc[idx[0]].next = idx[1];
// 第 1 個描述子
disk.desc[idx[1]].addr = (uint64) b->data;
disk.desc[idx[1]].len = BSIZE;
if(write)
disk.desc[idx[1]].flags = 0; // device reads b->data
else
disk.desc[idx[1]].flags = VRING_DESC_F_WRITE; // device writes b->data
disk.desc[idx[1]].flags |= VRING_DESC_F_NEXT;
disk.desc[idx[1]].next = idx[2];
// 第 2 個描述子
disk.info[idx[0]].status = 0xff; // device writes 0 on success
disk.desc[idx[2]].addr = (uint64) &disk.info[idx[0]].status;
disk.desc[idx[2]].len = 1;
disk.desc[idx[2]].flags = VRING_DESC_F_WRITE; // device writes the status
disk.desc[idx[2]].next = 0;
// record struct buf for virtio_disk_intr().
b->disk = 1;
disk.info[idx[0]].b = b;
// tell the device the first index in our chain of descriptors.
disk.avail->ring[disk.avail->idx % NUM] = idx[0];
__sync_synchronize();
// tell the device another avail ring entry is available.
disk.avail->idx += 1; // not % NUM ...
__sync_synchronize();
*R(VIRTIO_MMIO_QUEUE_NOTIFY) = 0; // value is queue number
// Wait for virtio_disk_intr() to say request has finished.
while(b->disk == 1) { // b->disk=1 代表磁碟正在讀取到緩衝區 buf
sleep(b, &disk.vdisk_lock);
}
// 讀完了,釋放 idx[0]
disk.info[idx[0]].b = 0;
free_chain(idx[0]);
release(&disk.vdisk_lock);
}
其中的兩個 sleep() 呼叫都有可能導致行程進入睡眠狀態。
然後當 virtio 讀取完成之後,會引發外部中斷,進而呼叫 virtio_disk_intr()
kernel/virtio_disk.c
// Interrupt handler for the virtio disk: for every completion the
// device has added to the used ring, mark the corresponding buf as
// done and wake whoever is sleeping on it in virtio_disk_rw().
void
virtio_disk_intr()
{
  acquire(&disk.vdisk_lock);

  // the device won't raise another interrupt until we tell it
  // we've seen this interrupt, which the following line does.
  // this may race with the device writing new entries to
  // the "used" ring, in which case we may process the new
  // completion entries in this interrupt, and have nothing to do
  // in the next interrupt, which is harmless.
  *R(VIRTIO_MMIO_INTERRUPT_ACK) = *R(VIRTIO_MMIO_INTERRUPT_STATUS) & 0x3;

  __sync_synchronize();

  // the device increments disk.used->idx when it
  // adds an entry to the used ring.
  for(; disk.used_idx != disk.used->idx; disk.used_idx += 1){
    __sync_synchronize();

    int head = disk.used->ring[disk.used_idx % NUM].id;

    // Any nonzero status is treated as fatal.
    if(disk.info[head].status != 0){
      panic("virtio_disk_intr status");
    }

    struct buf *done = disk.info[head].b;
    done->disk = 0;  // disk is done with buf
    wakeup(done);    // wake the sleeper waiting on this buf
  }

  release(&disk.vdisk_lock);
}
virtio_disk_intr() 當中的 wakeup() 會喚醒那些正在等待該區塊讀入的行程,於是行程又醒來繼續進入排程系統。
因為 xv6 的檔案系統分為七層,因此上述的追蹤很長:

所以結論是:當所需的磁碟區塊尚未載入緩衝區時,read() 會讓行程進入睡眠狀態並把 CPU 禮讓給其他行程,直到磁碟區塊讀入後,才由 virtio_disk_intr() 中的 wakeup() 喚醒該行程。