二、书接上文,上一节大概弄清了从通电到第一个程序运行的脉络。本节将深入探讨上节最后一部分:从 Kernel(内核态)切换到 User(用户态)的执行逻辑,并详细解析从 User 返回 Kernel 的全过程。
1. kexec 进程加载与启动流程
阅读kexec所需声明:用户栈大小、程序头结构体定义、proc_pagetable和copyout用处
- #define USERSTACK 1 // user stack pages
- // Program section header
- struct proghdr {
- uint32 type;
- uint32 flags;
- uint64 off;
- uint64 vaddr;
- uint64 paddr;
- uint64 filesz;
- uint64 memsz;
- uint64 align;
- };
- // Create a user page table for a given process, with no user memory,
- // but with trampoline and trapframe pages.
- pagetable_t proc_pagetable(struct proc* p);
- // Copy from kernel to user.
- // Copy len bytes from src to virtual address dstva in a given page table.
- // Return 0 on success, -1 on error.
- int copyout(pagetable_t pagetable, uint64 dstva, char* src, uint64 len);
kexec 代码块:
- int kexec(char* path, char** argv) {
- char *s, *last;
- int i, off;
- uint64 argc, sz = 0, sp, ustack[MAXARG], stackbase;
- struct elfhdr elf;
- struct inode* ip;
- struct proghdr ph;
- pagetable_t pagetable = 0, oldpagetable;
- struct proc* p = myproc();
- begin_op();
- // Open the executable file.
- if ((ip = namei(path)) == 0) {
- end_op();
- return -1;
- }
- ilock(ip);
- // Read the ELF header.
- if (readi(ip, 0, (uint64)&elf, 0, sizeof(elf)) != sizeof(elf)) goto bad;
- // Is this really an ELF file?
- if (elf.magic != ELF_MAGIC) goto bad;
- if ((pagetable = proc_pagetable(p)) == 0) goto bad;
- // Load program into memory.
- for (i = 0, off = elf.phoff; i < elf.phnum; i++, off += sizeof(ph)) {
- if (readi(ip, 0, (uint64)&ph, off, sizeof(ph)) != sizeof(ph)) goto bad;
- if (ph.type != ELF_PROG_LOAD) continue;
- if (ph.memsz < ph.filesz) goto bad;
- if (ph.vaddr + ph.memsz < ph.vaddr) goto bad;
- if (ph.vaddr % PGSIZE != 0) goto bad;
- uint64 sz1;
- if ((sz1 = uvmalloc(pagetable, sz, ph.vaddr + ph.memsz, flags2perm(ph.flags))) == 0) goto bad;
- sz = sz1;
- if (loadseg(pagetable, ph.vaddr, ip, ph.off, ph.filesz) < 0) goto bad;
- }
- iunlockput(ip);
- end_op();
- ip = 0;
- p = myproc();
- uint64 oldsz = p->sz;
- // Allocate some pages at the next page boundary.
- // Make the first inaccessible as a stack guard.
- // Use the rest as the user stack.
- sz = PGROUNDUP(sz);
- uint64 sz1;
- if ((sz1 = uvmalloc(pagetable, sz, sz + (USERSTACK + 1) * PGSIZE, PTE_W)) == 0) goto bad;
- sz = sz1;
- uvmclear(pagetable, sz - (USERSTACK + 1) * PGSIZE);
- sp = sz;
- stackbase = sp - USERSTACK * PGSIZE;
- // Copy argument strings into new stack, remember their
- // addresses in ustack[].
- for (argc = 0; argv[argc]; argc++) {
- if (argc >= MAXARG) goto bad;
- sp -= strlen(argv[argc]) + 1;
- sp -= sp % 16; // riscv sp must be 16-byte aligned
- if (sp < stackbase) goto bad;
- if (copyout(pagetable, sp, argv[argc], strlen(argv[argc]) + 1) < 0) goto bad;
- ustack[argc] = sp;
- }
- ustack[argc] = 0;
- // push a copy of ustack[], the array of argv[] pointers.
- sp -= (argc + 1) * sizeof(uint64);
- sp -= sp % 16;
- if (sp < stackbase) goto bad;
- if (copyout(pagetable, sp, (char*)ustack, (argc + 1) * sizeof(uint64)) < 0) goto bad;
- // a0 and a1 contain arguments to user main(argc, argv)
- // argc is returned via the system call return
- // value, which goes in a0.
- p->trapframe->a1 = sp;
- // Save program name for debugging.
- for (last = s = path; *s; s++)
- if (*s == '/') last = s + 1;
- safestrcpy(p->name, last, sizeof(p->name));
- // Commit to the user image.
- oldpagetable = p->pagetable;
- p->pagetable = pagetable;
- p->sz = sz;
- p->trapframe->epc = elf.entry; // initial program counter = ulib.c:start()
- p->trapframe->sp = sp; // initial stack pointer
- proc_freepagetable(oldpagetable, oldsz);
- return argc; // this ends up in a0, the first argument to main(argc, argv)
- bad:
- if (pagetable) proc_freepagetable(pagetable, sz);
- if (ip) {
- iunlockput(ip);
- end_op();
- }
- return -1;
- }
1. ELF 文件解析与内存布局
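kexec 的任务是读取磁盘上的可执行文件(ELF 格式),并把它布置到内存中。ELF 文件由 ELF Header(elfhdr)、Program Header Table、Sections 三部分组成。其中 elfhdr 包含用于判断文件有效性的 magic,并存放了程序头表在文件中的偏移 phoff 和程序头个数 phnum。通过 phoff 逐个读取程序头后,只处理类型为 ELF_PROG_LOAD 的段(Segment)。对每个这样的段,系统先检查 memsz 不小于 filesz、vaddr + memsz 不发生溢出且 vaddr 按页对齐,然后用 uvmalloc 把进程的虚拟内存扩展到 vaddr + memsz,再用 loadseg 从文件偏移 off 处读取 filesz 字节到 vaddr 开始的区域,完成用户进程代码和数据的加载。注意 memsz 是程序头中自带的字段,并不是由 filesz 计算得到的。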
2. 用户栈初始化与参数传递
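随后,系统在按页对齐后的 sz 之上再分配 2 页(USERSTACK + 1 页)内存:较低的一页经 uvmclear 处理后变为用户不可访问的 guard 页,较高的一页作为 userstack。加载过程将参数字符串逐个拷贝到 userstack 中,每个字符串的起始地址都按 16B 向下对齐。为了让用户程序能够定位这些参数,系统把各字符串的地址记录在 ustack[] 中,并将这个以 0 结尾的指针数组同样拷贝到 userstack 上。最后将 trapframe 中的 a1 设为此时的栈指针 sp(即 argv 指针数组的起始地址),argc 则作为 kexec 的返回值最终写入 a0,使得程序进入用户态后 main(argc, argv) 能根据地址找到对应的字符串。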
3. 进程状态更新与硬件跳转
最后,更新用户进程的 name、pagetable 和 sz,并令 epc 指向 elf.entry。在准备返回阶段,epc 的值被赋给 sepc。当执行 userret 中的 sret 指令后,硬件执行 PC = sepc,处理器便从 elf.entry 开始正式执行用户态程序。
2. 从elf.entry到main
使用user.ld把程序+库链接成一个用户态ELF可执行文件
- _%: %.o $(ULIB) $U/user.ld
- $(LD) $(LDFLAGS) -T $U/user.ld -o $@ $< $(ULIB)
- //
- // wrapper so that it's OK if main() does not call exit().
- //
- void start(int argc, char** argv) {
- int r;
- extern int main(int argc, char** argv);
- r = main(argc, argv);
- exit(r);
- }
使用 objdump -f 查看文件头信息(并非反汇编),得到如下结果:
- objdump -f user/_init
- user/_init: file format elf64-littleriscv
- architecture: riscv64
- start address: 0x00000000000000bc
在生成 ELF 可执行文件的链接环节,链接器确定 start 的地址为 0xbc,并把它写入 ELF 头的入口字段(即 elf.entry);最终 sret 执行后,PC 指向 start 函数。
- void start(int argc, char** argv) {
- int r;
- extern int main(int argc, char** argv);
- r = main(argc, argv);
- exit(r);
- }
start 函数会调用 init 程序的 main 函数:
- char* argv[] = {"sh", 0};
- int main(void) {
- int pid, wpid;
- if (open("console", O_RDWR) < 0) {
- mknod("console", CONSOLE, 0);
- mknod("statistics", STATS, 0);
- open("console", O_RDWR);
- }
- dup(0); // stdout
- dup(0); // stderr
- for (;;) {
- printf("init: starting sh\n");
- pid = fork();
- if (pid < 0) {
- printf("init: fork failed\n");
- exit(1);
- }
- if (pid == 0) {
- exec("sh", argv);
- printf("init: exec sh failed\n");
- exit(1);
- }
- for (;;) {
- // this call to wait() returns if the shell exits,
- // or if a parentless process exits.
- wpid = wait((int*)0);
- if (wpid == pid) {
- // the shell exited; restart it.
- break;
- } else if (wpid < 0) {
- printf("init: wait returned an error\n");
- exit(1);
- } else {
- // it was a parentless process; do nothing.
- }
- }
- }
- }
1. 文件描述符与子进程创建
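init 启动时首先打开 console(若打开失败,则先用 mknod 创建设备文件再打开),打开后 console 对应文件描述符 0(标准输入);随后两次 dup(0),使标准输出与标准错误也指向 console。接着通过 fork 创建子进程:在子进程中 fork 的返回值为 0(这并不是子进程真实的 pid),子进程据此调用 exec 开始执行 sh 程序;若 exec 失败,则打印错误信息并以 exit(1) 退出。sh 运行结束时,该子进程也随之退出。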
2. 父进程的监控与循环
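与此同时,父进程从 fork 的返回值拿到子进程的真实 pid。父进程进入内层循环,反复调用 wait 等待子进程结束:若 wait 返回的正是 shell 的 pid,则退出内层循环并重启一个新的 shell;若返回的是其他无父进程的进程(parentless process),则不做处理、继续等待;若 wait 返回错误,init 打印错误信息并退出。由此实现交互界面的持续存在。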
3. sh 程序的功能实现
sh 程序的核心功能是解析用户输入的命令。在解析完成后,它通过调用相应的系统调用并传递必要的参数,驱动内核完成具体的任务执行。
3. 系统调用从用户态到内核态的流转
以最常见的 write 系统调用为例。用户态的系统调用桩代码由下面的 Perl 脚本生成:
- #!/usr/bin/perl -w
- # Generate usys.S, the stubs for syscalls.
- sub entry {
- my $prefix = "sys_";
- my $name = shift;
- if ($name eq "sbrk") {
- print ".global $prefix$name\n";
- print "$prefix$name:\n";
- } else {
- print ".global $name\n";
- print "$name:\n";
- }
- print " li a7, SYS_${name}\n";
- print " ecall\n";
- print " ret\n";
- }
- entry("fork");
- entry("exit");
- entry("wait");
- entry("pipe");
- entry("read");
- entry("write");
脚本批量生成 usys.S,其中 write 对应的桩代码如下:
- .global write
- write:
- li a7, SYS_write
- ecall
- ret
uservec 部分流程,其中 t0 中加载的是 usertrap 函数的地址(取自 p->trapframe->kernel_trap):
- .section trampsec
- .globl trampoline
- .globl usertrap
- trampoline:
- .align 4
- .globl uservec
- uservec:
- # trap.c sets stvec to point here, so
- # traps from user space start here,
- # in supervisor mode, but with a
- # user page table.
- # load the address of usertrap(), from p->trapframe->kernel_trap
- ld t0, 16(a0)
- # call usertrap()
- jalr t0
构建系统调用函数的函数指针数组:
- extern uint64 sys_fork(void);
- extern uint64 sys_exit(void);
- extern uint64 sys_wait(void);
- ...
- #define SYS_fork 1
- #define SYS_exit 2
- #define SYS_wait 3
- ...
- static uint64 (*syscalls[])(void) = {
- [SYS_fork] sys_fork,
- [SYS_exit] sys_exit,
- [SYS_wait] sys_wait,
- ...
- }
从 write 系统调用到 sys_write:
- uint64 sys_write(void) {
- struct file* f;
- int n;
- uint64 p;
- argaddr(1, &p);
- argint(2, &n);
- if (argfd(0, 0, &f) < 0) return -1;
- return filewrite(f, p, n);
- }
- void syscall(void) {
- int num;
- struct proc* p = myproc();
- num = p->trapframe->a7;
- if (num > 0 && num < NELEM(syscalls) && syscalls[num]) {
- // Use num to lookup the system call function for num, call it,
- // and store its return value in p->trapframe->a0
- p->trapframe->a0 = syscalls[num]();
- } else {
- printf("%d %s: unknown sys call %d\n", p->pid, p->name, num);
- p->trapframe->a0 = -1;
- }
- }
- uint64 usertrap(void) {
- int which_dev = 0;
- // send interrupts and exceptions to kerneltrap(),
- // since we're now in the kernel.
- w_stvec((uint64)kernelvec); // DOC: kernelvec
- struct proc* p = myproc();
- // save user program counter.
- p->trapframe->epc = r_sepc();
- if (r_scause() == 8) {
- // system call
- if (killed(p)) kexit(-1);
- // sepc points to the ecall instruction,
- // but we want to return to the next instruction.
- p->trapframe->epc += 4;
- // an interrupt will change sepc, scause, and sstatus,
- // so enable only now that we're done with those registers.
- intr_on();
- syscall();
- }
- }
1. 异常触发与现场保存
执行流程首先将系统调用号写入 a7 寄存器,随后通过 ecall 指令触发一次 trap。硬件自动将 trap 原因(user ecall,对应 scause 的值为 8)记录到 scause,同时将 ecall 指令自身的地址(而不是下一条指令的地址)存入 sepc。此时权限提升至 S mode,硬件跳转到 stvec 所指向的 uservec 进行处理。在 uservec 中,系统首先将当前进程的用户寄存器现场保存到 trapframe 中,最后跳转至寄存器 t0 所指向的 usertrap 函数。
2. 内核态异常处理与跳转
进入 usertrap 函数后,首先将 stvec 从 uservec 切换为 kernelvec,使内核态期间发生的中断和异常交由 kerneltrap 处理。随后把 sepc 保存到 trapframe->epc;对于系统调用,还要将 epc 加 4,使返回用户态时从 ecall 的下一条指令继续执行。之后打开中断,并正式进入 syscall 处理环节。
3. 函数分发与内核执行
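在 syscall 函数内部,系统从 trapframe 保存的 a7 中取出调用号 num,确定本次系统调用的类型。接着以该编号为下标访问函数指针数组 syscalls,调用对应的内核函数,并把返回值写入 trapframe->a0;若编号无效,则打印 unknown sys call 并将 a0 置为 -1。例如,若本次调用号为 SYS_write,则会调用 sys_write:它通过 argfd、argaddr、argint 分别取得文件、缓冲区地址和长度三个参数,再执行 filewrite 内核函数,最终完成实际的写操作。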
从用户态到内核态的参数传递:
- static uint64 argraw(int n) {
- struct proc* p = myproc();
- switch (n) {
- case 0:
- return p->trapframe->a0;
- case 1:
- return p->trapframe->a1;
- case 2:
- return p->trapframe->a2;
- case 3:
- return p->trapframe->a3;
- case 4:
- return p->trapframe->a4;
- case 5:
- return p->trapframe->a5;
- }
- panic("argraw");
- return -1;
- }
- // Fetch the nth 32-bit system call argument.
- void argint(int n, int* ip) {
- *ip = argraw(n);
- }
根据参数位次,argraw 从 p->trapframe 保存的用户态寄存器快照中取出对应参数:第 0~5 个参数分别对应 a0~a5,超出该范围则触发 panic。argint 在此基础上把取到的值按 int 写入调用者提供的指针。
来源:程序园用户自行投稿发布,如果侵权,请联系站长删除
免责声明:如果侵犯了您的权益,请联系站长,我们会及时删除侵权内容,谢谢合作! |