22. File‐Related Syscalls - josehu07/hux-kernel GitHub Wiki

This chapter lists the implementation details of a set of system calls related to the file system. User programs make use of the persistence provided by the file system through these file-related syscall APIs.

Main References of This Chapter

Scan through them before going forth:

File-Related Syscalls

These syscalls define an organized way in which user programs interact with persistent storage.

open()

Opens a file at given path name with given mode. Mode is a flag integer of value:

  • OPEN_RD: open as readable
  • OPEN_WR: open as writable
  • OPEN_RD | OPEN_WR: both readable & writable
  • Can't be null

Returns a file descriptor which is essentially an index in the process's open file list (hence a truly process-private name for an open file) ✭. The process could later use that file descriptor to read or write the file. Returns -1 on failures.

int32_t open(char *path, uint32_t mode);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the open() syscall. Fetches the path string (argument 0)
 * and the mode flags (argument 1), refuses a mode that has neither
 * OPEN_RD nor OPEN_WR set, then lets `filesys_open()` do the work and
 * returns whatever it returns.
 */
int32_t
syscall_open(void)
{
    char *path;
    uint32_t mode;

    if (sysarg_get_str(0, &path) <= 0 || !sysarg_get_uint(1, &mode))
        return SYS_FAIL_RC;

    if (!(mode & (OPEN_RD | OPEN_WR))) {
        warn("open: mode is neither readable nor writable");
        return SYS_FAIL_RC;
    }

    return filesys_open(path, mode);
}


// src/filesys/vsfs.c

/**
 * Claims the lowest-numbered unused slot of the caller process's open
 * file table for FILE and returns its index, which serves as the file
 * descriptor. Returns -1 when every slot is already taken.
 */
static int8_t
_alloc_process_fd(file_t *file)
{
    process_t *caller = running_proc();
    int8_t slot = 0;

    /** Skip over slots that are in use. */
    while (slot < MAX_FILES_PER_PROC && caller->files[slot] != NULL)
        slot++;

    if (slot >= MAX_FILES_PER_PROC)
        return -1;

    caller->files[slot] = file;
    return slot;
}

/**
 * Maps a file descriptor of the current process to its `file_t`.
 * Returns NULL when FD is out of range or the slot is unused.
 */
static file_t *
_find_process_file(int8_t fd)
{
    process_t *caller = running_proc();

    if (fd < 0 || fd >= MAX_FILES_PER_PROC)
        return NULL;

    /** An unused slot holds NULL, which is exactly what we report. */
    return caller->files[fd];
}

/**
 * Opens the file at PATH for the caller process with the given MODE
 * flags. A directory may only be opened with MODE exactly OPEN_RD.
 * Returns the new file descriptor (>= 0) on success, and -1 on failure.
 */
int8_t
filesys_open(char *path, uint32_t mode)
{
    file_t *file;
    int8_t fd;

    mem_inode_t *inode = _path_lookup(path);
    if (inode == NULL)
        return -1;

    inode_lock(inode);

    if (inode->d_inode.type == INODE_TYPE_DIR && mode != OPEN_RD)
        goto fail;

    file = file_get();
    if (file == NULL) {
        warn("open: failed to allocate open file structure, reached max?");
        goto fail;
    }

    fd = _alloc_process_fd(file);
    if (fd < 0) {
        warn("open: failed to allocate file descriptor, reached max?");
        file_put(file);
        goto fail;
    }

    inode_unlock(inode);

    /** The open file keeps the inode reference obtained by the lookup. */
    file->inode = inode;
    file->readable = (mode & OPEN_RD) != 0;
    file->writable = (mode & OPEN_WR) != 0;
    file->offset = 0;

    return fd;

fail:
    /** Shared cleanup: release the lock and the lookup's reference. */
    inode_unlock(inode);
    inode_put(inode);
    return -1;
}

close()

Closes an open file descriptor.

int32_t close(int32_t fd);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the close() syscall. Validates the file descriptor given
 * as argument 0 and closes it through `filesys_close()`. Returns 0 on
 * success and SYS_FAIL_RC otherwise.
 */
int32_t
syscall_close(void)
{
    int32_t fd;

    if (!sysarg_get_int(0, &fd) || fd < 0 || fd >= MAX_FILES_PER_PROC)
        return SYS_FAIL_RC;

    return filesys_close(fd) ? 0 : SYS_FAIL_RC;
}


// src/filesys/vsfs.c

/**
 * Detaches FD from the caller process's open file table and drops the
 * process's hold on the underlying open file via `file_put()`. Returns
 * false if FD does not name an open file.
 */
bool
filesys_close(int8_t fd)
{
    file_t *file = _find_process_file(fd);

    if (file != NULL) {
        running_proc()->files[fd] = NULL;
        file_put(file);
        return true;
    }

    warn("close: cannot find file for fd %d", fd);
    return false;
}

create()

Creates a file at given path name with given mode. Mode is a flag integer of value:

  • CREATE_FILE: a regular file
  • CREATE_DIR: a directory
  • Can't be both or null

Returns 0 on success and -1 on failures.

int32_t create(char *path, uint32_t mode);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the create() syscall. Fetches the path (argument 0) and
 * mode (argument 1); the mode must select exactly one of CREATE_FILE
 * and CREATE_DIR. Returns 0 on success and SYS_FAIL_RC otherwise.
 */
int32_t
syscall_create(void)
{
    char *path;
    uint32_t mode;

    if (sysarg_get_str(0, &path) <= 0 || !sysarg_get_uint(1, &mode))
        return SYS_FAIL_RC;

    /** Neither flag given. */
    if (!(mode & (CREATE_FILE | CREATE_DIR))) {
        warn("create: mode is neigher file nor directory");
        return SYS_FAIL_RC;
    }

    /** Both flags given. */
    if ((mode & CREATE_FILE) && (mode & CREATE_DIR)) {
        warn("create: mode is both file and directory");
        return SYS_FAIL_RC;
    }

    return filesys_create(path, mode) ? 0 : SYS_FAIL_RC;
}


// src/filesys/vsfs.c

/**
 * Create a file or directory at the given path name. MODE selects the
 * type: CREATE_FILE yields a regular file, anything else a directory.
 * Returns true on success and false on failures.
 *
 * Every failure path releases whatever has been acquired so far: the
 * parent directory inode is always unlocked and put, and a freshly
 * allocated inode is freed on disk, unlocked, and put.
 */
bool
filesys_create(char *path, uint32_t mode)
{
    char filename[MAX_FILENAME];
    mem_inode_t *parent_inode = _path_lookup_parent(path, filename);
    if (parent_inode == NULL) {
        warn("create: cannot find parent directory of '%s'", path);
        return false;
    }

    inode_lock(parent_inode);

    mem_inode_t *file_inode = _dir_find(parent_inode, filename, NULL);
    if (file_inode != NULL) {
        warn("create: file '%s' already exists", path);
        inode_put(file_inode);
        goto fail_parent;
    }

    uint32_t type = (mode & CREATE_FILE) ? INODE_TYPE_FILE : INODE_TYPE_DIR;
    file_inode = inode_alloc(type);
    if (file_inode == NULL) {
        warn("create: failed to allocate inode on disk, out of space?");
        goto fail_parent;
    }

    inode_lock(file_inode);

    /** Create '.' and '..' entries for new directory. */
    if (type == INODE_TYPE_DIR) {
        if (!_dir_add(file_inode, ".", file_inode->inumber)
            || !_dir_add(file_inode, "..", parent_inode->inumber)) {
            warn("create: failed to create '.' or '..' entries");
            goto fail_file;
        }
    }

    /** Put file into parent directory. */
    if (!_dir_add(parent_inode, filename, file_inode->inumber)) {
        warn("create: failed to put '%s' into its parent directory", path);
        goto fail_file;
    }

    inode_unlock(parent_inode);
    inode_put(parent_inode);

    inode_unlock(file_inode);
    inode_put(file_inode);

    return true;

fail_file:
    /** Undo the allocation, then release the new inode like remove does. */
    inode_free(file_inode);
    inode_unlock(file_inode);
    inode_put(file_inode);
fail_parent:
    inode_unlock(parent_inode);
    inode_put(parent_inode);
    return false;
}

remove()

Removes a file or an empty directory at given path name. A directory must be empty (with only "." and ".." entries) to be removed. Returns 0 on success and -1 on failures.

int32_t remove(char *path);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the remove() syscall. Fetches the path string given as
 * argument 0 and removes it through `filesys_remove()`. Returns 0 on
 * success and SYS_FAIL_RC otherwise.
 */
int32_t
syscall_remove(void)
{
    char *path;

    if (sysarg_get_str(0, &path) <= 0)
        return SYS_FAIL_RC;

    return filesys_remove(path) ? 0 : SYS_FAIL_RC;
}


// src/filesys/vsfs.c

/**
 * Removes the file or directory named by PATH: its entry in the parent
 * directory is overwritten with zeros and its inode is freed on disk.
 * The '.' and '..' entries can never be removed, and a directory must
 * be empty. Returns true on success and false on failures.
 */
bool
filesys_remove(char *path)
{
    char filename[MAX_FILENAME];
    uint32_t offset;
    dentry_t dentry;
    mem_inode_t *file_inode;

    mem_inode_t *parent_inode = _path_lookup_parent(path, filename);
    if (parent_inode == NULL) {
        warn("remove: cannot find parent directory of '%s'", path);
        return false;
    }

    inode_lock(parent_inode);

    /** Cannot remove '.' or '..'. */
    if (strncmp(filename, ".", MAX_FILENAME) == 0
        || strncmp(filename, "..", MAX_FILENAME) == 0) {
        warn("remove: cannot remove '.' or '..' entries");
        goto fail_parent;
    }

    file_inode = _dir_find(parent_inode, filename, &offset);
    if (file_inode == NULL) {
        warn("remove: cannot find file '%s'", path);
        goto fail_parent;
    }

    inode_lock(file_inode);

    /** Cannot remove a non-empty directory. */
    if (file_inode->d_inode.type == INODE_TYPE_DIR
        && !_dir_empty(file_inode)) {
        warn("remove: cannot remove non-empty directory '%s'", path);
        goto fail_file;
    }

    /** Blank out the corresponding entry in the parent directory. */
    memset(&dentry, 0, sizeof(dentry_t));
    if (inode_write(parent_inode, (char *) &dentry, offset,
                    sizeof(dentry_t)) != sizeof(dentry_t)) {
        warn("remove: failed to write at offset %u", offset);
        goto fail_file;
    }

    inode_unlock(parent_inode);
    inode_put(parent_inode);

    /** Erase its metadata on disk. */
    inode_free(file_inode);

    inode_unlock(file_inode);
    inode_put(file_inode);

    return true;

fail_file:
    inode_unlock(file_inode);
    inode_put(file_inode);
fail_parent:
    inode_unlock(parent_inode);
    inode_put(parent_inode);
    return false;
}

read()

Read a certain number of bytes from an open file (by giving the file descriptor) at its current offset into a buffer. On success, returns the number of bytes actually read and updates the file offset accordingly. Returns -1 on failures.

int32_t read(int32_t fd, char *dst, uint32_t len);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the read() syscall. Arguments: 0 = file descriptor,
 * 1 = destination buffer, 2 = length. The length is fetched before the
 * buffer so that the buffer can be validated against that length.
 * Returns the result of `filesys_read()`, or SYS_FAIL_RC on bad
 * arguments.
 */
int32_t
syscall_read(void)
{
    int32_t fd;
    char *dst;
    uint32_t len;

    if (!sysarg_get_int(0, &fd) || fd < 0 || fd >= MAX_FILES_PER_PROC)
        return SYS_FAIL_RC;
    if (!sysarg_get_uint(2, &len) || !sysarg_get_mem(1, &dst, len))
        return SYS_FAIL_RC;

    return filesys_read(fd, dst, len);
}


// src/filesys/vsfs.c

/**
 * Reads up to LEN bytes from the current offset of the open file FD
 * into DST, advancing the offset by the amount read. Returns the number
 * of bytes read, or -1 if FD is not open or not readable.
 */
int32_t
filesys_read(int8_t fd, char *dst, size_t len)
{
    file_t *file = _find_process_file(fd);

    if (file == NULL) {
        warn("read: cannot find file for fd %d", fd);
        return -1;
    }
    if (!file->readable) {
        warn("read: file for fd %d is not readable", fd);
        return -1;
    }

    inode_lock(file->inode);
    size_t nread = inode_read(file->inode, dst, file->offset, len);
    file->offset += nread;      /** No-op when nothing was read. */
    inode_unlock(file->inode);

    return nread;
}

write()

Writes a certain number of bytes into an open file (by giving the file descriptor) at its current offset from a buffer. On success, returns the number of bytes actually written and updates the file offset accordingly. Returns -1 on failures.

int32_t write(int32_t fd, char *src, uint32_t len);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the write() syscall. Arguments: 0 = file descriptor,
 * 1 = source buffer, 2 = length. The length is fetched before the
 * buffer so that the buffer can be validated against that length.
 * Returns the result of `filesys_write()`, or SYS_FAIL_RC on bad
 * arguments.
 */
int32_t
syscall_write(void)
{
    int32_t fd;
    char *src;
    uint32_t len;

    if (!sysarg_get_int(0, &fd) || fd < 0 || fd >= MAX_FILES_PER_PROC)
        return SYS_FAIL_RC;
    if (!sysarg_get_uint(2, &len) || !sysarg_get_mem(1, &src, len))
        return SYS_FAIL_RC;

    return filesys_write(fd, src, len);
}


// src/filesys/vsfs.c

/**
 * Write LEN bytes from user buffer SRC into the open file FD at its
 * current offset, advancing the offset by the number of bytes written.
 * Returns the number of bytes written, or -1 if FD is not open or not
 * writable.
 */
int32_t
filesys_write(int8_t fd, char *src, size_t len)
{
    file_t *file = _find_process_file(fd);
    if (file == NULL) {
        /** Tag the message with "write", not "read", to name the right syscall. */
        warn("write: cannot find file for fd %d", fd);
        return -1;
    }

    if (!file->writable) {
        warn("write: file for fd %d is not writable", fd);
        return -1;
    }

    inode_lock(file->inode);
    size_t bytes_written = inode_write(file->inode, src, file->offset, len);
    if (bytes_written > 0)      /** Update file offset. */
        file->offset += bytes_written;
    inode_unlock(file->inode);

    return bytes_written;
}

chdir()

Change the caller process's current working directory to the one given by path name. Returns 0 on success and -1 on failures.

int32_t chdir(char *path);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the chdir() syscall. Fetches the path string given as
 * argument 0 and switches the working directory through
 * `filesys_chdir()`. Returns 0 on success and SYS_FAIL_RC otherwise.
 */
int32_t
syscall_chdir(void)
{
    char *path;

    if (sysarg_get_str(0, &path) <= 0)
        return SYS_FAIL_RC;

    return filesys_chdir(path) ? 0 : SYS_FAIL_RC;
}


// src/filesys/vsfs.c

/**
 * Makes the directory at PATH the current working directory (cwd) of
 * the caller process. Returns false if PATH does not exist or is not a
 * directory.
 */
bool
filesys_chdir(char *path)
{
    mem_inode_t *target = _path_lookup(path);
    if (target == NULL) {
        warn("chdir: target path '%s' does not exist", path);
        return false;
    }

    /** The inode type is inspected with the inode lock held. */
    inode_lock(target);
    bool is_dir = (target->d_inode.type == INODE_TYPE_DIR);
    if (!is_dir)
        warn("chdir: target path '%s' is not a directory", path);
    inode_unlock(target);

    if (!is_dir) {
        inode_put(target);
        return false;
    }

    /** Drop the reference on the old cwd; the lookup's one is kept. */
    process_t *caller = running_proc();
    inode_put(caller->cwd);
    caller->cwd = target;

    return true;
}

getcwd()

Get a string representation of the absolute path name of current working directory. Returns -1 if the buffer limit is too small for the result.

int32_t getcwd(char *buf, uint32_t limit);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the getcwd() syscall. Arguments: 0 = output buffer,
 * 1 = buffer size limit, which must be at least 2. The limit is fetched
 * first so that the buffer can be validated against it. Returns 0 on
 * success and SYS_FAIL_RC otherwise.
 */
int32_t
syscall_getcwd(void)
{
    char *buf;
    uint32_t limit;

    if (!sysarg_get_uint(1, &limit) || limit < 2)
        return SYS_FAIL_RC;
    if (!sysarg_get_mem(0, &buf, limit))
        return SYS_FAIL_RC;

    return filesys_getcwd(buf, limit) ? 0 : SYS_FAIL_RC;
}


// src/filesys/vsfs.c

/**
 * Recursively composes the absolute path of directory INODE into BUF by
 * following '..' entries up to the root and then appending, on the way
 * back down, the name each directory has in its parent.
 *
 * Returns the number of bytes placed in BUF so far; a return value
 * >= LIMIT tells the caller that the path could not be produced. The
 * caller is expected to pass LIMIT >= 2.
 */
static size_t
_recurse_abs_path(mem_inode_t *inode, char *buf, size_t limit)
{
    if (inode->inumber == ROOT_INUMBER) {
        buf[0] = '/';
        return 1;
    }

    inode_lock(inode);

    /** Check the parent directory. */
    mem_inode_t *parent_inode = _dir_find(inode, "..", NULL);
    if (parent_inode == NULL) {
        warn("abs_path: failed to get parent inode of %u", inode->inumber);
        inode_unlock(inode);
        return limit;
    }

    inode_unlock(inode);

    size_t curr;
    if (parent_inode->inumber == ROOT_INUMBER) {
        /** If parent is root, stop recursion; the root's '/' is the separator. */
        buf[0] = '/';
        curr = 1;
    } else {
        curr = _recurse_abs_path(parent_inode, buf, limit);
        if (curr >= limit - 1) {
            /** Release the parent reference before bailing out. */
            inode_put(parent_inode);
            return limit;
        }
        /**
         * Separate the parent's path from this component. The check above
         * guarantees that at least one byte remains after the separator.
         */
        buf[curr++] = '/';
    }

    inode_lock(parent_inode);
    size_t written = _dir_filename(parent_inode, inode->inumber,
                                   &buf[curr], limit - curr);
    inode_unlock(parent_inode);
    inode_put(parent_inode);

    return curr + written;
}

/**
 * Writes the absolute path of the caller process's current working
 * directory into BUF as a NUL-terminated string. Returns false if the
 * path does not fit within LIMIT bytes.
 */
bool
filesys_getcwd(char *buf, size_t limit)
{
    mem_inode_t *inode = running_proc()->cwd;
    inode_ref(inode);

    size_t written = _recurse_abs_path(inode, buf, limit);

    /** Drop the temporary reference on both the success and failure path. */
    inode_put(inode);

    if (written >= limit)
        return false;

    /**
     * Terminate right after the path so that stale bytes in the user
     * buffer are never read as part of it; `written < limit` holds here.
     * The last byte is still cleared as before.
     */
    buf[written] = '\0';
    buf[limit - 1] = '\0';
    return true;
}

fstat()

Get metadata information about an open file.

/** Metadata record filled in by `fstat()`, plus the inode type codes. */
typedef struct file_stat {
    uint32_t inumber;   /** Inode number of the file. */
    uint32_t type;      /** One of the `INODE_TYPE_*` codes below. */
    uint32_t size;      /** Size as recorded in the on-disk inode. */
} file_stat_t;

#define INODE_TYPE_EMPTY 0
#define INODE_TYPE_FILE  1
#define INODE_TYPE_DIR   2


int32_t fstat(int32_t fd, file_stat_t *stat);

Implementation @ src/filesys/sysfile.c:

int32_t
syscall_fstat(void)
{
    int32_t fd;
    file_stat_t *stat;

    if (!sysarg_get_int(0, &fd))
        return SYS_FAIL_RC;
    if (!sysarg_get_mem(1, (char **) &stat, sizeof(file_stat_t)))
        return SYS_FAIL_RC;

    if (!filesys_fstat(fd, stat))
        return SYS_FAIL_RC;
    return 0;
}


// src/filesys/vsfs.c

/**
 * Fills STAT with the metadata of the open file FD of the caller
 * process. Returns false if FD does not name an open file.
 */
bool
filesys_fstat(int8_t fd, file_stat_t *stat)
{
    file_t *file = _find_process_file(fd);

    if (file != NULL) {
        file_stat(file, stat);
        return true;
    }

    warn("fstat: cannot find file for fd %d", fd);
    return false;
}


// src/filesys/file.c

/**
 * Copies the inode number, type, and size of FILE's inode into STAT,
 * holding the inode lock while the fields are read.
 */
void
file_stat(file_t *file, file_stat_t *stat)
{
    mem_inode_t *inode = file->inode;

    inode_lock(inode);
    stat->inumber = inode->inumber;
    stat->type = inode->d_inode.type;
    stat->size = inode->d_inode.size;
    inode_unlock(inode);
}

seek()

Change open file offset to given absolute value.

int32_t seek(int32_t fd, uint32_t offset);

Implementation @ src/filesys/sysfile.c:

int32_t
syscall_seek(void)
{
    int32_t fd;
    uint32_t offset;

    if (!sysarg_get_int(0, &fd))
        return SYS_FAIL_RC;
    if (!sysarg_get_uint(1, &offset))
        return SYS_FAIL_RC;

    if (!filesys_seek(fd, offset))
        return SYS_FAIL_RC;
    return 0;
}


// src/filesys/vsfs.c

/**
 * Moves the offset of the open file FD to the absolute position OFFSET.
 * Seeking past the current file size is refused. Returns true on
 * success and false on failures.
 */
bool
filesys_seek(int8_t fd, size_t offset)
{
    file_t *file = _find_process_file(fd);
    if (file == NULL) {
        warn("seek: cannot find file for fd %d", fd);
        return false;
    }

    /** Snapshot the file size while holding the inode lock. */
    inode_lock(file->inode);
    size_t filesize = file->inode->d_inode.size;
    inode_unlock(file->inode);

    if (offset <= filesize) {
        file->offset = offset;
        return true;
    }

    warn("seek: offset %lu beyond filesize %lu", offset, filesize);
    return false;
}

Check that it works!

Program Loading With exec()

So far, we have been doing user-level tests in a single init.c program. Apparently, any useful OS kernel cannot assume that all user-level process logics be written in a single executable file. It must have some way to load a process with an individual executable binary. In UNIX-like systems, the exec() syscall does this job.

ELF-32 Executable Format

Both the kernel and the user programs under user/ will be compiled & linked into canonical ELF-32 format. We have mentioned the ELF format briefly in an early debugging chapter, where we talked about parsing the section headers of the kernel ELF to locate its symbol table section for looking up symbol names. For loading & running a user program ELF, however, the more interesting parts are the program headers.

  • Every ELF file begins with a meta file header that describes things like:
    • How many program headers are there in the file (phnum)
    • Where to find them (phoff)
    • How many section headers are there (shnum)
    • ... (more)
  • Each program header describes one program segment of the executable code, including information of:
    • Its type (type of LOAD means this segment needs to be loaded at exec)
    • Its offset location (off)
    • Its file size (filesz, how many bytes to load from off)
    • Its memory image size (memsz, how much memory to map, might be larger than filesz due to data sections)
    • ... (more)

Please refer to this page and this page for the official specifications. Add corresponding definitions @ src/boot/elf.h:

/**
 * ELF-32 file header, found at the very beginning of every ELF file.
 * The 16-byte `e_ident` array of the specification is split here into
 * the 4-byte magic number and the 12 bytes that follow it.
 */
typedef struct elf_file_header {
    uint32_t magic;         /** In little-endian on x86. */
    uint8_t  ident[12];     /** Rest of `e_ident`. */
    uint16_t type;
    uint16_t machine;
    uint32_t version;
    uint32_t entry;         /** Stored into `%eip` by exec. */
    uint32_t phoff;         /** File offset of the program headers. */
    uint32_t shoff;
    uint32_t flags;
    uint16_t ehsize;
    uint16_t phentsize;
    uint16_t phnum;         /** Number of program headers. */
    uint16_t shentsize;
    uint16_t shnum;         /** Number of section headers. */
    uint16_t shstrndx;
} __attribute__((packed)) elf_file_header_t;

/**
 * The bytes 0x7F 'E' 'L' 'F' read as one little-endian 32-bit word.
 * See https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html#elfid
 */
#define ELF_MAGIC 0x464C457F


/** ELF-32 program header; each one describes a segment of the program. */
typedef struct elf_program_header {
    uint32_t type;          /** `ELF_PROG_TYPE_LOAD` if it must be loaded. */
    uint32_t offset;        /** Where the segment starts in the file. */
    uint32_t vaddr;         /** Virtual address to load the segment at. */
    uint32_t paddr;
    uint32_t filesz;        /** Number of bytes to load from the file. */
    uint32_t memsz;         /** Size in memory, may exceed `filesz`. */
    uint32_t flags;         /** Bitwise-or of `ELF_PROG_FLAG_*`. */
    uint32_t align;
} __attribute__((packed)) elf_program_header_t;

/** Bits of the `flags` field of a program header. */
#define ELF_PROG_FLAG_EXEC  0x1
#define ELF_PROG_FLAG_WRITE 0x2
#define ELF_PROG_FLAG_READ  0x4

/** Values of the `type` field of a program header. */
#define ELF_PROG_TYPE_LOAD  0x1

To load an ELF executable, the kernel simply loops through its program segments and maps them into the ELF region of the virtual address space sequentially.

exec()

Loads an ELF executable file at given path. On success, this syscall does not actually "return", because the trap frame's %eip will be set to the entry of newly loaded code. In any case that a call to exec() returns, it means loading failed.

The argv argument must be a NULL-terminated array of char *s, each pointing to an argument string. Seeing a NULL pointer in argv indicates the end of the argument list. The maximum number of arguments is 32.

int32_t exec(char *path, char **argv);

Implementation @ src/filesys/sysfile.c:

/**
 * Handler of the exec() syscall. Arguments: 0 = path of the executable,
 * 1 = user address of a NULL-terminated array of argument strings.
 *
 * The argument pointers are copied one by one into a kernel-side array;
 * the terminating NULL must show up within the first MAX_EXEC_ARGS
 * entries, otherwise the call fails. Returns 0 if `filesys_exec()`
 * succeeded and SYS_FAIL_RC on any failure.
 */
int32_t
syscall_exec(void)
{
    char *path;
    uint32_t uargv;

    /**
     * Reject non-positive results like every other path-taking handler
     * does; a plain `!` test would let a negative error code through.
     */
    if (sysarg_get_str(0, &path) <= 0)
        return SYS_FAIL_RC;
    if (!sysarg_get_uint(1, &uargv))
        return SYS_FAIL_RC;

    char *argv[MAX_EXEC_ARGS];
    memset(argv, 0, MAX_EXEC_ARGS * sizeof(char *));
    for (size_t argc = 0; argc < MAX_EXEC_ARGS; ++argc) {
        /** Fetch the `argc`-th 4-byte pointer of the user's argv array. */
        uint32_t uarg;
        if (!sysarg_addr_uint(uargv + 4 * argc, &uarg))
            return SYS_FAIL_RC;
        if (uarg == 0) {    /** Reached end of list. */
            argv[argc] = 0;
            if (!filesys_exec(path, argv))
                return SYS_FAIL_RC;
            return 0;
        }
        if (sysarg_addr_str(uarg, &argv[argc]) < 0)
            return SYS_FAIL_RC;
    }

    /** No terminating NULL found within MAX_EXEC_ARGS entries. */
    return SYS_FAIL_RC;
}


// src/filesys/vsfs.c

/**
 * Wrapper over `exec_program()`: looks up the inode at PATH and passes
 * it on together with the last component of PATH, which is used as the
 * process name. Returns false on failures.
 */
bool
filesys_exec(char *path, char **argv)
{
    mem_inode_t *inode = _path_lookup(path);
    if (inode == NULL) {
        warn("exec: failed to lookup path '%s'", path);
        return false;
    }

    /**
     * Walk back from the end of the string to just past the last '/',
     * so that the name never includes the separator itself. Starting at
     * the terminator also keeps an empty PATH from being indexed at -1.
     */
    char *filename = path + strlen(path);
    while (filename != path && *(filename - 1) != '/')
        filename--;

    return exec_program(inode, filename, argv);
}


// src/filesys/exec.h

/**
 * Maximum number of arguments allowed in `argv` list. It sizes the
 * kernel-side `argv` array of `syscall_exec()`, which requires the
 * terminating NULL to appear within this many entries.
 */
#define MAX_EXEC_ARGS 32

Actual implementation of exec is in a separate file @ src/filesys/exec.c:

/**
 * Refresh page table, load an executable ELF program at given inode, and
 * start execution at the entry point recorded in its ELF header. ARGV is
 * an array of strings (`char *`s) where the last one must be NULL,
 * indicating the end of argument array. FILENAME becomes the new name of
 * the process.
 *
 * INODE is unlocked and put by this function on both the success and the
 * failure path.
 *
 * The syscall does not actually return on success, since the process
 * should have jumped to the newly loaded code after returning from this
 * trap frame. Returns false on failures, in which case the process's
 * current address space is left untouched.
 */
bool
exec_program(mem_inode_t *inode, char *filename, char **argv)
{
    process_t *proc = running_proc();
    pde_t *pgdir = NULL;

    inode_lock(inode);

    /** Read in ELF header, sanity check magic number. */
    elf_file_header_t elf_header;
    if (inode_read(inode, (char *) &elf_header, 0,
                   sizeof(elf_file_header_t)) != sizeof(elf_file_header_t)) {
        warn("exec: failed to read ELF file header");
        goto fail;
    }
    if (elf_header.magic != ELF_MAGIC) {
        warn("exec: ELF header magic number mismatch");
        goto fail;
    }

    /**
     * Mimicks `initproc_init()` in `process.c`. Sets up a brand-new page table
     * and pre-maps necessary pages:
     *   - kernel mapped to lower 512MiB
     *   - program ELF binary follows
     *   - top-most stack page
     *
     * Need to set up a brand-new copy of page table because if there are any
     * errors that occur during the process, we can gracefully return an error
     * to the caller process instead of breaking it.
     */
    pgdir = (pde_t *) salloc_page();
    if (pgdir == NULL) {
        warn("exec: failed to allocate new page directory");
        goto fail;
    }
    memset(pgdir, 0, sizeof(pde_t) * PDES_PER_PAGE);

    uint32_t vaddr_btm = 0;                     /** Kernel-mapped. */
    while (vaddr_btm < PHYS_MAX) {
        pte_t *pte = paging_walk_pgdir(pgdir, vaddr_btm, true);
        if (pte == NULL)
            goto fail;
        paging_map_kpage(pte, vaddr_btm);

        vaddr_btm += PAGE_SIZE;
    }

    elf_program_header_t prog_header;           /** ELF binary. */
    uint32_t vaddr_elf_max = USER_BASE;
    for (size_t idx = 0; idx < elf_header.phnum; ++idx) {
        /** Read in this program header. */
        size_t offset = elf_header.phoff + idx * sizeof(elf_program_header_t);
        if (inode_read(inode, (char *) &prog_header, offset,
                       sizeof(elf_program_header_t)) != sizeof(elf_program_header_t)) {
            goto fail;
        }
        if (prog_header.type != ELF_PROG_TYPE_LOAD)
            continue;
        if (prog_header.memsz < prog_header.filesz)
            goto fail;

        /** Read in program segment described by this header. */
        uint32_t vaddr_curr = prog_header.vaddr;
        uint32_t vaddr_end = prog_header.vaddr + prog_header.memsz;
        uint32_t elf_curr = prog_header.offset;
        uint32_t elf_end = prog_header.offset + prog_header.filesz;

        /**
         * Refuse segments whose end address wraps around or reaches beyond
         * the ELF region. Pages mapped above `HEAP_BASE` would be covered
         * neither by the cleanup at `fail` nor by the `heap_high` recorded
         * for the process below.
         */
        if (vaddr_end < prog_header.vaddr || vaddr_end > HEAP_BASE)
            goto fail;

        while (vaddr_curr < vaddr_end) {
            /** Bytes of this page covered by the segment in memory... */
            size_t effective_v = PAGE_SIZE - ADDR_PAGE_OFFSET(vaddr_curr);
            if (effective_v > vaddr_end - vaddr_curr)
                effective_v = vaddr_end - vaddr_curr;
            /** ...and how many of those are backed by file content. */
            size_t effective_e = effective_v;
            if (effective_e > elf_end - elf_curr)
                effective_e = elf_end - elf_curr;

            /** Anything below `USER_BASE` is skipped rather than loaded. */
            if (vaddr_curr < USER_BASE) {
                vaddr_curr += effective_v;
                elf_curr += effective_e;
                continue;
            }

            pte_t *pte = paging_walk_pgdir(pgdir, vaddr_curr, true);
            if (pte == NULL)
                goto fail;
            /** Segments may share a page, so only map it if not yet present. */
            uint32_t paddr = pte->present == 0 ? paging_map_upage(pte, true)
                                               : ENTRY_FRAME_ADDR(*pte);
            if (paddr == 0)
                goto fail;
            uint32_t paddr_curr = paddr + ADDR_PAGE_OFFSET(vaddr_curr);

            if (effective_e > 0) {
                if (inode_read(inode, (char *) paddr_curr, elf_curr,
                               effective_e) != effective_e) {
                    goto fail;
                }
                elf_curr += effective_e;
            }

            vaddr_curr += effective_v;
        }

        if (vaddr_curr > vaddr_elf_max)
            vaddr_elf_max = ADDR_PAGE_ROUND_UP(vaddr_curr);
    }

    /** Done with the file; clear the pointer so `fail` won't touch it. */
    inode_unlock(inode);
    inode_put(inode);
    inode = NULL;

    while (vaddr_elf_max < HEAP_BASE) {         /** Rest of ELF region. */
        pte_t *pte = paging_walk_pgdir(pgdir, vaddr_elf_max, true);
        if (pte == NULL)
            goto fail;
        uint32_t paddr = paging_map_upage(pte, true);
        if (paddr == 0)
            goto fail;

        vaddr_elf_max += PAGE_SIZE;
    }

    uint32_t vaddr_top = USER_MAX - PAGE_SIZE;  /** Top stack page. */
    pte_t *pte_top = paging_walk_pgdir(pgdir, vaddr_top, true);
    if (pte_top == NULL)
        goto fail;
    uint32_t paddr_top = paging_map_upage(pte_top, true);
    if (paddr_top == 0)
        goto fail;
    memset((char *) paddr_top, 0, PAGE_SIZE);

    /**
     * Push argument strings to the stack, then push the argv list
     * pointing to those strings, followed by `argv`, `argc`.
     *
     * Only the top stack page is mapped, so everything pushed here must
     * stay at or above `vaddr_top`; running out of room fails the exec
     * instead of writing below the mapped frame.
     */
    uint32_t sp = USER_MAX;
    uint32_t ustack[3 + MAX_EXEC_ARGS + 1];
    size_t argc = 0;
    for (argc = 0; argv[argc] != NULL; ++argc) {
        if (argc >= MAX_EXEC_ARGS)
            goto fail;
        size_t arg_len = strlen(argv[argc]) + 1;
        if (arg_len > sp - vaddr_top)
            goto fail;
        sp = sp - arg_len;
        sp &= 0xFFFFFFFC;   /** Align to 32-bit words. */
        memcpy((char *) (paddr_top + PAGE_SIZE - (USER_MAX - sp)), argv[argc],
               arg_len);
        ustack[3 + argc] = sp;
    }
    ustack[3 + argc] = 0;       /** End of argv list. */

    size_t ustack_size = (3 + argc + 1) * 4;
    if (ustack_size > sp - vaddr_top)
        goto fail;

    ustack[2] = sp - (argc + 1) * 4;       /** `argv` */
    ustack[1] = argc;                      /** `argc` */
    ustack[0] = 0x0000DEAD;  /** Fake return address. */

    sp -= ustack_size;
    memcpy((char *) (paddr_top + PAGE_SIZE - (USER_MAX - sp)), ustack,
           ustack_size);

    /**
     * Change process name. Bound the copy by the destination and always
     * terminate it, as `strncpy()` does not do so by itself.
     * NOTE(review): assumes `proc->name` is a fixed-size char array --
     * confirm against `process.h`.
     */
    strncpy(proc->name, filename, sizeof(proc->name) - 1);
    proc->name[sizeof(proc->name) - 1] = '\0';

    /** Switch to the new page directory, discarding old state. */
    pde_t *old_pgdir = proc->pgdir;
    uint32_t old_heap_high = proc->heap_high;
    uint32_t old_stack_low = proc->stack_low;

    proc->pgdir = pgdir;
    proc->stack_low = vaddr_top;
    proc->heap_high = HEAP_BASE;
    proc->trap_state->esp = sp;
    proc->trap_state->eip = elf_header.entry;   /** `main()` function. */
    paging_switch_pgdir(proc->pgdir);

    paging_unmap_range(old_pgdir, USER_BASE, old_heap_high);
    paging_unmap_range(old_pgdir, old_stack_low, USER_MAX);
    paging_destroy_pgdir(old_pgdir);
    return true;

fail:
    /** Tear down the half-built address space, if any. */
    if (pgdir != NULL) {
        paging_unmap_range(pgdir, USER_BASE, HEAP_BASE);
        paging_unmap_range(pgdir, USER_MAX - PAGE_SIZE, USER_MAX);
        paging_destroy_pgdir(pgdir);
    }
    /** The inode is still held only if loading failed before its release. */
    if (inode != NULL) {
        inode_unlock(inode);
        inode_put(inode);
    }
    return false;
}

Please see the comments for more explanation on this long function. Notice that it builds up a new page directory instead of mapping directly into the process's current page directory. The reason is that, in case of any failure, we can simply discard the new page directory without breaking the process ✭.

Progress So Far

In previous chapters, we've been writing a temporary shell function in init.c. Let's try moving it to a separate user program file that will be compiled into an individual ELF! Code @ user/shell.c:

#include "lib/syscall.h"
#include "lib/printf.h"
#include "lib/debug.h"
#include "lib/string.h"


/**
 * Entry point of the temporary shell: prints the welcome logo and a
 * greeting, then loops forever reading a line from the keyboard and
 * echoing it back.
 */
void
main(int argc, char *argv[])
{
    char cmd_buf[256];

    _shell_welcome_logo();

    printf("Hello from the exec'ed shell program!\n");

    memset(cmd_buf, 0, sizeof(cmd_buf));

    for (;;) {
        printf("temp shell $ ");

        if (kbdstr(cmd_buf, sizeof(cmd_buf)) >= 0)
            printf("%s", cmd_buf);
        else
            warn("shell: failed to get keyboard string");

        /** Start every round with a clean buffer. */
        memset(cmd_buf, 0, sizeof(cmd_buf));
    }
}

The init program now looks like:

// user/init.c

#include "lib/syscall.h"
#include "lib/debug.h"


/**
 * Entry point of the init process: forks a child that exec's the shell
 * program, while the parent stays behind and reaps zombie processes
 * until the shell itself has exited.
 */
void
main(void)
{
    // info("init: starting the shell process...");

    int8_t shell_pid = fork(0);
    if (shell_pid < 0) {
        error("init: failed to fork a child process");
        exit();
    }

    if (shell_pid == 0) {
        /** Child: exec the command line shell. */
        char *argv[2];
        argv[0] = "shell";
        argv[1] = NULL;
        exec("shell", argv);

        /**
         * Only reached if exec failed. Exit explicitly, as the fork
         * failure path above does, rather than falling off the end of
         * `main()` as a second copy of init.
         */
        error("init: failed to exec the shell program");
        exit();

    } else {
        /** Parent: loop in catching zombie processes. */
        int8_t wait_pid;
        do {
            wait_pid = wait();
            if (wait_pid > 0 && wait_pid != shell_pid)
                warn("init: caught zombie process %d", wait_pid);
        } while (wait_pid > 0 && wait_pid != shell_pid);
        error("init: the shell process exits, should not happen");
    }
}

This should produce a terminal window as the following after booting up:

A kindly reminder: changes to the file system image vsfs.img will persist across multiple QEMU runs, and the mkfs.py script skips if it sees the output file already exists. During development and debugging of the file system, be sure to remove and re-format the file system image by e.g. rm vsfs.img; make to guarantee a fresh state after every build.

Current repo structure:

hux-kernel
├── Makefile
├── scripts
│   ├── gdb_init
│   ├── grub.cfg
│   ├── kernel.ld
│   └── mkfs.py
├── src
│   ├── boot
│   │   ├── boot.s
│   │   ├── elf.h
│   │   └── multiboot.h
│   ├── common
│   │   ├── bitmap.c
│   │   ├── bitmap.h
│   │   ├── debug.c
│   │   ├── debug.h
│   │   ├── intstate.c
│   │   ├── intstate.h
│   │   ├── parklock.c
│   │   ├── parklock.h
│   │   ├── port.c
│   │   ├── port.h
│   │   ├── printf.c
│   │   ├── printf.h
│   │   ├── spinlock.c
│   │   ├── spinlock.h
│   │   ├── string.c
│   │   ├── string.h
│   │   ├── types.c
│   │   └── types.h
│   ├── device
│   │   ├── idedisk.c
│   │   ├── idedisk.h
│   │   ├── keyboard.c
│   │   ├── keyboard.h
│   │   ├── sysdev.c
│   │   ├── sysdev.h
│   │   ├── timer.c
│   │   └── timer.h
│   ├── display
│   │   ├── sysdisp.c
│   │   ├── sysdisp.h
│   │   ├── terminal.c
│   │   ├── terminal.h
│   │   └── vga.h
│   ├── filesys
│   │   ├── block.c
│   │   ├── block.h
│   │   ├── exec.c
│   │   ├── exec.h
│   │   ├── file.c
│   │   ├── file.h
│   │   ├── sysfile.c
│   │   ├── sysfile.h
│   │   ├── vsfs.c
│   │   └── vsfs.h
│   ├── interrupt
│   │   ├── idt-load.s
│   │   ├── idt.c
│   │   ├── idt.h
│   │   ├── isr-stub.s
│   │   ├── isr.c
│   │   ├── isr.h
│   │   ├── syscall.c
│   │   └── syscall.h
│   ├── memory
│   │   ├── gdt-load.s
│   │   ├── gdt.c
│   │   ├── gdt.h
│   │   ├── kheap.c
│   │   ├── kheap.h
│   │   ├── paging.c
│   │   ├── paging.h
│   │   ├── slabs.c
│   │   ├── slabs.h
│   │   ├── sysmem.c
│   │   └── sysmem.h
│   ├── process
│   │   ├── layout.h
│   │   ├── process.c
│   │   ├── process.h
│   │   ├── scheduler.c
│   │   ├── scheduler.h
│   │   ├── switch.s
│   │   ├── sysproc.c
│   │   └── sysproc.h
│   └── kernel.c
├── user
│   ├── lib
│   │   ├── debug.h
│   │   ├── malloc.c
│   │   ├── malloc.h
│   │   ├── printf.c
│   │   ├── printf.h
│   │   ├── string.c
│   │   ├── string.h
│   │   ├── syscall.h
│   │   ├── syscall.s
│   │   ├── syslist.s
│   │   ├── types.c
│   │   └── types.h
│   ├── init.c
│   └── shell.c
⚠️ **GitHub.com Fallback** ⚠️