ubifs: How ubifs_link affects a tmpfile's orphan node - 549642238/linux-stable GitHub Wiki

orphan node是一个已经打开的文件进行unlink或删除操作导致其nlink count减为0的inode。在ubifs中,创建tmpfile也会生成orphan inode。下面以mount后创建tmpfile并生成orphan inode到umount再mount后对orphan inode的处理流程为例,在中间过程发生link操作对orphan inode的影响:

A. 在一个已成功mount的ubifs文件系统上创建tmpfile,调用链为:i_op_tmpfile -> ubifs_tmpfile -> do_tmpfile -> ubifs_jnl_update -> ubifs_add_orphan -> orphan_add

/*
 * do_tmpfile - excerpt of the tmpfile creation path ("..." marks elided code).
 *
 * Shows that on both branches the new inode reaches ubifs_jnl_update() with
 * i_nlink == 0, and that the journal update is issued with deletion=1, xent=0.
 */
static int do_tmpfile(struct inode *dir, struct dentry *dentry,
		      umode_t mode, struct inode **whiteout)
{
	...
        if (whiteout) {
                mark_inode_dirty(inode);
                drop_nlink(inode);                                              // decrement i_nlink; the tmpfile inode's i_nlink is now 0
                *whiteout = inode;
        } else {
                d_tmpfile(dentry, inode);                                       // per the author's note, this calls drop_nlink, so i_nlink is now 0 as well
        }
        // at this point the tmpfile inode has i_nlink == 0
	...
	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
	...
}

/*
 * ubifs_jnl_update - excerpt ("..." marks elided code).
 *
 * Only the part relevant to orphans is shown: when the update is a deletion
 * and the inode's link count is already 0, the inode number is registered as
 * an orphan.
 */
int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
		     const struct qstr *nm, const struct inode *inode,
		     int deletion, int xent)
{
	int last_reference = !!(deletion && inode->i_nlink == 0);               // evaluates to 1 for the tmpfile call (deletion=1, i_nlink==0)
        if (last_reference) {
                err = ubifs_add_orphan(c, inode->i_ino);                        // always reached for the inode passed in by tmpfile creation
	...
}

// ubifs_add_orphan -> orphan_add
/*
 * orphan_add - excerpt ("..." marks elided code).
 *
 * Allocates the in-memory orphan record for @inum, flags it as new and, under
 * c->orphan_lock, links it into the red-black tree c->orph_tree and the lists
 * c->orph_list and c->orph_new while bumping the orphan counters.
 */
static struct ubifs_orphan *orphan_add(struct ubifs_info *c, ino_t inum,
                                       struct ubifs_orphan *parent_orphan)
{
	struct ubifs_orphan *orphan, *o;
	orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);      // in-memory representation of the orphan node
	orphan->inum = inum;
	orphan->new = 1;                                              // set the orphan's "new" flag
	...
	spin_lock(&c->orphan_lock);                                   // take c->orphan_lock
	p = &c->orph_tree.rb_node;
	while (*p) {                                                  // descend c->orph_tree to find where the orphan will be inserted
		parent = *p;
		o = rb_entry(parent, struct ubifs_orphan, rb);
		...
	}
	c->tot_orphans += 1;
	c->new_orphans += 1;                                         // counts new orphans, i.e. ones not yet committed
	rb_link_node(&orphan->rb, parent, p);
	rb_insert_color(&orphan->rb, &c->orph_tree);
	list_add_tail(&orphan->list, &c->orph_list);                 // append the orphan to c->orph_list
	list_add_tail(&orphan->new_list, &c->orph_new);              // append the orphan to c->orph_new
	spin_unlock(&c->orphan_lock);                                // release c->orphan_lock
	...
}

B. 很多操作都可以触发do_commit,比如mkdir、link、mknod等。do_commit对orphan node的处理有两个阶段,分别是ubifs_orphan_start_commit和ubifs_orphan_end_commit。

B1. ubifs_orphan_start_commit

/*
 * ubifs_orphan_start_commit - first orphan phase of do_commit.
 *
 * With c->orphan_lock held, every orphan on c->orph_new is switched from
 * "new" to "being committed" and chained onto the singly linked list headed
 * by c->orph_cnext. The new-orphan count is handed over to c->cmt_orphans,
 * c->orph_new is reset to empty, and c->no_orphs records whether any orphan
 * exists at all. Always returns 0.
 */
int ubifs_orphan_start_commit(struct ubifs_info *c)
{
	struct ubifs_orphan *orphan;
	struct ubifs_orphan **tail = &c->orph_cnext;

	spin_lock(&c->orphan_lock);

	list_for_each_entry(orphan, &c->orph_new, new_list) {
		ubifs_assert(orphan->new);
		ubifs_assert(!orphan->cmt);

		/* new -> cmt: this orphan now belongs to the running commit */
		orphan->cmt = 1;
		orphan->new = 0;

		/* append to the chain that starts at c->orph_cnext */
		*tail = orphan;
		tail = &orphan->cnext;
	}
	*tail = NULL;

	/* everything that was new is now pending commit; nothing new remains */
	c->cmt_orphans = c->new_orphans;
	c->new_orphans = 0;
	dbg_cmt("%d orphans to commit", c->cmt_orphans);
	INIT_LIST_HEAD(&c->orph_new);

	c->no_orphs = (c->tot_orphans == 0) ? 1 : 0;

	spin_unlock(&c->orphan_lock);

	return 0;
}

B2. ubifs_orphan_end_commit -> commit_orphans -> write_orph_nodes -> write_orph_node -> do_write_orph_node

/*
 * ubifs_orphan_end_commit - second orphan phase of do_commit.
 *
 * Writes out the orphans pending commit (if any), then frees the orphans that
 * were deleted while the commit was running, and finally runs the orphan
 * debug check. Returns 0 on success or the first error encountered.
 */
int ubifs_orphan_end_commit(struct ubifs_info *c)
{
	/* only call commit_orphans when there is something to commit */
	if (c->cmt_orphans != 0) {
		int ret = commit_orphans(c);

		if (ret)
			return ret;
	}

	/*
	 * erase_deleted() must run unconditionally. It walks c->orph_dnext,
	 * removes each orphan from the rb-tree and the list, and frees it.
	 * The orphans are written in batches and c->orphan_lock is released
	 * between batches; a link operation in that window moves an orphan
	 * whose cmt flag is set onto c->orph_dnext. Such an orphan is still
	 * written to flash: once cmt is set it is already on c->orph_cnext,
	 * and everything on c->orph_cnext gets written.
	 */
	erase_deleted(c);

	return dbg_check_orphans(c);
}

/*
 * erase_deleted - free every orphan queued on c->orph_dnext.
 *
 * Under c->orphan_lock, each queued orphan is removed from c->orph_tree and
 * c->orph_list, c->tot_orphans is decremented and the orphan is freed. The
 * c->orph_dnext chain is empty on return.
 */
static void erase_deleted(struct ubifs_info *c)
{
        struct ubifs_orphan *orphan, *following;

        spin_lock(&c->orphan_lock);

        for (orphan = c->orph_dnext; orphan; orphan = following) {
                /* fetch the successor first: orphan is freed below */
                following = orphan->dnext;

                ubifs_assert(c, !orphan->new);
                ubifs_assert(c, orphan->del);

                rb_erase(&orphan->rb, &c->orph_tree);
                list_del(&orphan->list);
                c->tot_orphans -= 1;
                dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum);
                kfree(orphan);
        }

        /* the deferred-delete chain is now empty */
        c->orph_dnext = NULL;

        spin_unlock(&c->orphan_lock);
}

/*
 * write_orph_nodes - excerpt ("..." marks elided code).
 *
 * Keeps calling write_orph_node() until no orphans pending commit remain.
 */
static int write_orph_nodes(struct ubifs_info *c, int atomic)
{
	...
	while (c->cmt_orphans > 0) {
		err = write_orph_node(c, atomic);                     // handle the pending orphans batch by batch until c->cmt_orphans reaches 0
		...
	}
	...
}

/*
 * write_orph_node - excerpt ("..." marks elided code).
 *
 * Handles one batch of cnt orphans: under c->orphan_lock their inode numbers
 * are copied into c->orph_buf and their cmt flags cleared; the lock is then
 * dropped before the buffer is written to flash.
 */
static int write_orph_node(struct ubifs_info *c, int atomic)
{
	struct ubifs_orph_node *orph;                                 // on-flash representation of an orphan node
	...
	orph = c->orph_buf;                                           // c->orph_buf is what gets written to flash; filling orph fills c->orph_buf
	spin_lock(&c->orphan_lock);                                   // take c->orphan_lock
	cnext = c->orph_cnext;
	for (i = 0; i < cnt; i++) {                                   // take cnt orphans from the head of the c->orph_cnext chain
		orphan = cnext;
		ubifs_assert(orphan->cmt);
		orph->inos[i] = cpu_to_le64(orphan->inum);            // record the orphan's inode number in orph
		orphan->cmt = 0;                                      // clear the orphan's cmt flag
		cnext = orphan->cnext;
		orphan->cnext = NULL;                                 // detach this orphan from the chain
	}
	c->orph_cnext = cnext;
	c->cmt_orphans -= cnt;                                        // this call consumed cnt of the orphans pending commit
	spin_unlock(&c->orphan_lock);                                 // release c->orphan_lock
	...
	err = do_write_orph_node(c, len, atomic);                     // write c->orph_buf to flash
	...
}

C. 在ubifs umount时,会调用generic_shutdown_super -> sop_put_super -> ubifs_put_super -> ubifs_umount -> free_orphans

/*
 * free_orphans - release all in-memory orphan state at unmount.
 *
 * Frees every orphan still chained on c->orph_dnext, then every orphan left
 * on c->orph_list (logging an error for each, since that list is expected to
 * be empty by now), and finally the orphan write buffer c->orph_buf.
 */
static void free_orphans(struct ubifs_info *c)
{
	struct ubifs_orphan *victim;

	/* drain the deferred-delete chain */
	for (victim = c->orph_dnext; victim; victim = c->orph_dnext) {
		c->orph_dnext = victim->dnext;
		list_del(&victim->list);
		kfree(victim);
	}

	/* drain whatever is left on the main orphan list */
	while (!list_empty(&c->orph_list)) {
		victim = list_entry(c->orph_list.next, struct ubifs_orphan, list);
		list_del(&victim->list);
		kfree(victim);
		ubifs_err(c, "orphan list not empty at unmount");
	}

	vfree(c->orph_buf);
	c->orph_buf = NULL;
}

D. mount时处理Flash上的orphan node记录。ubifs_mount -> ubifs_fill_super -> mount_ubifs -> ubifs_mount_orphans -> ubifs_clear_orphans

/*
 * ubifs_mount_orphans - excerpt ("..." marks elided code).
 *
 * Chooses how the orphan records found on flash are treated at mount time,
 * depending on whether the previous unmount was clean and whether the mount
 * is read-only.
 */
int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
{
	...
	if (unclean)                                                 // the previous unmount was not clean: recovery is needed
		err = kill_orphans(c);
	else if (!read_only)                                         // the previous unmount was clean and the file system is not read-only
		err = ubifs_clear_orphans(c);                        // erase the orphan records on flash

	return err;
}

kill_orphans -> do_kill_orphans:

/*
 * do_kill_orphans - excerpt ("..." marks elided code).
 *
 * Walks the orphan nodes scanned from one LEB. For every inode number
 * recorded in an orphan node the inode is looked up in the TNC; it is removed
 * from the TNC only when its link count is still 0. The inode number is then
 * passed to insert_dead_orphan() in either case.
 */
static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
			   unsigned long long *last_cmt_no, int *outofdate,
			   int *last_flagged)
{
	struct ubifs_scan_node *snod;
	struct ubifs_orph_node *orph;
	...
	/* sleb is the LEB data, read from flash, that contains orphan nodes */
	list_for_each_entry(snod, &sleb->nodes, list) {
		orph = snod->node;
		...
                /* payload length / 8: presumably one 64-bit inode number per entry */
                n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
                for (i = 0; i < n; i++) {       /* one iteration per recorded orphan inode */
                        ...
                        /* look up the inode that belongs to this inode number */
                        err = ubifs_tnc_lookup(c, &key1, ino);
                        if (err)
                                goto out_free;

                        /*
                         * Check whether an inode can really get deleted.
                         * linkat() with O_TMPFILE allows rebirth of an inode.
                         *
                         * A tmpfile is put on the orphan list when it is
                         * created. If its orphan record reaches flash during
                         * a commit, the file is linked afterwards (so it is
                         * no longer an orphan) and an unclean unmount
                         * follows, then without this nlink check the next
                         * mount could remove the inode from the TNC tree.
                         */
                        if (ino->nlink == 0) {
                                dbg_rcvry("deleting orphaned inode %lu",
                                          (unsigned long)inum);

                                lowest_ino_key(c, &key1, inum);
                                highest_ino_key(c, &key2, inum);

                                /*
                                 * Remove the orphan inode from the TNC tree:
                                 * because of the unclean unmount it may not
                                 * have been removed in time.
                                 */
                                err = ubifs_tnc_remove_range(c, &key1, &key2);
                                if (err)
                                        goto out_ro;
                        }

                        err = insert_dead_orphan(c, inum);
                        if (err)
                                goto out_free;
                }
		...
	}
	...
}

关于TNC Tree:

TNC Tree是ubifs在内存管理索引节点(每个inode对应一个index node)的数据结构,对TNC Tree更新直接影响到inode在Flash上的存储。

1. 对于tmpfile,创建时将inode加入TNC Tree:

ubifs_tmpfile -> do_tmpfile -> ubifs_jnl_update -> ubifs_tnc_add

2. 关闭文件,iput中检查inode的nlink计数,如果为0,则从TNC Tree中移除:

SYSCALL_DEFINE1(close, unsigned int, fd) -> __close_fd -> filp_close -> fput -> ____fput -> __fput -> dput -> dentry_kill -> __dentry_kill -> dentry_iput -> iput(如果nlink为0) -> iput_final -> evict -> sb_evict_inode(inode) -> ubifs_evict_inode -> ubifs_tnc_remove_ino

ubifs_link操作:

/*
 * ubifs_link - excerpt ("..." marks elided code).
 *
 * Only the O_TMPFILE special case is shown: an inode whose link count is 0
 * has its orphan record dropped before the link count is raised.
 */
static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	...
	/* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. */
	if (inode->i_nlink == 0)                                       // the inode being linked had a link count of 0, so it may be a tmpfile
		ubifs_delete_orphan(c, inode->i_ino);                  // drop the inode from the orphan bookkeeping so it is not recorded as an orphan on flash; otherwise, after an unclean unmount, the mount-time walk over the on-flash orphans would remove it from the TNC tree (see step D)

	inc_nlink(inode);                                              // increment the hard-link count; for a tmpfile it becomes 1
	...
}

ubifs_delete_orphan:

/*
 * ubifs_delete_orphan - drop the orphan record for inode number @inum.
 *
 * Looks the orphan up under c->orphan_lock. If it exists, each orphan on its
 * child_list is unlinked from that list and passed to orphan_delete(), then
 * the orphan itself is passed to orphan_delete(). If no orphan is found, an
 * error is logged and the stack is dumped after the lock has been released.
 */
void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
{
        struct ubifs_orphan *target, *kid, *kid_next;

        spin_lock(&c->orphan_lock);

        /* the orphan is expected to be present in the rb-tree c->orph_tree */
        target = lookup_orphan(c, inum);
        if (target) {
                /* children first; the _safe iterator tolerates the list_del */
                list_for_each_entry_safe(kid, kid_next, &target->child_list, child_list) {
                        list_del(&kid->child_list);
                        orphan_delete(c, kid);
                }

                orphan_delete(c, target);

                spin_unlock(&c->orphan_lock);
                return;
        }

        /* not found: report it outside the lock */
        spin_unlock(&c->orphan_lock);
        ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum);
        dump_stack();
}

/*
 * orphan_delete - remove one orphan from the in-memory bookkeeping.
 *
 * Called by ubifs_delete_orphan() with c->orphan_lock held. Three cases:
 *   - del already set: the orphan was deleted before, nothing is done;
 *   - cmt set: the orphan is part of a running commit, so it is only flagged
 *     and pushed onto c->orph_dnext for erase_deleted() to free later;
 *   - otherwise: the orphan is dropped and freed right away.
 */
static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph)
{
        /* Already marked as deleted: leave it alone. */
        if (orph->del) {
                dbg_gen("deleted twice ino %lu", (unsigned long)orph->inum);
                return;
        }

        /* The orphan is currently being committed: defer the actual removal. */
        if (orph->cmt) {
                orph->del = 1;                  /* flag the orphan as deleted */
                orph->dnext = c->orph_dnext;
                c->orph_dnext = orph;           /* push onto the c->orph_dnext chain */
                dbg_gen("delete later ino %lu", (unsigned long)orph->inum);
                return;
        }

        /*
         * Reaching this point means the orphan has either finished being
         * committed (do_commit -> ubifs_orphan_end_commit) or is still on
         * c->orph_new, i.e. it has not been committed yet.
         */
        __orphan_drop(c, orph);
}

/*
 * __orphan_drop - unlink an orphan from every structure and free it.
 *
 * Removes @o from c->orph_new (when it is still flagged new), from the
 * rb-tree c->orph_tree and from c->orph_list, adjusts the matching counters
 * and frees the orphan.
 */
static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o)
{
        /* not committed yet: it is still accounted as a new orphan */
        if (o->new) {
                c->new_orphans -= 1;
                list_del(&o->new_list);
        }

        /* detach from the lookup tree and from the list of all orphans */
        list_del(&o->list);
        rb_erase(&o->rb, &c->orph_tree);
        c->tot_orphans -= 1;

        kfree(o);
}

那么现在考虑ubifs_link操作在一个tmpfile上发生在do_commit各阶段所带来的影响。假设tmpfile的inode为tmp_inode:

1. 如果ubifs_link发生在ubifs_orphan_start_commit对orphan node置位cmt之前(orphan node执行提交操作之前):

ubifs_link -> ubifs_delete_orphan直接将orphan node从各种链表和红黑树中删除,orphan node被释放,orphan node没有被放入c->orph_cnext链表,也不会被写入Flash。在关闭tmpfile时,检测到tmp_inode的nlink计数大于0,不需要从TNC Tree中删除。在umount后再mount时,不管umount操作是否clean,tmp_inode都不会从TNC Tree中移除,因为Flash中没有对应tmp_inode的orphan node的记录。

2. 如果ubifs_link发生在do_commit -> ubifs_orphan_start_commit对orphan node置位cmt之后,并且在do_commit -> ubifs_orphan_end_commit -> commit_orphans中对orphan node清位cmt之前(orphan node正在执行提交操作):

ubifs_link -> ubifs_delete_orphan将orphan node的del位置1,将orphan node加入c->orph_dnext链表,此时orphan node已经被放入c->orph_cnext链表,放入c->orph_cnext链表的orphan node一定会被写入Flash。然后ubifs_orphan_end_commit -> erase_deleted将所有在c->orph_dnext链表中的orphan node从其他所有链表和红黑树中移除,并释放orphan node。在关闭tmpfile时,检测到tmp_inode的nlink计数大于0,不需要从TNC Tree中删除。在umount后再mount时,如果上次umount是clean,直接将Flash中orphan node记录擦除,正常结束;如果上次umount是unclean,do_kill_orphans若不检查nlink,就会根据Flash中记录的orphan node(inum)将tmp_inode从TNC Tree中删除,而这个文件其实在umount前被link过了,nlink计数为1,不应该被删除。所以在do_kill_orphans中要对每个读取到的orphan inode的nlink进行判断,只有为0时才从TNC Tree上删除。

3. 如果ubifs_link发生在do_commit -> ubifs_orphan_end_commit -> commit_orphans中对orphan node清位cmt之后(完成orphan node提交操作):

ubifs_link -> ubifs_delete_orphan直接将orphan node从各种链表和红黑树中删除,orphan node被释放,但orphan node已经被写入Flash(因为write_orph_node中cmt清位的orphan node会被写入Flash)。在关闭tmpfile时,检测到tmp_inode的nlink计数大于0,不需要从TNC Tree中删除。在umount后再mount时,如果上次umount是clean,直接将Flash中orphan node记录擦除,正常结束;如果上次umount是unclean,do_kill_orphans若不检查nlink,就会根据Flash中记录的orphan node(inum)将tmp_inode从TNC Tree中删除,而这个文件其实在umount前被link过了,nlink计数为1,不应该被删除。所以在do_kill_orphans中要对每个读取到的orphan inode的nlink进行判断,只有为0时才从TNC Tree上删除。

基于tmpfile进行link操作的示例代码如下:

#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

/*
 * Demo: create an unnamed temporary file under /root with O_TMPFILE, write a
 * few bytes to it, then give it a name by calling linkat() on its
 * /proc/self/fd/<fd> entry.
 *
 * Returns 0 on success and 1 as soon as any step fails; the failing step is
 * reported on stderr together with the errno text.
 */
int main(void)
{
	char tmp_fname[30];
	ssize_t written;
	int len;
	int rc = 1;
	/*
	 * __O_TMPFILE is kept as in the original sample: it is the spelling that
	 * is available with the headers included here without further feature
	 * macros.
	 */
	int fd = open("/root", __O_TMPFILE | O_RDWR, S_IRUSR | S_IWUSR);

	/* Without a valid descriptor none of the following steps make sense. */
	if (fd < 0) {
		perror("E 1");
		return rc;
	}

	written = write(fd, "123", 3);
	if (written < 0) {
		perror("E 2");
		goto out;
	}
	if (written != 3) {
		fprintf(stderr, "E 2 short write\n");
		goto out;
	}

	/* Bounded formatting; the descriptor must stay open until linkat() ran. */
	len = snprintf(tmp_fname, sizeof(tmp_fname), "/proc/self/fd/%d", fd);
	if (len < 0 || (size_t)len >= sizeof(tmp_fname)) {
		fprintf(stderr, "E 2 path too long\n");
		goto out;
	}

	/*
	 * AT_SYMLINK_FOLLOW makes linkat() follow the /proc/self/fd/<fd> symlink
	 * to the tmpfile's inode. The new path is absolute, so the second
	 * directory descriptor is not consulted.
	 */
	if (linkat(AT_FDCWD, tmp_fname, AT_FDCWD, "/root/tmp/newfile",
		   AT_SYMLINK_FOLLOW) != 0) {
		perror("E 3");
		goto out;
	}

	rc = 0;
out:
	close(fd);
	return rc;
}
⚠️ **GitHub.com Fallback** ⚠️