Bridge框架 - awokezhou/LinuxPage GitHub Wiki

概述

本文通过对Linux Bridge相关源码进行分析,梳理Bridge框架和原理、数据转发和学习机制,主要内容包括brctl命令、Bridge初始化、Bridge添加操作、Bridge绑定网卡操作和Bridge数据处理流程

brctl源码分析

Linux环境下配置Bridge的命令为brctl,其源码位于busybox/networking/brctl.c文件,brctl的命令用法为

命令 参数 作用
brctl addbr br_name 为系统添加一个名为“br_name”的网桥设备
brctl addif br_name if_name 为名为“br_name”的网桥设备绑定一个名为“if_name”的网络接口

最简单的一个应用场景,假如系统里有两个网络接口,eth0和eth1,如何做到两个接口数据互通呢?

brctl addbr br0
brctl addif br0 eth0
brctl addif br0 eth1
ifconfig br0 192.168.1.1 netmask 255.255.255.0 up

这样就将eth0和eth1都绑定到了br0这个网桥设备上,br0可以理解为一个软交换机,它有两个端口,分别为eth0和eth1,从任何一个端口进来的数据,对其源mac进行学习,根据目的mac进行转发,在内核中维护一个桥接表

那么它是如何实现这些功能的呢,先来看一下busybox中的brctl在做addbr和addif时都做了什么

	br = *argv++;

	if (key == ARG_addbr || key == ARG_delbr) { /* addbr or delbr */
		ioctl_or_perror_and_die(fd,
				key == ARG_addbr ? SIOCBRADDBR : SIOCBRDELBR,
				br, "bridge %s", br);
		goto done;
	}

	if (!*argv) /* all but 'addif/delif' need at least two arguments */
		bb_show_usage();

	strncpy(ifr.ifr_name, br, IFNAMSIZ);
	if (key == ARG_addif || key == ARG_delif) { /* addif or delif */
		brif = *argv;
		ifr.ifr_ifindex = if_nametoindex(brif);
		if (!ifr.ifr_ifindex) {
			bb_perror_msg_and_die("iface %s", brif);
		}
		ioctl_or_perror_and_die(fd,
				key == ARG_addif ? SIOCBRADDIF : SIOCBRDELIF,
				&ifr, "bridge %s", br);
		goto done_next_argv;
	}

以上代码节选了brctl.c中addbr和addif操作的一段,当输入addbr、delbr、addif、delif时,都会调用ioctl_or_perror_and_die()这个函数来处理,向这个函数传的参数有所变化,ioctl_or_perror_and_die()函数实际上就是调用了一个ioctl将参数传递到内核,如下

/* Issue ioctl(fd, request, argp). On failure, print the caller-supplied
 * formatted message (with strerror(errno) appended by bb_verror_msg)
 * and terminate via xfunc_die(). Returns the ioctl result on success. */
int FAST_FUNC ioctl_or_perror_and_die(int fd, unsigned request, void *argp, const char *fmt,...)
{
	int ret;
	va_list p;

	ret = ioctl(fd, request, argp);
	if (ret < 0) {
		va_start(p, fmt);
		bb_verror_msg(fmt, p, strerror(errno));
		/* xfunc_die can actually longjmp, so be nice */
		va_end(p);
		xfunc_die();
	}
	return ret;
}

总结一下,不同操作命令下,向内核传递的ioctl类型和传递值分别为

命令 ioctl参数 传递值
brctl addbr SIOCBRADDBR 网桥设备名称
brctl delbr SIOCBRDELBR 网桥设备名称
brctl addif SIOCBRADDIF ifr结构,网桥设备名称放在ifr.ifr_name中,要绑定的网络接口index放在ifr.ifr_ifindex中
brctl delif SIOCBRDELIF ifr结构,网桥设备名称放在ifr.ifr_name中,要解绑的网络接口index放在ifr.ifr_ifindex中

Bridge源码分析

梳理清楚了brctl命令的传参情况,进入kernel代码里看看Bridge是怎么运行的

Bridge初始化

先分析一下初始化代码。Bridge相关代码位于net/bridge/下,其初始化代码位于br.c中

/* Module init for the bridge subsystem: registers STP protocol handling,
 * the forwarding-database (fdb) cache, the per-netns ops, bridge netfilter
 * hooks, a netdevice notifier and netlink support, then installs the
 * bridge ioctl hook. On any failure, everything registered so far is
 * unwound in reverse order via the err_out* labels. */
static int __init br_init(void)
{
	int err;

	err = stp_proto_register(&br_stp_proto);
	if (err < 0) {
		pr_err("bridge: can't register sap for STP\n");
		return err;
	}

	err = br_fdb_init();
	if (err)
		goto err_out;

	err = register_pernet_subsys(&br_net_ops);
	if (err)
		goto err_out1;

	err = br_netfilter_init();
	if (err)
		goto err_out2;

	err = register_netdevice_notifier(&br_device_notifier);
	if (err)
		goto err_out3;

	err = br_netlink_init();
	if (err)
		goto err_out4;

	/* SIOCBRADDBR/SIOCBRDELBR from user space land here (via sock_ioctl) */
	brioctl_set(br_ioctl_deviceless_stub);

#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
	br_fdb_test_addr_hook = br_fdb_test_addr;
#endif

	return 0;
err_out4:
	unregister_netdevice_notifier(&br_device_notifier);
err_out3:
	br_netfilter_fini();
err_out2:
	unregister_pernet_subsys(&br_net_ops);
err_out1:
	br_fdb_fini();
err_out:
	stp_proto_unregister(&br_stp_proto);
	return err;
}
module_init(br_init)
module_exit(br_deinit)
module_init(br_init)
module_exit(br_deinit)

在最下方可以看到,Bridge和网络子系统不同,是以module方式被内核加载的,由于在对内核做make menuconfig操作时,将Bridge选择为了静态module,所以这段初始化代码在Linux系统启动时,就会被加载

首先是调用了一个stp_proto_register()函数,没有仔细研究过,应该是和生成树协议相关的东西

接下来调用了br_fdb_init()函数,fdb很重要(后面会专门分析),指的就是Bridge的桥接表,这里对其做了初始化

/* Slab cache from which all forwarding-database entries are allocated. */
static struct kmem_cache *br_fdb_cache __read_mostly;
/* Create the fdb entry cache and seed fdb_salt with random bytes
 * (the salt is mixed into the fdb hash to resist hash collisions). */
int __init br_fdb_init(void)
{
	br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
					 sizeof(struct net_bridge_fdb_entry),
					 0,
					 SLAB_HWCACHE_ALIGN, NULL);
	if (!br_fdb_cache)
		return -ENOMEM;

	get_random_bytes(&fdb_salt, sizeof(fdb_salt));
	return 0;
}

调用kmem_cache_create()为br_fdb_cache这个指针分配了一段缓存空间,猜测是为了增强转发性能

接着调用了register_pernet_subsys(),这个没研究过

调用br_netfilter_init()初始化netfilter,在数据包流经的几个点挂接了钩子函数,应用层有一个ebtables工具,与Bridge里的netfilter相结合,来做一些mac层的转发规则,非常类似于iptables的做法

/* Register the bridge netfilter hooks (br_nf_ops) and, when sysctl is
 * enabled, the brnf sysctl table; on sysctl failure the hooks are
 * unregistered again before returning -ENOMEM. */
int __init br_netfilter_init(void)
{
	int ret;

	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
	if (ret < 0)
		return ret;
#ifdef CONFIG_SYSCTL
	brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
	if (brnf_sysctl_header == NULL) {
		printk(KERN_WARNING
		       "br_netfilter: can't register to sysctl.\n");
		nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
		return -ENOMEM;
	}
#endif
	printk(KERN_NOTICE "Bridge firewalling registered\n");
	return 0;
}

然后调用了register_netdevice_notifier(),这个也没研究过

调用br_netlink_init(),注册了netlink消息,应该是提供用户空间程序处理钩子函数的结构,比如ebtables

最后调用了brioctl_set(br_ioctl_deviceless_stub),这个函数也很重要,brctl的ioctl实际调用的就是这里

/* Install the bridge ioctl handler; br_ioctl_hook is later invoked by
 * sock_ioctl() for SIOCBRADDBR/SIOCBRDELBR. Protected by br_ioctl_mutex. */
void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
{
	mutex_lock(&br_ioctl_mutex);
	br_ioctl_hook = hook;
	mutex_unlock(&br_ioctl_mutex);
}

这里就是用一个全局变量函数指针br_ioctl_hook指向函数br_ioctl_deviceless_stub(),ioctl的调用与这个hook的关系在net/socket.c的sock_ioctl()函数中

static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
    ......
	case SIOCBRADDBR:
	case SIOCBRDELBR:
		err = -ENOPKG;
		if (!br_ioctl_hook)
			request_module("bridge");

		mutex_lock(&br_ioctl_mutex);
		if (br_ioctl_hook)
			err = br_ioctl_hook(net, cmd, argp);
		mutex_unlock(&br_ioctl_mutex);
		break;
    ......
}

也就是说,brctl的ioctl参数为SIOCBRADDBR和SIOCBRDELBR时,实际上调用的是br_ioctl_deviceless_stub()这个函数

总结一下,Bridge在内核中是以模块的方式加载的,在其初始化函数中,主要做了这样几件事

  1. STP相关东西的初始化
  2. 为桥接表分配了一块缓存空间
  3. 注册了pernet(每网络命名空间)子系统
  4. 注册钩子函数
  5. 注册了notifier什么的
  6. netlink初始化
  7. 提供ioctl接口br_ioctl_deviceless_stub()

添加/删除网桥设备

由刚才的分析可以知道,要搞清楚内核是如何添加或删除一个网桥设备的,就要分析br_ioctl_deviceless_stub()这个函数

/* ioctl entry point for bridge operations that are not tied to an
 * existing bridge device. For SIOCBRADDBR/SIOCBRDELBR, uarg is the
 * user-space bridge name; it is copied in, NUL-terminated, and passed
 * to br_add_bridge()/br_del_bridge(). Requires CAP_NET_ADMIN. */
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
	switch (cmd) {
	case SIOCGIFBR:
	case SIOCSIFBR:
		return old_deviceless(net, uarg);

	case SIOCBRADDBR:
	case SIOCBRDELBR:
	{
		char buf[IFNAMSIZ];

		if (!capable(CAP_NET_ADMIN))
			return -EPERM;

		if (copy_from_user(buf, uarg, IFNAMSIZ))
			return -EFAULT;

		/* force NUL termination of the user-supplied name */
		buf[IFNAMSIZ-1] = 0;
		if (cmd == SIOCBRADDBR)
			return br_add_bridge(net, buf);

		return br_del_bridge(net, buf);
	}
	}
	return -EOPNOTSUPP;
}

这个函数的最后一个参数uarg就是用户空间传下来的值,即要添加的网桥设备名称,通过copy_from_user()拷贝到了buf中。如果是添加操作,会调用br_add_bridge(),如果是删除操作,会调用br_del_bridge()

添加网桥设备

添加网桥设备的函数br_add_bridge(),代码如下

/* Create a bridge device named @name in namespace @net: allocate the
 * net_device (with net_bridge private data) via new_bridge_dev(),
 * resolve a '%'-style name template if present, register the netdevice
 * and add its sysfs entries. On registration failure the device is
 * freed; on sysfs failure it is unregistered again. */
int br_add_bridge(struct net *net, const char *name)
{
	struct net_device *dev;
	int ret;

	dev = new_bridge_dev(net, name);
	if (!dev)
		return -ENOMEM;

	rtnl_lock();
	/* name may be a template like "br%d" — pick a free unit number */
	if (strchr(dev->name, '%')) {
		ret = dev_alloc_name(dev, dev->name);
		if (ret < 0)
			goto out_free;
	}

	SET_NETDEV_DEVTYPE(dev, &br_type);

	ret = register_netdevice(dev);
	if (ret)
		goto out_free;

	ret = br_sysfs_addbr(dev);
	if (ret)
		unregister_netdevice(dev);
 out:
	rtnl_unlock();
	return ret;

out_free:
	free_netdev(dev);
	goto out;
}

函数new_bridge_dev()返回时,已经创建好了一个网络设备,可以发现,Bridge设备使用的数据结构也是net_device

调用SET_NETDEV_DEVTYPE()这个宏,将net_device的类型设置为&br_type,看看是怎么做的

#define SET_NETDEV_DEVTYPE(net, devtype)	((net)->dev.type = (devtype))

其实就是用&br_type对net_device的type赋值,那&br_type又是什么呢

static struct device_type br_type = {
	.name	= "bridge",
};

其实就是把net_device的type当作一个指针来用了,记录br_type这个全局结构的地址

然后调用register_netdevice()向内核注册创建好的网桥设备,再调用br_sysfs_addbr()将网桥添加到sys文件系统,这里没怎么研究

所以添加操作的核心封装在new_bridge_dev()这个函数中,看看它是怎么实现的

/* Allocate and initialize a bridge net_device. The net_bridge structure
 * lives in the netdevice's private area (netdev_priv); br_dev_setup is
 * passed to alloc_netdev as the setup callback. Initializes locks, the
 * empty port list, STP identifiers/timers and multicast state, and sets
 * the default STP timing constants (expressed in jiffies via HZ). */
static struct net_device *new_bridge_dev(struct net *net, const char *name)
{
	struct net_bridge *br;
	struct net_device *dev;

	dev = alloc_netdev(sizeof(struct net_bridge), name,
			   br_dev_setup);

	if (!dev)
		return NULL;
	dev_net_set(dev, net);

	br = netdev_priv(dev);
	br->dev = dev;

	/* per-CPU packet counters for the bridge device itself */
	br->stats = alloc_percpu(struct br_cpu_netstats);
	if (!br->stats) {
		free_netdev(dev);
		return NULL;
	}

	spin_lock_init(&br->lock);
	INIT_LIST_HEAD(&br->port_list);
	spin_lock_init(&br->hash_lock);

	/* default STP bridge priority 0x8000 */
	br->bridge_id.prio[0] = 0x80;
	br->bridge_id.prio[1] = 0x00;

	memcpy(br->group_addr, br_group_address, ETH_ALEN);

	br->feature_mask = dev->features;
	br->stp_enabled = BR_NO_STP;
	br->designated_root = br->bridge_id;
	br->root_path_cost = 0;
	br->root_port = 0;
	br->bridge_max_age = br->max_age = 20 * HZ;
	br->bridge_hello_time = br->hello_time = 2 * HZ;
	br->bridge_forward_delay = br->forward_delay = 15 * HZ;
	br->topology_change = 0;
	br->topology_change_detected = 0;
	br->ageing_time = 300 * HZ;	/* fdb entries expire after 300s */

	br_netfilter_rtable_init(br);

	br_stp_timer_init(br);
	br_multicast_init(br);

	return dev;
}

首先调用了alloc_netdev()函数,为net_device数据结构分配了内存空间,待会儿要进去分析一下这个内存空间结构,然后调用了netdev_priv()从net_device中获得Bridge私有数据指针,并将Bridge的dev指向net_device,其关系简单来说如下图

然后初始化了2个spin锁、一个list,然后就是对很多net_bridge里的参数进行设置

重点看一下alloc_netdev()函数,传入的参数是一个net_bridge结构的大小,一个要创建的网桥设备名称,一个br_dev_setup()函数

#define alloc_netdev(sizeof_priv, name, setup) \
	alloc_netdev_mq(sizeof_priv, name, setup, 1)

这个宏调用alloc_netdev_mq()函数,追加了一个参数1

/**
 *	alloc_netdev_mq - allocate network device
 *	@sizeof_priv:	size of private data to allocate space for
 *	@name:		device name format string
 *	@setup:		callback to initialize device
 *	@queue_count:	the number of subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device at the end of the netdevice.
 */
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *), unsigned int queue_count)
{
	struct netdev_queue *tx;
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;
#ifdef CONFIG_RPS
	struct netdev_rx_queue *rx;
	int i;
#endif

	BUG_ON(strlen(name) >= sizeof(dev->name));

	/* one allocation holds net_device followed by the aligned priv area */
	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
		return NULL;
	}

	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
	if (!tx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "tx qdiscs.\n");
		goto free_p;
	}

#ifdef CONFIG_RPS
	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!rx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "rx queues.\n");
		goto free_tx;
	}

	atomic_set(&rx->count, queue_count);

	/*
	 * Set a pointer to first element in the array which holds the
	 * reference count.
	 */
	for (i = 0; i < queue_count; i++)
		rx[i].first = rx;
#endif

	/* dev points at the aligned position inside the raw allocation p;
	 * padded records the offset so free_netdev can recover p */
	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	if (dev_addr_init(dev))
		goto free_rx;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->_tx = tx;
	dev->num_tx_queues = queue_count;
	dev->real_num_tx_queues = queue_count;

#ifdef CONFIG_RPS
	dev->_rx = rx;
	dev->num_rx_queues = queue_count;
#endif

	dev->gso_max_size = GSO_MAX_SIZE;

	netdev_init_queues(dev);

	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
	dev->ethtool_ntuple_list.count = 0;
	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	/* driver-specific initialization (br_dev_setup for bridges) */
	setup(dev);
	strcpy(dev->name, name);
	return dev;

free_rx:
#ifdef CONFIG_RPS
	kfree(rx);
free_tx:
#endif
	kfree(tx);
free_p:
	kfree(p);
	return NULL;
}

由函数上面的注释部分,大致可以了解到,该函数就是用来为网络设备分配内存空间的,可附加私有数据,参数的含义为

参数 含义
sizeof_priv 要分配的私有数据的大小
name 字符串形式的设备名称
setup 用于初始化设备的函数
queue_count 要分配的队列大小

这里的queue_count固定为1

函数首先计算了要分配的net_device空间大小,其大小是以32byte对齐的net_device结构大小与net_bridge结构大小之和,然后通过kzalloc()和kcalloc()函数为网络设备和队列分配了空间,并将队列挂接到了net_device上,然后初始化了一些list。最终分配的内存空间结构如图

最后会调用传入的初始化函数br_dev_setup()来进行一些初始化工作

/* Setup callback for bridge netdevices (invoked from alloc_netdev):
 * assigns a random MAC, applies generic Ethernet defaults, then hooks
 * up the bridge-specific netdev/ethtool ops and marks the device as a
 * bridge via IFF_EBRIDGE. */
void br_dev_setup(struct net_device *dev)
{
	random_ether_addr(dev->dev_addr);
	ether_setup(dev);

	dev->netdev_ops = &br_netdev_ops;
	dev->destructor = br_dev_free;
	SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
	dev->tx_queue_len = 0;	/* no queueing on the bridge device itself */
	dev->priv_flags = IFF_EBRIDGE;

	dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
			NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
			NETIF_F_NETNS_LOCAL | NETIF_F_GSO;
}

首先调用random_ether_addr()以随机数方式设置了设备mac地址,然后调用了ether_setup()函数,

/* Fill in the generic Ethernet defaults for a net_device: header ops,
 * ARP hardware type, header/MTU/address lengths, queue length, flags,
 * and the all-ones broadcast address. */
void ether_setup(struct net_device *dev)
{
	dev->header_ops		= &eth_header_ops;
	dev->type		= ARPHRD_ETHER;
	dev->hard_header_len 	= ETH_HLEN;
	dev->mtu		= ETH_DATA_LEN;
	dev->addr_len		= ETH_ALEN;
	dev->tx_queue_len	= 1000;	/* Ethernet wants good queues */
	dev->flags		= IFF_BROADCAST|IFF_MULTICAST;

	memset(dev->broadcast, 0xFF, ETH_ALEN);

}

ether_setup()这个函数对net_device结构里的一些数据成员进行了设置,如MTU、TX队列大小等

然后回到br_dev_setup()函数,挂接了netdev_ops和ethtool_ops

至此,通过brctl addbr来创建一个网桥设备的整个过程已经完成了,但是这些代码里似乎看不到和数据处理、学习、转发相关的任何内容,所以,如果只是创建了一个网桥设备,而没有绑定任何的网络接口,这个网桥实际上什么也没干

删除网桥设备

删除操作相比添加操作,简单多了,代码如下

/* Delete the bridge named @name: look the device up by name, refuse if
 * it is not a bridge (IFF_EBRIDGE unset) or still administratively up
 * (IFF_UP), otherwise tear it down via del_br(). Runs under rtnl_lock. */
int br_del_bridge(struct net *net, const char *name)
{
	struct net_device *dev;
	int ret = 0;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);
	if (dev == NULL)
		ret =  -ENXIO; 	/* Could not find device */

	else if (!(dev->priv_flags & IFF_EBRIDGE)) {
		/* Attempt to delete non bridge device! */
		ret = -EPERM;
	}

	else if (dev->flags & IFF_UP) {
		/* Not shutdown yet. */
		ret = -EBUSY;
	}

	else
		del_br(netdev_priv(dev), NULL);

	rtnl_unlock();
	return ret;
}

调用__dev_get_by_name(),通过用户空间传入的name来找到对应的net_device结构,

/* Find a net_device in namespace @net by name: walk the per-name hash
 * bucket (dev_name_hash) and compare names. Caller must hold RTNL or
 * otherwise guarantee the list is stable. Returns NULL if not found. */
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}

其实现就是从一个list表“name_hlist”中去找到name对应的结构,返回。添加操作中没有和这个name_hlist相关的内容,全局搜索了一下,该操作是在register_netdevice()中做的操作

然后会调用del_br()来完成操作,其内部主要调用了br_sysfs_delbr()删除sys文件系统中的内容,调用unregister_netdevice_queue()注销设备和队列

/* Tear down a bridge: detach every bound port (del_nbp), stop the
 * garbage-collection timer, remove the sysfs entries, and unregister
 * the bridge netdevice (queued on @head when batching deletions). */
static void del_br(struct net_bridge *br, struct list_head *head)
{
	struct net_bridge_port *p, *n;

	list_for_each_entry_safe(p, n, &br->port_list, list) {
		del_nbp(p);
	}

	del_timer_sync(&br->gc_timer);

	br_sysfs_delbr(br->dev);
	unregister_netdevice_queue(br->dev, head);
}

为网桥添加/删除网络接口

通过前面的网桥设备添加操作,添加一个网桥设备,在其net_device结构会挂接br_netdev_ops,

/* Netdev operations for bridge devices; notably ndo_do_ioctl points at
 * br_dev_ioctl, which handles SIOCBRADDIF/SIOCBRDELIF. */
static const struct net_device_ops br_netdev_ops = {
	.ndo_open		 = br_dev_open,
	.ndo_stop		 = br_dev_stop,
	.ndo_start_xmit		 = br_dev_xmit,
	.ndo_get_stats64	 = br_get_stats64,
	.ndo_set_mac_address	 = br_set_mac_address,
	.ndo_set_multicast_list	 = br_dev_set_multicast_list,
	.ndo_change_mtu		 = br_change_mtu,
	.ndo_do_ioctl		 = br_dev_ioctl,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_netpoll_setup	 = br_netpoll_setup,
	.ndo_netpoll_cleanup	 = br_netpoll_cleanup,
	.ndo_poll_controller	 = br_poll_controller,
#endif
};

其中有一个br_dev_ioctl,点击进去,可以看到

/* Per-bridge-device ioctl handler (ndo_do_ioctl). Dispatches legacy
 * private ioctls to old_dev_ioctl and SIOCBRADDIF/SIOCBRDELIF to
 * add_del_if with the interface index from rq->ifr_ifindex. */
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
	struct net_bridge *br = netdev_priv(dev);

	switch(cmd) {
	case SIOCDEVPRIVATE:
		return old_dev_ioctl(dev, rq, cmd);

	case SIOCBRADDIF:
	case SIOCBRDELIF:
		return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);

	}

	br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
	return -EOPNOTSUPP;
}

这里面刚好就有SIOCBRADDIF和SIOCBRDELIF这两个ioctl参数,从这里就可以推断出,brctl addif br0 if_name,最终就会查找到这里来进行操作。但是这里首先要搞清楚一个问题,系统里可以提前创建多个网桥设备,brctl addif br0 eth0这条命令怎么就能把eth0刚好添加到br0上,而不是添加到br1、br2上呢?

在最开始分析brctl命令时,分析过brctl addif这个命令的参数内容,它是向内核传递了一个ifr,ifr.ifr_name是网桥设备的名称,ifr.ifr_ifindex是要绑定的网络接口的索引,这一点很重要

全局搜索一下SIOCBRADDIF都在什么地方出现过

---- SIOCBRADDIF Matches (6 in 4 files) ----
Br_ioctl.c (linux-2.6.36.x\net\bridge):	case SIOCBRADDIF:
Br_ioctl.c (linux-2.6.36.x\net\bridge):		return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
Dev.c (linux-2.6.36.x\net\core):		    cmd == SIOCBRADDIF ||
Dev.c (linux-2.6.36.x\net\core):	case SIOCBRADDIF:
Socket.c (linux-2.6.36.x\net):	case SIOCBRADDIF:
Sockios.h (linux-2.6.36.x\include\linux):#define SIOCBRADDIF	0x89a2		/* add interface to bridge      */

大概在socket.c、dev.c和br_ioctl.c里面出现过。这里就需要对linux的系统调用大概过程有一个认识,ioctl这个系统调用是怎么走的呢,我也不是很清楚,但是一定是从最抽象的层次一层一层到最具体的层次

                    syscall  
**********************************************
    net dev         char dev        block dev
**********************************************
  br_ioctl...      gpio_ioctl...   flash_ioctl...

就像我们在应用层使用read()、write()这些系统调用,内核怎么知道我们到底读的是什么东西呢,读的是文件系统里的一个文件,还是设备驱动,其实是通过名字和设备号来区分的,无论是字符设备、块设备、网络设备,都会有它自己的标识,当使用系统调用时,传入的参数就指定了访问的到底是哪种设备

通过socket()返回的fd来做ioctl,当然访问的是网络设备,这一点没问题,然后在网络设备里又有好多种,有网桥、网络接口、虚接口等,内核又怎么知道访问的是哪一种具体的设备呢?通过传入的ioctl参数和ifr来区分。大致可以推断出这个ioctl的执行流程是

socket.c -> dev.c -> br_ioctl.c

先来看socket.c里的ioctl部分

	case SIOCSIFTXQLEN:
	case SIOCBRADDIF:
	case SIOCBRDELIF:
	case SIOCSIFNAME:
	case SIOCGMIIPHY:
	case SIOCGMIIREG:
	case SIOCSMIIREG:
		return dev_ifsioc(net, sock, cmd, argp);

如果有以上这些cmd,就会调用dev_ifsioc()函数,进去看看

/* 32-bit-compat path: copy the user's compat_ifreq into a full-size
 * ifreq in (compat-allocated) user space, run the ioctl through
 * sock_do_ioctl, and for the "get" commands copy the result back into
 * the caller's 32-bit structure. */
static int dev_ifsioc(struct net *net, struct socket *sock,
			 unsigned int cmd, struct compat_ifreq __user *uifr32)
{
	struct ifreq __user *uifr;
	int err;

	uifr = compat_alloc_user_space(sizeof(*uifr));
	if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
		return -EFAULT;

	err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);

	if (!err) {
		switch (cmd) {
		case SIOCGIFFLAGS:
		case SIOCGIFMETRIC:
		case SIOCGIFMTU:
		case SIOCGIFMEM:
		case SIOCGIFHWADDR:
		case SIOCGIFINDEX:
		case SIOCGIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCGIFDSTADDR:
		case SIOCGIFNETMASK:
		case SIOCGIFPFLAGS:
		case SIOCGIFTXQLEN:
		case SIOCGMIIPHY:
		case SIOCGMIIREG:
			/* copy the (possibly updated) ifreq back to 32-bit caller */
			if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
				err = -EFAULT;
			break;
		}
	}
	return err;
}

这个函数调用copy_in_user()将用户空间传进来的ifr信息接收到一块内存上,用uifr这个指针记录了位置,然后调用sock_do_ioctl(),传入了uifr

/* Run an ioctl through the socket's protocol ops (e.g. inet_ioctl);
 * if the protocol layer does not recognize the command it returns
 * -ENOIOCTLCMD and the request falls through to the generic network
 * device layer via dev_ioctl(). */
static long sock_do_ioctl(struct net *net, struct socket *sock,
				 unsigned int cmd, unsigned long arg)
{
	int err;
	void __user *argp = (void __user *)arg;

	err = sock->ops->ioctl(sock, cmd, arg);

	/*
	 * If this ioctl is unknown try to hand it down
	 * to the NIC driver.
	 */
	if (err == -ENOIOCTLCMD)
		err = dev_ioctl(net, cmd, argp);

	return err;
}

这里调用了一个sock->ops->ioctl()来处理,如果返回值为ENOIOCTLCMD,就会调用dev_ioctl()。那怎么知道这个sock->ops->ioctl()是什么函数?它的调用结果是什么呢?这就得明白应用层调用socket()时内核都会干什么,内核是如何创建并初始化这个套接字的,才能准确定位这个sock->ops->ioctl()是哪个函数,这里不展开分析了

简单来说,内核会根据socket(domain, type, proto)传入的这些参数,来判断到底挂接哪种ops,因为brctl命令创建的是以“AF_INET、SOCK_STREAM、0”为参数的套接字,所以最后这个ops指向一个inet_stream_ops,

/* proto_ops for AF_INET/SOCK_STREAM sockets; .ioctl = inet_ioctl is
 * the first stop for ioctls issued on a TCP socket fd (as brctl does). */
const struct proto_ops inet_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	.bind		   = inet_bind,
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = inet_accept,
	.getname	   = inet_getname,
	.poll		   = tcp_poll,
	.ioctl		   = inet_ioctl,
	.listen		   = inet_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
	.mmap		   = sock_no_mmap,
	.sendpage	   = inet_sendpage,
	.splice_read	   = tcp_splice_read,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);

可以看到,这个inet_stream_ops下挂的ioctl为inet_ioctl,看看它干了什么

/* AF_INET socket ioctl dispatcher: routes routing/ARP/interface-address
 * commands to their handlers, delegates anything else to the transport
 * protocol's ioctl, and returns -ENOIOCTLCMD for unknown commands so
 * sock_do_ioctl can hand them down to dev_ioctl (this is how the
 * SIOCBRADDIF/SIOCBRDELIF bridge ioctls reach the device layer). */
int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	int err = 0;
	struct net *net = sock_net(sk);

	switch (cmd) {
	case SIOCGSTAMP:
		err = sock_get_timestamp(sk, (struct timeval __user *)arg);
		break;
	case SIOCGSTAMPNS:
		err = sock_get_timestampns(sk, (struct timespec __user *)arg);
		break;
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCRTMSG:
		err = ip_rt_ioctl(net, cmd, (void __user *)arg);
		break;
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
		err = arp_ioctl(net, cmd, (void __user *)arg);
		break;
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFPFLAGS:
	case SIOCGIFPFLAGS:
	case SIOCSIFFLAGS:
		err = devinet_ioctl(net, cmd, (void __user *)arg);
		break;
	default:
		if (sk->sk_prot->ioctl)
			err = sk->sk_prot->ioctl(sk, cmd, arg);
		else
			err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
EXPORT_SYMBOL(inet_ioctl);

其实就是找cmd,找不到的话,返回ENOIOCTLCMD,这样就和上面的ENOIOCTLCMD衔接上了

好,sock_do_ioctl()函数会调用dev_ioctl(),

case SIOCBONDSETHWADDR:
	case SIOCBONDCHANGEACTIVE:
	case SIOCBRADDIF:
	case SIOCBRDELIF:
	case SIOCSHWTSTAMP:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		/* fall through */
	case SIOCBONDSLAVEINFOQUERY:
	case SIOCBONDINFOQUERY:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		return ret;

发现从这里,已经进入了dev.c,就是已经判断进入了网络设备的ioctl层。这里会调用dev_load()和dev_ifsioc(),

/* If no device named @name exists yet, try to load a module of that
 * name (requires CAP_NET_ADMIN); lookup is done under rcu_read_lock. */
void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}

dev_load()函数就是调用dev_get_by_name_rcu()根据name来找到设备,这里的name就是brctl命令的网桥设备名称,看看dev_get_by_name_rcu()是怎么找的

/* RCU variant of device lookup by name: walk the per-name hash bucket
 * under rcu_read_lock (caller's responsibility) and return the matching
 * net_device, or NULL if not found. */
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}

就是从name_hlist这个list中去查找的,前面分析网桥添加的时候提到过,在注册网桥到网络设备时会操作这个name_hlist,这里根据网桥名称可以找到设备

然后会调用dev_ifsioc(),

static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
	int err;
	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
	const struct net_device_ops *ops;

	if (!dev)
		return -ENODEV;

	ops = dev->netdev_ops;

	switch (cmd) {
	......
	/*
	 *	Unknown or private ioctl
	 */
	default:
		if ((cmd >= SIOCDEVPRIVATE &&
		    cmd <= SIOCDEVPRIVATE + 15) ||
		    cmd == SIOCBONDENSLAVE ||
		    cmd == SIOCBONDRELEASE ||
		    cmd == SIOCBONDSETHWADDR ||
		    cmd == SIOCBONDSLAVEINFOQUERY ||
		    cmd == SIOCBONDINFOQUERY ||
		    cmd == SIOCBONDCHANGEACTIVE ||
		    cmd == SIOCGMIIPHY ||
		    cmd == SIOCGMIIREG ||
		    cmd == SIOCSMIIREG ||
		    cmd == SIOCBRADDIF ||
		    cmd == SIOCBRDELIF ||
		    cmd == SIOCSHWTSTAMP ||
		    cmd == SIOCWANDEV) {
			err = -EOPNOTSUPP;
			if (ops->ndo_do_ioctl) {
				if (netif_device_present(dev))
					err = ops->ndo_do_ioctl(dev, ifr, cmd);
				else
					err = -ENODEV;
			}
		} else
			err = -EINVAL;

	}
	return err;
}

首先调用了__dev_get_by_name()根据网桥名称获得网络设备结构体net_device,然后在switch语句里,判断net_device->ops->ndo_do_ioctl是否存在,因为添加过网桥设备的net_device结构体上挂接了br_netdev_ops,所以存在,然后就会执行ndo_do_ioctl,也就是br_netdev_ops->ndo_do_ioctl,也就是br_dev_ioctl()函数。

总结一下,用户空间通过brctl addif br0 eth0时,调用了系统调用ioctl()函数,这个函数根据传入的ioctl命令和ifr参数,从通用socket层开始查找,经过net dev层的ioctl,最终根据传入的网桥名称“br0”找到这个设备结构体net_device,该结构的net_device_ops挂接了br_netdev_ops,所以最终调用的是br_dev_ioctl()这个函数

br_dev_ioctl

进入该函数,分析一下

/* Bridge device ioctl handler (same function as quoted earlier):
 * SIOCBRADDIF/SIOCBRDELIF both go to add_del_if, with the boolean
 * third argument selecting add vs delete. */
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
	struct net_bridge *br = netdev_priv(dev);

	switch(cmd) {
	case SIOCDEVPRIVATE:
		return old_dev_ioctl(dev, rq, cmd);

	case SIOCBRADDIF:
	case SIOCBRDELIF:
		return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);

	}

	br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
	return -EOPNOTSUPP;
}

首先调用了netdev_priv()从net_device上获取私有设备结构体net_bridge,然后添加和删除都会调用add_del_if(),传入了ifr.ifr_ifindex,也就是要添加的网络接口的索引号,以此来确定要绑定哪一个网络设备

/* Resolve @ifindex to a net_device in the bridge's namespace and bind
 * (isadd != 0) or unbind it via br_add_if/br_del_if. Requires
 * CAP_NET_ADMIN; -EINVAL if the index matches no device. */
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
	struct net_device *dev;
	int ret;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	dev = __dev_get_by_index(dev_net(br->dev), ifindex);
	if (dev == NULL)
		return -EINVAL;

	if (isadd)
		ret = br_add_if(br, dev);
	else
		ret = br_del_if(br, dev);

	return ret;
}

调用了__dev_get_by_index()来根据索引获得网络接口设备结构体,然后如果是添加操作,调用br_add_if(),如果是删除操作,调用br_del_if()

先来看看br_add_if,

/* Bind network interface @dev to bridge @br as a new port: validate the
 * device (Ethernet only, not a bridge, not already bridged), allocate a
 * net_bridge_port (new_nbp), put the interface into promiscuous mode,
 * add sysfs/kobject entries, seed the fdb with the interface's own MAC,
 * and — crucially — register br_handle_frame as the interface's
 * rx_handler so received frames are diverted into the bridge. Errors
 * unwind in reverse order through the err*/put_back labels. */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
	struct net_bridge_port *p;
	int err = 0;

	/* Don't allow bridging non-ethernet like devices */
	if ((dev->flags & IFF_LOOPBACK) ||
	    dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN)
		return -EINVAL;

	/* No bridging of bridges */
	if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
		return -ELOOP;

	/* Device is already being bridged */
	if (br_port_exists(dev))
		return -EBUSY;

	/* No bridging devices that dislike that (e.g. wireless) */
	if (dev->priv_flags & IFF_DONT_BRIDGE)
		return -EOPNOTSUPP;

	p = new_nbp(br, dev);
	if (IS_ERR(p))
		return PTR_ERR(p);

	/* accept all frames, not just those addressed to this interface */
	err = dev_set_promiscuity(dev, 1);
	if (err)
		goto put_back;

	err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
				   SYSFS_BRIDGE_PORT_ATTR);
	if (err)
		goto err0;

	/* record the port's own MAC as a local fdb entry */
	err = br_fdb_insert(br, p, dev->dev_addr);
	if (err)
		goto err1;

	err = br_sysfs_addif(p);
	if (err)
		goto err2;

	if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
		goto err3;

	/* divert this interface's received frames into the bridge */
	err = netdev_rx_handler_register(dev, br_handle_frame, p);
	if (err)
		goto err3;

	dev->priv_flags |= IFF_BRIDGE_PORT;

	dev_disable_lro(dev);

	list_add_rcu(&p->list, &br->port_list);

	spin_lock_bh(&br->lock);
	br_stp_recalculate_bridge_id(br);
	br_features_recompute(br);

	if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) &&
	    (br->dev->flags & IFF_UP))
		br_stp_enable_port(p);
	spin_unlock_bh(&br->lock);

	br_ifinfo_notify(RTM_NEWLINK, p);

	/* bridge MTU follows the smallest member interface's MTU */
	dev_set_mtu(br->dev, br_min_mtu(br));

	kobject_uevent(&p->kobj, KOBJ_ADD);

	return 0;
err3:
	sysfs_remove_link(br->ifobj, p->dev->name);
err2:
	br_fdb_delete_by_port(br, p, 1);
err1:
	kobject_put(&p->kobj);
	p = NULL; /* kobject_put frees */
err0:
	dev_set_promiscuity(dev, -1);
put_back:
	dev_put(dev);
	kfree(p);
	return err;
}

首先判断了一下要绑定的网络接口设备是否符合要求,注释说non-ethernet类型都不能绑定

然后判断了网桥是否已经开启,要绑定的接口是否已经绑定,看接口是否不允许绑定,然后调用new_nbp()这个函数,传入网桥设备私有结构体和网络接口设备结构体,返回一个net_bridge_port类型的指针

/* Allocate and initialize a net_bridge_port linking bridge @br and
 * interface @dev: pick a free port number, take a reference on the
 * device, set default STP path cost/priority, and initialize port
 * timers and multicast state. Port starts in BR_STATE_DISABLED.
 * Returns an ERR_PTR on failure. */
static struct net_bridge_port *new_nbp(struct net_bridge *br,
				       struct net_device *dev)
{
	int index;
	struct net_bridge_port *p;

	index = find_portno(br);
	if (index < 0)
		return ERR_PTR(index);

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (p == NULL)
		return ERR_PTR(-ENOMEM);

	p->br = br;
	dev_hold(dev);	/* hold a reference for the lifetime of the port */
	p->dev = dev;
	p->path_cost = port_cost(dev);
	p->priority = 0x8000 >> BR_PORT_BITS;
	p->port_no = index;
	p->flags = 0;
	br_init_port(p);
	p->state = BR_STATE_DISABLED;
	br_stp_port_timer_init(p);
	br_multicast_add_port(p);

	return p;
}

调用find_portno()从网桥设备里返回一个可以使用的索引号,然后为这个net_bridge_port结构分配空间

调用br_fdb_insert()将这个net_bridge_port插入到桥接表中,调用br_netpoll_enable()使能netpoll,

调用netdev_rx_handler_register()将br_handle_frame()这个函数注册到了要绑定的网络接口结构体的rx_handler上,这一点很重要,为什么向桥里绑定一个网络接口,桥就会收到这个网络结构的数据包,原因就在这里,我们来看看是如何添加的

/* Attach a receive handler (and its private data) to @dev; the handler
 * is invoked from __netif_receive_skb for every frame the interface
 * receives. Only one handler per device: -EBUSY if already set.
 * Must be called under RTNL (ASSERT_RTNL). */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/* publish data before the handler so readers never see a handler
	 * without its data */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}

函数的第一个参数是绑定的网络接口的net_device,第二个参数是一个函数指针,第三个参数是这个函数的参数,注释的意思大概是说这个注册函数会为网络接口注册一个接收句柄,这个句柄将会在__netif_receive_skb函数中被调用

__netif_receive_skb

__netif_receive_skb这个函数不用多说,就是网卡驱动向内核协议栈传递skb的接口,skb在这个接口中分发到不同的地方,具体看一看

	/* Handle special case of bridge or macvlan */
	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		skb = rx_handler(skb);
		if (!skb)
			goto out;
	}

上面是__netif_receive_skb中一段和rx_handler相关的代码,在此之前,内核做了netpoll接收,遍历了一个ptype_all链表,在此之后,进行了vlan接收和ptype_base链表遍历

当绑定到桥的接口收到数据包后,在这个rx_handler上,就会调用br_handle_frame()函数进入桥的处理步骤

bridge的数据处理

/* rx_handler installed on every bridged interface. Validates the frame,
 * fetches the port (stored as rx_handler_data at bind time), special-
 * cases link-local 01:80:c2:00:00:0x destinations (pause frames dropped;
 * with STP off and dest 01:80:c2:00:00:00 the frame is forwarded like
 * normal traffic; otherwise it goes up via NF_BR_LOCAL_IN), and for
 * ports in FORWARDING/LEARNING state hands the frame to the
 * NF_BR_PRE_ROUTING hook with br_handle_frame_finish as the okfn.
 * Returns the skb to continue normal stack processing, or NULL when
 * the bridge consumed (or dropped) it. */
struct sk_buff *br_handle_frame(struct sk_buff *skb)
{
	struct net_bridge_port *p;
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	int (*rhook)(struct sk_buff *skb);

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
		goto drop;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		return NULL;

	p = br_port_get_rcu(skb->dev);

	if (unlikely(is_link_local(dest))) {
		/* Pause frames shouldn't be passed up by driver anyway */
		if (skb->protocol == htons(ETH_P_PAUSE))
			goto drop;

		/* If STP is turned off, then forward */
		if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
			goto forward;

		if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
			    NULL, br_handle_local_finish))
			return NULL;	/* frame consumed by filter */
		else
			return skb;	/* continue processing */
	}

forward:
	switch (p->state) {
	case BR_STATE_FORWARDING:
		rhook = rcu_dereference(br_should_route_hook);
		if (rhook != NULL) {
			if (rhook(skb))
				return skb;
			dest = eth_hdr(skb)->h_dest;
		}
		/* fall through */
	case BR_STATE_LEARNING:
		/* frames addressed to the bridge's own MAC are for this host */
		if (!compare_ether_addr(p->br->dev->dev_addr, dest))
			skb->pkt_type = PACKET_HOST;

		NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
			br_handle_frame_finish);
		break;
	default:
drop:
		kfree_skb(skb);
	}
	return NULL;
}

首先做了一些前提判断,然后调用br_port_get_rcu()函数获得net_bridge_port,其实就是从dev->rx_handler_data返回之前注册过的数据,也就是在往桥里绑定接口时注册的桥的net_bridge_port结构

调用is_link_local()判断数据包的目的mac是否为链路本地(link-local)的保留组播地址,看看代码

/* True if @dest falls in the 802.1D reserved link-local multicast range
 * 01:80:c2:00:00:00 – 01:80:c2:00:00:0f: compares the first 4 bytes
 * exactly and the last 2 bytes under mask 0xfff0, branch-free. */
static inline int is_link_local(const unsigned char *dest)
{
	__be16 *a = (__be16 *)dest;
	static const __be16 *b = (const __be16 *)br_group_address;
	static const __be16 m = cpu_to_be16(0xfff0);

	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
}

这里用skb的目的mac地址和预先设置的一个全局变量br_group_address比较,br_group_address的值为

const u8 br_group_address[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };

也就是说,当skb的目的mac地址落在01:80:c2:00:00:00~01:80:c2:00:00:0f这个保留组播地址段(例如STP的BPDU帧)时,会调用一个钩子函数NF_HOOK,钩子点为NF_BR_LOCAL_IN,回调函数为br_handle_local_finish

如果不是发往本机的,也会调用一个钩子函数,钩子点为NF_BR_PRE_ROUTING,回调函数为br_handle_frame_finish

这里的钩子函数NF_HOOK()道理和netfilter类似,就是在内核数据流经的固定一些点上留好位置,其余模块可以利用这些位置,添加一些额外的函数,当数据包走到这些位置时,就会传递给额外的函数去使用

NF_BR_LOCAL_IN的回调函数如下

/* okfn for the NF_BR_LOCAL_IN hook: learn the frame's source MAC into
 * the fdb for the receiving port, then return 0 so processing of the
 * link-local frame continues up the stack. */
static int br_handle_local_finish(struct sk_buff *skb)
{
	struct net_bridge_port *p = br_port_get_rcu(skb->dev);

	br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
	return 0;	 /* process further */
}

函数中调用了br_fdb_update(),传入skb的源mac地址,猜测应该是对skb的源mac地址进行学习,以维护fdb

再来看看NF_BR_PRE_ROUTING的回调函数

/* okfn for the NF_BR_PRE_ROUTING hook — the bridge's learn-and-forward
 * core. Learns the source MAC (br_fdb_update), then decides the frame's
 * fate: multicast goes through the mdb / multicast router logic,
 * unicast is looked up in the fdb — a hit on a local entry delivers the
 * frame up the stack only, a hit on a remote entry forwards it out that
 * port (br_forward), and an fdb miss floods it to all ports
 * (br_flood_forward). skb is the copy being forwarded, skb2 the copy
 * delivered to the local host; either may be NULL. */
int br_handle_frame_finish(struct sk_buff *skb)
{
	const unsigned char *dest = eth_hdr(skb)->h_dest;
	struct net_bridge_port *p = br_port_get_rcu(skb->dev);
	struct net_bridge *br;
	struct net_bridge_fdb_entry *dst;
	struct net_bridge_mdb_entry *mdst;
	struct sk_buff *skb2;

	if (!p || p->state == BR_STATE_DISABLED)
		goto drop;

	/* insert into forwarding database after filtering to avoid spoofing */
	br = p->br;
	br_fdb_update(br, p, eth_hdr(skb)->h_source);

	if (is_multicast_ether_addr(dest) &&
	    br_multicast_rcv(br, p, skb))
		goto drop;

	/* a LEARNING port updates the fdb but never forwards */
	if (p->state == BR_STATE_LEARNING)
		goto drop;

	BR_INPUT_SKB_CB(skb)->brdev = br->dev;

	/* The packet skb2 goes to the local host (NULL to skip). */
	skb2 = NULL;

	if (br->dev->flags & IFF_PROMISC)
		skb2 = skb;

	dst = NULL;

	if (is_multicast_ether_addr(dest)) {
		mdst = br_mdb_get(br, skb);
		if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
			if ((mdst && !hlist_unhashed(&mdst->mglist)) ||
			    br_multicast_is_router(br))
				skb2 = skb;
			br_multicast_forward(mdst, skb, skb2);
			skb = NULL;
			if (!skb2)
				goto out;
		} else
			skb2 = skb;

		br->dev->stats.multicast++;
	} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
		skb2 = skb;
		/* Do not forward the packet since it's local. */
		skb = NULL;
	}

	if (skb) {
		if (dst)
			br_forward(dst->dst, skb, skb2);	/* known destination port */
		else
			br_flood_forward(br, skb, skb2);	/* unknown — flood all ports */
	}

	if (skb2)
		return br_pass_frame_up(skb2);

out:
	return 0;
drop:
	kfree_skb(skb);
	goto out;
}

这里判断了是组播还是单播,并将数据包发往不同的地方

总结一下,上图