Bridge Framework - awokezhou/LinuxPage GitHub Wiki
Overview
This article analyzes the Linux Bridge source code to lay out the Bridge framework and its underlying principles, along with the data-forwarding and address-learning mechanisms. It covers the brctl command, Bridge initialization, the bridge-add operation, binding a network interface to a bridge, and the Bridge data-processing flow.
brctl Source Code Analysis
On Linux the command for configuring a bridge is brctl, whose source lives in busybox/networking/brctl.c. Its usage is as follows:
Command | Arguments | Effect |
---|---|---|
brctl addbr | br_name | Create a bridge device named "br_name" |
brctl addif | br_name if_name | Bind the network interface named "if_name" to the bridge device named "br_name" |
Take the simplest scenario: the system has two network interfaces, eth0 and eth1. How can traffic pass between the two interfaces?
brctl addbr br0
brctl addif br0 eth0
brctl addif br0 eth1
ifconfig br0 192.168.1.1 netmask 255.255.255.0 up
This binds both eth0 and eth1 to the bridge device br0. You can think of br0 as a software switch with two ports, eth0 and eth1: for every frame entering through either port, the bridge learns the source MAC and forwards by destination MAC, maintaining a bridge (forwarding) table in the kernel.
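To make the idea concrete, here is a minimal userspace sketch of that learn-then-forward logic (purely illustrative and hypothetical; the kernel's real fdb, analyzed later, is a hash table with entry aging):

#include <stdio.h>
#include <string.h>

#define MAX_FDB 16
#define MAC_LEN 6

/* toy forwarding-table entry: which port a MAC was last seen on */
struct fdb_entry {
    unsigned char mac[MAC_LEN];
    int port;
    int used;
};

static struct fdb_entry fdb[MAX_FDB];

/* learn: remember the port a source MAC arrived on */
static void fdb_learn(const unsigned char *mac, int port)
{
    int i, free_slot = -1;
    for (i = 0; i < MAX_FDB; i++) {
        if (fdb[i].used && !memcmp(fdb[i].mac, mac, MAC_LEN)) {
            fdb[i].port = port;    /* refresh existing entry */
            return;
        }
        if (!fdb[i].used && free_slot < 0)
            free_slot = i;
    }
    if (free_slot >= 0) {
        memcpy(fdb[free_slot].mac, mac, MAC_LEN);
        fdb[free_slot].port = port;
        fdb[free_slot].used = 1;
    }
}

/* forward decision: known destination -> its port, unknown -> flood (-1) */
static int fdb_lookup(const unsigned char *mac)
{
    int i;
    for (i = 0; i < MAX_FDB; i++)
        if (fdb[i].used && !memcmp(fdb[i].mac, mac, MAC_LEN))
            return fdb[i].port;
    return -1;
}

int main(void)
{
    unsigned char host_a[MAC_LEN] = { 0, 0, 0, 0, 0, 0xA };
    unsigned char host_b[MAC_LEN] = { 0, 0, 0, 0, 0, 0xB };

    fdb_learn(host_a, 0);    /* frame from A arrives on port 0 (eth0) */
    printf("to B: port %d (unknown, flood)\n", fdb_lookup(host_b));
    fdb_learn(host_b, 1);    /* B replies on port 1 (eth1) */
    printf("to B: port %d (known)\n", fdb_lookup(host_b));
    return 0;
}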
So how are these functions implemented? Let's first see what busybox's brctl does for addbr and addif.
br = *argv++;
if (key == ARG_addbr || key == ARG_delbr) { /* addbr or delbr */
ioctl_or_perror_and_die(fd,
key == ARG_addbr ? SIOCBRADDBR : SIOCBRDELBR,
br, "bridge %s", br);
goto done;
}
if (!*argv) /* all but 'addif/delif' need at least two arguments */
bb_show_usage();
strncpy(ifr.ifr_name, br, IFNAMSIZ);
if (key == ARG_addif || key == ARG_delif) { /* addif or delif */
brif = *argv;
ifr.ifr_ifindex = if_nametoindex(brif);
if (!ifr.ifr_ifindex) {
bb_perror_msg_and_die("iface %s", brif);
}
ioctl_or_perror_and_die(fd,
key == ARG_addif ? SIOCBRADDIF : SIOCBRDELIF,
&ifr, "bridge %s", br);
goto done_next_argv;
}
The code above is excerpted from the addbr/addif handling in brctl.c. Whether the input is addbr, delbr, addif or delif, ioctl_or_perror_and_die() is called to do the work, only with different arguments. ioctl_or_perror_and_die() simply issues an ioctl to pass the arguments down into the kernel, as follows:
int FAST_FUNC ioctl_or_perror_and_die(int fd, unsigned request, void *argp, const char *fmt,...)
{
int ret;
va_list p;
ret = ioctl(fd, request, argp);
if (ret < 0) {
va_start(p, fmt);
bb_verror_msg(fmt, p, strerror(errno));
/* xfunc_die can actually longjmp, so be nice */
va_end(p);
xfunc_die();
}
return ret;
}
To summarize, the ioctl request and the payload passed to the kernel for each command are:
Command | ioctl request | Payload |
---|---|---|
brctl addbr | SIOCBRADDBR | the bridge device name |
brctl delbr | SIOCBRDELBR | the bridge device name |
brctl addif | SIOCBRADDIF | an ifreq struct: the bridge device name in ifr.ifr_name, the index of the interface to bind in ifr.ifr_ifindex |
brctl delif | SIOCBRDELIF | an ifreq struct: the bridge device name in ifr.ifr_name, the index of the interface to unbind in ifr.ifr_ifindex |
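To see these ioctls in action, here is a small standalone program performing the equivalent of brctl addbr br0 followed by brctl addif br0 eth0 (a sketch: "br0" and "eth0" are example names, error handling is minimal, and root/CAP_NET_ADMIN is required):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>    /* SIOCBRADDBR, SIOCBRADDIF */
#include <net/if.h>           /* struct ifreq, if_nametoindex() */

int main(void)
{
    struct ifreq ifr;
    int fd = socket(AF_INET, SOCK_STREAM, 0);   /* any socket fd serves as the ioctl handle */
    if (fd < 0) {
        perror("socket");
        return 1;
    }

    /* brctl addbr br0: the payload is just the bridge name */
    if (ioctl(fd, SIOCBRADDBR, "br0") < 0)
        perror("SIOCBRADDBR");

    /* brctl addif br0 eth0: bridge name in ifr_name, interface index in ifr_ifindex */
    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, "br0", IFNAMSIZ);
    ifr.ifr_ifindex = if_nametoindex("eth0");
    if (!ifr.ifr_ifindex || ioctl(fd, SIOCBRADDIF, &ifr) < 0)
        perror("SIOCBRADDIF");

    return 0;
}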
Bridge Source Code Analysis
With the brctl argument passing sorted out, let's go into the kernel code and see how Bridge works.
Bridge Initialization
First, the initialization code. The Bridge code lives under net/bridge/, and its initialization is in br.c:
static int __init br_init(void)
{
int err;
err = stp_proto_register(&br_stp_proto);
if (err < 0) {
pr_err("bridge: can't register sap for STP\n");
return err;
}
err = br_fdb_init();
if (err)
goto err_out;
err = register_pernet_subsys(&br_net_ops);
if (err)
goto err_out1;
err = br_netfilter_init();
if (err)
goto err_out2;
err = register_netdevice_notifier(&br_device_notifier);
if (err)
goto err_out3;
err = br_netlink_init();
if (err)
goto err_out4;
brioctl_set(br_ioctl_deviceless_stub);
#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
br_fdb_test_addr_hook = br_fdb_test_addr;
#endif
return 0;
err_out4:
unregister_netdevice_notifier(&br_device_notifier);
err_out3:
br_netfilter_fini();
err_out2:
unregister_pernet_subsys(&br_net_ops);
err_out1:
br_fdb_fini();
err_out:
stp_proto_unregister(&br_stp_proto);
return err;
}
module_init(br_init)
module_exit(br_deinit)
As the bottom lines show, unlike the core network subsystem, Bridge is loaded into the kernel as a module. Because Bridge was selected as a statically built-in module during make menuconfig, this initialization code runs when the Linux system boots.
First, stp_proto_register() is called. I haven't studied it closely, but judging by the error message it registers the SAP for receiving STP frames, so it is Spanning Tree Protocol related.
Next comes br_fdb_init(). The fdb is important (it gets its own analysis later): it is the bridge's forwarding table, and it is initialized here:
static struct kmem_cache *br_fdb_cache __read_mostly;
int __init br_fdb_init(void)
{
br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
sizeof(struct net_bridge_fdb_entry),
0,
SLAB_HWCACHE_ALIGN, NULL);
if (!br_fdb_cache)
return -ENOMEM;
get_random_bytes(&fdb_salt, sizeof(fdb_salt));
return 0;
}
kmem_cache_create() sets up a dedicated slab cache, br_fdb_cache, from which forwarding-table entries are allocated; presumably the dedicated cache is there to speed up entry allocation and thus forwarding performance.
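For reference, here is a rough sketch of how an entry is later carved out of this cache, modeled on fdb_create() in net/bridge/br_fdb.c (the wrapper name fdb_entry_alloc and the exact field assignments are illustrative):

/* allocate one forwarding-table entry from the slab cache
 * created in br_fdb_init() */
static struct net_bridge_fdb_entry *fdb_entry_alloc(struct net_bridge_port *source,
                                                    const unsigned char *addr)
{
    struct net_bridge_fdb_entry *fdb;

    fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); /* fast slab allocation */
    if (fdb) {
        memcpy(fdb->addr.addr, addr, ETH_ALEN);       /* MAC this entry describes */
        fdb->dst = source;                            /* port the MAC was learned on */
        fdb->ageing_timer = jiffies;                  /* used to age out stale entries */
    }
    return fdb;
}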
Next register_pernet_subsys() is called; I haven't looked into it, but it registers the bridge's per-network-namespace init/exit operations.
br_netfilter_init() initializes netfilter for the bridge, attaching hook functions at several points along the packet path. Userspace has an ebtables tool that works with the bridge's netfilter hooks to implement MAC-layer filtering and forwarding rules, very much like iptables at the IP layer:
int __init br_netfilter_init(void)
{
int ret;
ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
if (ret < 0)
return ret;
#ifdef CONFIG_SYSCTL
brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
if (brnf_sysctl_header == NULL) {
printk(KERN_WARNING
"br_netfilter: can't register to sysctl.\n");
nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
return -ENOMEM;
}
#endif
printk(KERN_NOTICE "Bridge firewalling registered\n");
return 0;
}
Then register_netdevice_notifier() is called; also not studied here, but it subscribes the bridge to network-device events.
br_netlink_init() registers netlink messages, presumably providing the interface through which userspace programs such as ebtables interact with the bridge.
Finally, brioctl_set(br_ioctl_deviceless_stub) is called. This function matters too: brctl's ioctl actually ends up calling what is set here.
void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
{
mutex_lock(&br_ioctl_mutex);
br_ioctl_hook = hook;
mutex_unlock(&br_ioctl_mutex);
}
This simply points the global function pointer br_ioctl_hook at br_ioctl_deviceless_stub(). The link between the ioctl call and this hook is in sock_ioctl() in net/socket.c:
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
......
case SIOCBRADDBR:
case SIOCBRDELBR:
err = -ENOPKG;
if (!br_ioctl_hook)
request_module("bridge");
mutex_lock(&br_ioctl_mutex);
if (br_ioctl_hook)
err = br_ioctl_hook(net, cmd, argp);
mutex_unlock(&br_ioctl_mutex);
break;
......
}
In other words, when brctl's ioctl request is SIOCBRADDBR or SIOCBRDELBR, what actually runs is br_ioctl_deviceless_stub().
To summarize: Bridge is loaded into the kernel as a module, and its initialization function mainly does the following:
- Initialize the STP-related pieces
- Allocate a slab cache for the forwarding table
- Register a pernet (per-network-namespace) subsystem
- Register the netfilter hook functions
- Register a netdevice notifier
- Initialize netlink
- Install the ioctl entry point br_ioctl_deviceless_stub()
Adding/Deleting a Bridge Device
From the analysis so far, to understand how the kernel adds or deletes a bridge device we need to analyze br_ioctl_deviceless_stub():
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
switch (cmd) {
case SIOCGIFBR:
case SIOCSIFBR:
return old_deviceless(net, uarg);
case SIOCBRADDBR:
case SIOCBRDELBR:
{
char buf[IFNAMSIZ];
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (copy_from_user(buf, uarg, IFNAMSIZ))
return -EFAULT;
buf[IFNAMSIZ-1] = 0;
if (cmd == SIOCBRADDBR)
return br_add_bridge(net, buf);
return br_del_bridge(net, buf);
}
}
return -EOPNOTSUPP;
}
The function's last parameter, uarg, is the value passed down from userspace, i.e. the name of the bridge device; it is copied into buf via copy_from_user(). An add operation then calls br_add_bridge(); a delete calls br_del_bridge().
Adding a Bridge Device
The function that adds a bridge device is br_add_bridge(); its code is as follows:
int br_add_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int ret;
dev = new_bridge_dev(net, name);
if (!dev)
return -ENOMEM;
rtnl_lock();
if (strchr(dev->name, '%')) {
ret = dev_alloc_name(dev, dev->name);
if (ret < 0)
goto out_free;
}
SET_NETDEV_DEVTYPE(dev, &br_type);
ret = register_netdevice(dev);
if (ret)
goto out_free;
ret = br_sysfs_addbr(dev);
if (ret)
unregister_netdevice(dev);
out:
rtnl_unlock();
return ret;
out_free:
free_netdev(dev);
goto out;
}
By the time new_bridge_dev() returns, a network device has already been created. Note that a bridge device uses the same net_device structure as any other network device.
The SET_NETDEV_DEVTYPE() macro sets the net_device's type to &br_type. Here is what it does:
#define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype))
So it simply assigns &br_type to the device's type field. And what is br_type?
static struct device_type br_type = {
.name = "bridge",
};
In effect the type field is used as a pointer, recording the address of the global br_type structure.
Then register_netdevice() registers the newly created bridge device with the kernel, and br_sysfs_addbr() adds the bridge to the sys filesystem; I haven't dug into these.
So the core of the add operation is wrapped inside new_bridge_dev(). Let's see how it is implemented:
static struct net_device *new_bridge_dev(struct net *net, const char *name)
{
struct net_bridge *br;
struct net_device *dev;
dev = alloc_netdev(sizeof(struct net_bridge), name,
br_dev_setup);
if (!dev)
return NULL;
dev_net_set(dev, net);
br = netdev_priv(dev);
br->dev = dev;
br->stats = alloc_percpu(struct br_cpu_netstats);
if (!br->stats) {
free_netdev(dev);
return NULL;
}
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
spin_lock_init(&br->hash_lock);
br->bridge_id.prio[0] = 0x80;
br->bridge_id.prio[1] = 0x00;
memcpy(br->group_addr, br_group_address, ETH_ALEN);
br->feature_mask = dev->features;
br->stp_enabled = BR_NO_STP;
br->designated_root = br->bridge_id;
br->root_path_cost = 0;
br->root_port = 0;
br->bridge_max_age = br->max_age = 20 * HZ;
br->bridge_hello_time = br->hello_time = 2 * HZ;
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
br->topology_change = 0;
br->topology_change_detected = 0;
br->ageing_time = 300 * HZ;
br_netfilter_rtable_init(br);
br_stp_timer_init(br);
br_multicast_init(br);
return dev;
}
It first calls alloc_netdev() to allocate memory for the net_device structure (we will step into that allocation in a moment), then calls netdev_priv() to obtain the bridge's private-data pointer from the net_device and points the bridge's dev field back at the net_device. Simply put, the net_bridge lives inside the net_device's private area, and br->dev points back at the enclosing net_device.
It then initializes two spinlocks and a list, and sets a long series of parameters inside net_bridge.
Let's focus on alloc_netdev(). The arguments are the size of a net_bridge structure, the name of the bridge device to create, and the br_dev_setup() function:
#define alloc_netdev(sizeof_priv, name, setup) \
alloc_netdev_mq(sizeof_priv, name, setup, 1)
This macro calls alloc_netdev_mq(), appending an extra argument of 1.
/**
* alloc_netdev_mq - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device
* @queue_count: the number of subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subquue structs
* for each queue on the device at the end of the netdevice.
*/
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
void (*setup)(struct net_device *), unsigned int queue_count)
{
struct netdev_queue *tx;
struct net_device *dev;
size_t alloc_size;
struct net_device *p;
#ifdef CONFIG_RPS
struct netdev_rx_queue *rx;
int i;
#endif
BUG_ON(strlen(name) >= sizeof(dev->name));
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
alloc_size += sizeof_priv;
}
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
p = kzalloc(alloc_size, GFP_KERNEL);
if (!p) {
printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
return NULL;
}
tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
if (!tx) {
printk(KERN_ERR "alloc_netdev: Unable to allocate "
"tx qdiscs.\n");
goto free_p;
}
#ifdef CONFIG_RPS
rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
if (!rx) {
printk(KERN_ERR "alloc_netdev: Unable to allocate "
"rx queues.\n");
goto free_tx;
}
atomic_set(&rx->count, queue_count);
/*
* Set a pointer to first element in the array which holds the
* reference count.
*/
for (i = 0; i < queue_count; i++)
rx[i].first = rx;
#endif
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
if (dev_addr_init(dev))
goto free_rx;
dev_mc_init(dev);
dev_uc_init(dev);
dev_net_set(dev, &init_net);
dev->_tx = tx;
dev->num_tx_queues = queue_count;
dev->real_num_tx_queues = queue_count;
#ifdef CONFIG_RPS
dev->_rx = rx;
dev->num_rx_queues = queue_count;
#endif
dev->gso_max_size = GSO_MAX_SIZE;
netdev_init_queues(dev);
INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
dev->ethtool_ntuple_list.count = 0;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->link_watch_list);
dev->priv_flags = IFF_XMIT_DST_RELEASE;
setup(dev);
strcpy(dev->name, name);
return dev;
free_rx:
#ifdef CONFIG_RPS
kfree(rx);
free_tx:
#endif
kfree(tx);
free_p:
kfree(p);
return NULL;
}
From the comment block above the function we can see that it allocates the memory for a network device, with an optional private-data area attached. The parameters mean:
Parameter | Meaning |
---|---|
sizeof_priv | size of the private-data area to allocate |
name | device name as a string |
setup | callback used to initialize the device |
queue_count | number of queues to allocate |
Here queue_count is fixed at 1.
The function first computes the size of the net_device allocation: the net_device structure rounded up to 32-byte (NETDEV_ALIGN) alignment, plus the size of the net_bridge structure. It then allocates that block with kzalloc() and the queues with kcalloc(), attaches the queues to the net_device, and initializes several lists. The resulting memory layout is sketched below.
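A rough sketch of the allocated block (the leading padding comes from aligning dev itself; dev->padded records it so the block can be freed later):

p (from kzalloc)
|- padding -|------ struct net_device ------|- align -|---- struct net_bridge (private area) ----|
            ^ dev = PTR_ALIGN(p, NETDEV_ALIGN)                  ^ netdev_priv(dev)

(the TX queues are allocated separately with kcalloc() and linked in via dev->_tx)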
Finally the setup function passed in, br_dev_setup(), is called to do some initialization of its own:
void br_dev_setup(struct net_device *dev)
{
random_ether_addr(dev->dev_addr);
ether_setup(dev);
dev->netdev_ops = &br_netdev_ops;
dev->destructor = br_dev_free;
SET_ETHTOOL_OPS(dev, &br_ethtool_ops);
dev->tx_queue_len = 0;
dev->priv_flags = IFF_EBRIDGE;
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX |
NETIF_F_NETNS_LOCAL | NETIF_F_GSO;
}
It first sets the device MAC address to a random value via random_ether_addr(), then calls ether_setup():
void ether_setup(struct net_device *dev)
{
dev->header_ops = ð_header_ops;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
dev->addr_len = ETH_ALEN;
dev->tx_queue_len = 1000; /* Ethernet wants good queues */
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
memset(dev->broadcast, 0xFF, ETH_ALEN);
}
ether_setup() fills in a number of net_device fields with Ethernet defaults, such as the MTU and the TX queue length.
Back in br_dev_setup(), the netdev_ops and ethtool_ops are then attached.
At this point the entire process of creating a bridge device with brctl addbr is complete. Notice, though, that none of this code has anything to do with packet processing, learning or forwarding: a bridge device created without any bound network interface effectively does nothing.
Deleting a Bridge Device
Deletion is much simpler than addition. The code is:
int br_del_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int ret = 0;
rtnl_lock();
dev = __dev_get_by_name(net, name);
if (dev == NULL)
ret = -ENXIO; /* Could not find device */
else if (!(dev->priv_flags & IFF_EBRIDGE)) {
/* Attempt to delete non bridge device! */
ret = -EPERM;
}
else if (dev->flags & IFF_UP) {
/* Not shutdown yet. */
ret = -EBUSY;
}
else
del_br(netdev_priv(dev), NULL);
rtnl_unlock();
return ret;
}
It calls __dev_get_by_name() to find the net_device corresponding to the name passed in from userspace:
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
struct hlist_node *p;
struct net_device *dev;
struct hlist_head *head = dev_name_hash(net, name);
hlist_for_each_entry(dev, p, head, name_hlist)
if (!strncmp(dev->name, name, IFNAMSIZ))
return dev;
return NULL;
}
The implementation just searches the "name_hlist" list for the structure matching name and returns it. Nothing in the add path touched name_hlist directly; a global search shows the insertion is done inside register_netdevice().
Then del_br() completes the operation. Internally it mainly calls br_sysfs_delbr() to remove the sysfs entries and unregister_netdevice_queue() to unregister the device and its queues, after detaching every port with del_nbp():
static void del_br(struct net_bridge *br, struct list_head *head)
{
struct net_bridge_port *p, *n;
list_for_each_entry_safe(p, n, &br->port_list, list) {
del_nbp(p);
}
del_timer_sync(&br->gc_timer);
br_sysfs_delbr(br->dev);
unregister_netdevice_queue(br->dev, head);
}
Adding/Removing a Network Interface on a Bridge
The bridge-add operation above attaches br_netdev_ops to the bridge's net_device:
static const struct net_device_ops br_netdev_ops = {
.ndo_open = br_dev_open,
.ndo_stop = br_dev_stop,
.ndo_start_xmit = br_dev_xmit,
.ndo_get_stats64 = br_get_stats64,
.ndo_set_mac_address = br_set_mac_address,
.ndo_set_multicast_list = br_dev_set_multicast_list,
.ndo_change_mtu = br_change_mtu,
.ndo_do_ioctl = br_dev_ioctl,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_netpoll_setup = br_netpoll_setup,
.ndo_netpoll_cleanup = br_netpoll_cleanup,
.ndo_poll_controller = br_poll_controller,
#endif
};
Among them is br_dev_ioctl; stepping into it, we see:
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
struct net_bridge *br = netdev_priv(dev);
switch(cmd) {
case SIOCDEVPRIVATE:
return old_dev_ioctl(dev, rq, cmd);
case SIOCBRADDIF:
case SIOCBRDELIF:
return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
}
br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
return -EOPNOTSUPP;
}
Right here are the SIOCBRADDIF and SIOCBRDELIF ioctl requests, so we can infer that brctl addif br0 if_name eventually finds its way here. But first one question needs answering: the system may have several bridge devices created in advance, so how does brctl addif br0 eth0 attach eth0 to br0 rather than to br1 or br2?
When we analyzed the brctl command at the start, we looked at what brctl addif passes: an ifr goes down to the kernel, with ifr.ifr_name holding the bridge device's name and ifr.ifr_ifindex holding the index of the network interface to bind. This detail is important.
Let's search globally for every place SIOCBRADDIF appears:
---- SIOCBRADDIF Matches (6 in 4 files) ----
Br_ioctl.c (linux-2.6.36.x\net\bridge): case SIOCBRADDIF:
Br_ioctl.c (linux-2.6.36.x\net\bridge): return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
Dev.c (linux-2.6.36.x\net\core): cmd == SIOCBRADDIF ||
Dev.c (linux-2.6.36.x\net\core): case SIOCBRADDIF:
Socket.c (linux-2.6.36.x\net): case SIOCBRADDIF:
Sockios.h (linux-2.6.36.x\include\linux):#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
It appears in socket.c, dev.c and br_ioctl.c. Following this requires a rough picture of how a Linux system call travels. How exactly does the ioctl system call proceed? I'm not entirely sure of every step, but it must descend from the most abstract layer down to the most specific one:
syscall
**********************************************
net dev char dev block dev
**********************************************
br_ioctl... gpio_ioctl... flash_ioctl...
It's like using read() and write() from the application layer: how does the kernel know what we are actually reading, a file on a filesystem or a device driver? It is actually distinguished by name and device number. Character, block and network devices each have their own identifier, and the arguments passed to the system call pin down which kind of device is being accessed.
Doing the ioctl on an fd returned by socket() naturally targets a network device; no question there. But network devices themselves come in many kinds (bridges, network interfaces, virtual interfaces), so how does the kernel know which specific one is meant? It distinguishes by the ioctl request and the ifr passed in. From this we can roughly infer that the execution flow of this ioctl is:
socket.c -> dev.c -> br_ioctl.c
Let's first look at the ioctl handling in socket.c:
case SIOCSIFTXQLEN:
case SIOCBRADDIF:
case SIOCBRDELIF:
case SIOCSIFNAME:
case SIOCGMIIPHY:
case SIOCGMIIREG:
case SIOCSMIIREG:
return dev_ifsioc(net, sock, cmd, argp);
For any of these cmds, dev_ifsioc() is called; stepping in:
static int dev_ifsioc(struct net *net, struct socket *sock,
unsigned int cmd, struct compat_ifreq __user *uifr32)
{
struct ifreq __user *uifr;
int err;
uifr = compat_alloc_user_space(sizeof(*uifr));
if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
return -EFAULT;
err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
if (!err) {
switch (cmd) {
case SIOCGIFFLAGS:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
case SIOCGIFMEM:
case SIOCGIFHWADDR:
case SIOCGIFINDEX:
case SIOCGIFADDR:
case SIOCGIFBRDADDR:
case SIOCGIFDSTADDR:
case SIOCGIFNETMASK:
case SIOCGIFPFLAGS:
case SIOCGIFTXQLEN:
case SIOCGMIIPHY:
case SIOCGMIIREG:
if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
err = -EFAULT;
break;
}
}
return err;
}
This function uses copy_in_user() to land the ifr passed from userspace in a block of memory tracked by the uifr pointer, then calls sock_do_ioctl(), passing in uifr:
static long sock_do_ioctl(struct net *net, struct socket *sock,
unsigned int cmd, unsigned long arg)
{
int err;
void __user *argp = (void __user *)arg;
err = sock->ops->ioctl(sock, cmd, arg);
/*
* If this ioctl is unknown try to hand it down
* to the NIC driver.
*/
if (err == -ENOIOCTLCMD)
err = dev_ioctl(net, cmd, argp);
return err;
}
Here sock->ops->ioctl() is called to handle the request, and if it returns ENOIOCTLCMD, dev_ioctl() is called instead. But how do we know which function sock->ops->ioctl() is, and what it returns? That requires understanding what the kernel does when the application calls socket(), i.e. how the socket is created and initialized, in order to pin down which function sock->ops->ioctl points at; we won't expand on that analysis here.
In short, the kernel decides which ops to attach based on the arguments passed to socket(domain, type, proto). Because brctl creates its socket with "AF_INET, SOCK_STREAM, 0", this ops ends up pointing at inet_stream_ops:
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
.connect = inet_stream_connect,
.socketpair = sock_no_socketpair,
.accept = inet_accept,
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
.splice_read = tcp_splice_read,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
EXPORT_SYMBOL(inet_stream_ops);
As we can see, the ioctl hung off inet_stream_ops is inet_ioctl. Let's see what it does:
int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
int err = 0;
struct net *net = sock_net(sk);
switch (cmd) {
case SIOCGSTAMP:
err = sock_get_timestamp(sk, (struct timeval __user *)arg);
break;
case SIOCGSTAMPNS:
err = sock_get_timestampns(sk, (struct timespec __user *)arg);
break;
case SIOCADDRT:
case SIOCDELRT:
case SIOCRTMSG:
err = ip_rt_ioctl(net, cmd, (void __user *)arg);
break;
case SIOCDARP:
case SIOCGARP:
case SIOCSARP:
err = arp_ioctl(net, cmd, (void __user *)arg);
break;
case SIOCGIFADDR:
case SIOCSIFADDR:
case SIOCGIFBRDADDR:
case SIOCSIFBRDADDR:
case SIOCGIFNETMASK:
case SIOCSIFNETMASK:
case SIOCGIFDSTADDR:
case SIOCSIFDSTADDR:
case SIOCSIFPFLAGS:
case SIOCGIFPFLAGS:
case SIOCSIFFLAGS:
err = devinet_ioctl(net, cmd, (void __user *)arg);
break;
default:
if (sk->sk_prot->ioctl)
err = sk->sk_prot->ioctl(sk, cmd, arg);
else
err = -ENOIOCTLCMD;
break;
}
return err;
}
EXPORT_SYMBOL(inet_ioctl);
It essentially just matches cmd; if nothing matches (and the protocol's own ioctl doesn't handle it), it returns ENOIOCTLCMD, which connects back to the ENOIOCTLCMD check above.
So sock_do_ioctl() goes on to call dev_ioctl():
case SIOCBONDSETHWADDR:
case SIOCBONDCHANGEACTIVE:
case SIOCBRADDIF:
case SIOCBRDELIF:
case SIOCSHWTSTAMP:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
/* fall through */
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
dev_load(net, ifr.ifr_name);
rtnl_lock();
ret = dev_ifsioc(net, &ifr, cmd);
rtnl_unlock();
return ret;
We have now entered dev.c, the network-device ioctl layer. Here dev_load() and dev_ifsioc() are called:
void dev_load(struct net *net, const char *name)
{
struct net_device *dev;
rcu_read_lock();
dev = dev_get_by_name_rcu(net, name);
rcu_read_unlock();
if (!dev && capable(CAP_NET_ADMIN))
request_module("%s", name);
}
dev_load() calls dev_get_by_name_rcu() to find the device by name; here that name is the bridge device name given to brctl. Let's see how dev_get_by_name_rcu() searches:
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
struct hlist_node *p;
struct net_device *dev;
struct hlist_head *head = dev_name_hash(net, name);
hlist_for_each_entry_rcu(dev, p, head, name_hlist)
if (!strncmp(dev->name, name, IFNAMSIZ))
return dev;
return NULL;
}
It looks the name up in the name_hlist list. As mentioned in the bridge-add analysis, registering the bridge as a network device inserts it into name_hlist, so here the bridge name finds the device.
Then dev_ifsioc() is called:
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
const struct net_device_ops *ops;
if (!dev)
return -ENODEV;
ops = dev->netdev_ops;
switch (cmd) {
......
/*
* Unknown or private ioctl
*/
default:
if ((cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15) ||
cmd == SIOCBONDENSLAVE ||
cmd == SIOCBONDRELEASE ||
cmd == SIOCBONDSETHWADDR ||
cmd == SIOCBONDSLAVEINFOQUERY ||
cmd == SIOCBONDINFOQUERY ||
cmd == SIOCBONDCHANGEACTIVE ||
cmd == SIOCGMIIPHY ||
cmd == SIOCGMIIREG ||
cmd == SIOCSMIIREG ||
cmd == SIOCBRADDIF ||
cmd == SIOCBRDELIF ||
cmd == SIOCSHWTSTAMP ||
cmd == SIOCWANDEV) {
err = -EOPNOTSUPP;
if (ops->ndo_do_ioctl) {
if (netif_device_present(dev))
err = ops->ndo_do_ioctl(dev, ifr, cmd);
else
err = -ENODEV;
}
} else
err = -EINVAL;
}
return err;
}
It first calls __dev_get_by_name() to obtain the net_device for the bridge name, then in the switch statement it checks whether dev->netdev_ops->ndo_do_ioctl exists. Because the bridge device's net_device has br_netdev_ops attached, it does exist, so ndo_do_ioctl is executed, that is, br_netdev_ops->ndo_do_ioctl, which is br_dev_ioctl().
To summarize: when userspace runs brctl addif br0 eth0, the ioctl() system call, guided by the ioctl request and the ifr argument, starts its lookup at the generic socket layer, passes through the net-device ioctl layer, and finally finds the net_device for the bridge name "br0". That device's net_device_ops is br_netdev_ops, so the call ultimately lands in br_dev_ioctl().
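In the ASCII style used earlier, the chain looks like this (simplified; the dev_ifsioc() quoted from socket.c above is the 32-bit compat variant, but the path is the same):

ioctl(fd, SIOCBRADDIF, &ifr)            userspace (brctl)
  -> sock_ioctl()                       net/socket.c
  -> sock_do_ioctl() -> inet_ioctl()    returns -ENOIOCTLCMD
  -> dev_ioctl() -> dev_ifsioc()        net/core/dev.c, finds dev by ifr.ifr_name
  -> ndo_do_ioctl == br_dev_ioctl()     net/bridge/br_ioctl.c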
br_dev_ioctl
Stepping into this function for analysis:
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
struct net_bridge *br = netdev_priv(dev);
switch(cmd) {
case SIOCDEVPRIVATE:
return old_dev_ioctl(dev, rq, cmd);
case SIOCBRADDIF:
case SIOCBRDELIF:
return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
}
br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
return -EOPNOTSUPP;
}
It first calls netdev_priv() to obtain the private device structure net_bridge from the net_device. Both add and delete then call add_del_if(), passing ifr.ifr_ifindex, the index of the network interface to attach, which determines exactly which network device gets bound:
static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
{
struct net_device *dev;
int ret;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
dev = __dev_get_by_index(dev_net(br->dev), ifindex);
if (dev == NULL)
return -EINVAL;
if (isadd)
ret = br_add_if(br, dev);
else
ret = br_del_if(br, dev);
return ret;
}
__dev_get_by_index() fetches the network interface's device structure by index; then an add calls br_add_if() and a delete calls br_del_if().
Let's look at br_add_if first:
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
int err = 0;
/* Don't allow bridging non-ethernet like devices */
if ((dev->flags & IFF_LOOPBACK) ||
dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN)
return -EINVAL;
/* No bridging of bridges */
if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
return -ELOOP;
/* Device is already being bridged */
if (br_port_exists(dev))
return -EBUSY;
/* No bridging devices that dislike that (e.g. wireless) */
if (dev->priv_flags & IFF_DONT_BRIDGE)
return -EOPNOTSUPP;
p = new_nbp(br, dev);
if (IS_ERR(p))
return PTR_ERR(p);
err = dev_set_promiscuity(dev, 1);
if (err)
goto put_back;
err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
SYSFS_BRIDGE_PORT_ATTR);
if (err)
goto err0;
err = br_fdb_insert(br, p, dev->dev_addr);
if (err)
goto err1;
err = br_sysfs_addif(p);
if (err)
goto err2;
if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
goto err3;
err = netdev_rx_handler_register(dev, br_handle_frame, p);
if (err)
goto err3;
dev->priv_flags |= IFF_BRIDGE_PORT;
dev_disable_lro(dev);
list_add_rcu(&p->list, &br->port_list);
spin_lock_bh(&br->lock);
br_stp_recalculate_bridge_id(br);
br_features_recompute(br);
if ((dev->flags & IFF_UP) && netif_carrier_ok(dev) &&
(br->dev->flags & IFF_UP))
br_stp_enable_port(p);
spin_unlock_bh(&br->lock);
br_ifinfo_notify(RTM_NEWLINK, p);
dev_set_mtu(br->dev, br_min_mtu(br));
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
err3:
sysfs_remove_link(br->ifobj, p->dev->name);
err2:
br_fdb_delete_by_port(br, p, 1);
err1:
kobject_put(&p->kobj);
p = NULL; /* kobject_put frees */
err0:
dev_set_promiscuity(dev, -1);
put_back:
dev_put(dev);
kfree(p);
return err;
}
It first checks whether the network interface to be bound qualifies; per the comment, non-Ethernet-like devices cannot be bridged.
It then checks that the device is not itself a bridge (no bridging of bridges), that it is not already attached to a bridge, and that it does not forbid bridging (e.g. some wireless devices). Next, new_nbp() is called with the bridge's private structure and the interface's net_device, returning a net_bridge_port pointer:
static struct net_bridge_port *new_nbp(struct net_bridge *br,
struct net_device *dev)
{
int index;
struct net_bridge_port *p;
index = find_portno(br);
if (index < 0)
return ERR_PTR(index);
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return ERR_PTR(-ENOMEM);
p->br = br;
dev_hold(dev);
p->dev = dev;
p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
p->port_no = index;
p->flags = 0;
br_init_port(p);
p->state = BR_STATE_DISABLED;
br_stp_port_timer_init(p);
br_multicast_add_port(p);
return p;
}
new_nbp() calls find_portno() to get a free port number on the bridge, then allocates the net_bridge_port structure and fills it in.
Back in br_add_if(), br_fdb_insert() inserts the interface's own MAC address into the forwarding table as a local entry, and br_netpoll_enable() enables netpoll.
Then netdev_rx_handler_register() registers br_handle_frame() on the rx_handler of the interface being bound. This step is crucial: it is the reason the bridge receives an interface's packets once that interface is bound to it. Let's see how the registration is done:
int netdev_rx_handler_register(struct net_device *dev,
rx_handler_func_t *rx_handler,
void *rx_handler_data)
{
ASSERT_RTNL();
if (dev->rx_handler)
return -EBUSY;
rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
rcu_assign_pointer(dev->rx_handler, rx_handler);
return 0;
}
The first parameter is the net_device of the interface being bound, the second is a function pointer, and the third is the data for that function. The kernel comments say this registers a receive handler for the device, and that the handler will be invoked from __netif_receive_skb().
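To make the mechanism concrete, here is a minimal sketch of a hypothetical module claiming a device's rx_handler the same way br_add_if() does (my_handle_frame and my_attach are made-up names; with this 2.6.36-era signature the handler returns the skb to let processing continue, or NULL if it consumed the packet, which is exactly how br_handle_frame behaves):

/* hypothetical module sketch, not bridge code */
static struct sk_buff *my_handle_frame(struct sk_buff *skb)
{
    /* inspect or steal the packet here; the bridge enters its
     * forwarding path at this exact point via br_handle_frame() */
    return skb;            /* not consumed: __netif_receive_skb() continues */
}

static int my_attach(struct net_device *dev, void *priv)
{
    int err;

    rtnl_lock();           /* netdev_rx_handler_register() asserts RTNL */
    err = netdev_rx_handler_register(dev, my_handle_frame, priv);
    rtnl_unlock();
    return err;            /* -EBUSY if a bridge (or other handler) already owns dev */
}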
__netif_receive_skb
__netif_receive_skb needs little introduction: it is the entry point through which NIC drivers hand skbs to the kernel protocol stack, and where skbs get dispatched to their various destinations. Let's take a concrete look:
/* Handle special case of bridge or macvlan */
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
skb = rx_handler(skb);
if (!skb)
goto out;
}
Above is the rx_handler-related fragment of __netif_receive_skb. Before this point the kernel has done the netpoll receive and walked the ptype_all list; after it come the VLAN receive and the ptype_base list walk.
So when an interface bound to a bridge receives a packet, this rx_handler fires and br_handle_frame() is called, taking the packet into the bridge's processing path.
Bridge Data Processing
struct sk_buff *br_handle_frame(struct sk_buff *skb)
{
struct net_bridge_port *p;
const unsigned char *dest = eth_hdr(skb)->h_dest;
int (*rhook)(struct sk_buff *skb);
if (skb->pkt_type == PACKET_LOOPBACK)
return skb;
if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
goto drop;
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
return NULL;
p = br_port_get_rcu(skb->dev);
if (unlikely(is_link_local(dest))) {
/* Pause frames shouldn't be passed up by driver anyway */
if (skb->protocol == htons(ETH_P_PAUSE))
goto drop;
/* If STP is turned off, then forward */
if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
goto forward;
if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
NULL, br_handle_local_finish))
return NULL; /* frame consumed by filter */
else
return skb; /* continue processing */
}
forward:
switch (p->state) {
case BR_STATE_FORWARDING:
rhook = rcu_dereference(br_should_route_hook);
if (rhook != NULL) {
if (rhook(skb))
return skb;
dest = eth_hdr(skb)->h_dest;
}
/* fall through */
case BR_STATE_LEARNING:
if (!compare_ether_addr(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL,
br_handle_frame_finish);
break;
default:
drop:
kfree_skb(skb);
}
return NULL;
}
After some preliminary checks, br_port_get_rcu() is called to obtain the net_bridge_port. It really just returns the previously registered data from dev->rx_handler_data, i.e. the bridge's net_bridge_port structure registered when the interface was bound to the bridge.
Next, is_link_local() checks whether the packet is addressed to a reserved link-local group address (frames such as STP BPDUs and pause frames, which are handled locally rather than forwarded normally). Here is the code:
static inline int is_link_local(const unsigned char *dest)
{
__be16 *a = (__be16 *)dest;
static const __be16 *b = (const __be16 *)br_group_address;
static const __be16 m = cpu_to_be16(0xfff0);
return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
}
It compares the skb's destination MAC against a preset global, br_group_address, whose value is:
const u8 br_group_address[ETH_ALEN] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
That is, destination MACs in the range 01:80:C2:00:00:00 through 01:80:C2:00:00:0F (the mask 0xfff0 ignores the low nibble of the last byte) are treated as link-local, and for these a hook is invoked: NF_HOOK at hook point NF_BR_LOCAL_IN, with br_handle_local_finish as the callback.
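The mask arithmetic is easy to verify with a standalone userspace re-implementation (a sketch: __be16/cpu_to_be16 are replaced with uint16_t/htons for userspace):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static const uint8_t br_group_address[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };

static int is_link_local(const unsigned char *dest)
{
    const uint16_t *a = (const uint16_t *)dest;
    const uint16_t *b = (const uint16_t *)br_group_address;
    const uint16_t m = htons(0xfff0);    /* compare all but the low nibble of byte 5 */
    return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
}

int main(void)
{
    unsigned char stp[6]   = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; /* STP group */
    unsigned char pause[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x01 }; /* MAC pause */
    unsigned char uni[6]   = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }; /* plain unicast */

    printf("stp=%d pause=%d unicast=%d\n",
           is_link_local(stp), is_link_local(pause), is_link_local(uni));
    /* prints: stp=1 pause=1 unicast=0 */
    return 0;
}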
If the frame is not link-local, a hook is likewise invoked, this time at hook point NF_BR_PRE_ROUTING with br_handle_frame_finish as the callback.
These NF_HOOK() calls work on the same principle as netfilter: fixed spots are reserved at certain points along the kernel's data path, other modules can attach extra functions at those spots, and when a packet reaches one of them it is handed to those functions.
The NF_BR_LOCAL_IN callback is as follows:
static int br_handle_local_finish(struct sk_buff *skb)
{
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
br_fdb_update(p->br, p, eth_hdr(skb)->h_source);
return 0; /* process further */
}
The function calls br_fdb_update() with the skb's source MAC address; presumably this learns the source MAC in order to maintain the fdb.
Now let's look at the NF_BR_PRE_ROUTING callback:
int br_handle_frame_finish(struct sk_buff *skb)
{
const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
struct net_bridge *br;
struct net_bridge_fdb_entry *dst;
struct net_bridge_mdb_entry *mdst;
struct sk_buff *skb2;
if (!p || p->state == BR_STATE_DISABLED)
goto drop;
/* insert into forwarding database after filtering to avoid spoofing */
br = p->br;
br_fdb_update(br, p, eth_hdr(skb)->h_source);
if (is_multicast_ether_addr(dest) &&
br_multicast_rcv(br, p, skb))
goto drop;
if (p->state == BR_STATE_LEARNING)
goto drop;
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
/* The packet skb2 goes to the local host (NULL to skip). */
skb2 = NULL;
if (br->dev->flags & IFF_PROMISC)
skb2 = skb;
dst = NULL;
if (is_multicast_ether_addr(dest)) {
mdst = br_mdb_get(br, skb);
if (mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) {
if ((mdst && !hlist_unhashed(&mdst->mglist)) ||
br_multicast_is_router(br))
skb2 = skb;
br_multicast_forward(mdst, skb, skb2);
skb = NULL;
if (!skb2)
goto out;
} else
skb2 = skb;
br->dev->stats.multicast++;
} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
skb2 = skb;
/* Do not forward the packet since it's local. */
skb = NULL;
}
if (skb) {
if (dst)
br_forward(dst->dst, skb, skb2);
else
br_flood_forward(br, skb, skb2);
}
if (skb2)
return br_pass_frame_up(skb2);
out:
return 0;
drop:
kfree_skb(skb);
goto out;
}
Here the source MAC is first learned into the fdb, then the code checks whether the destination is multicast or unicast and sends the packet to different places accordingly: multicast goes through br_multicast_forward(), a known unicast destination goes out via br_forward(), an unknown destination is flooded with br_flood_forward(), and frames addressed to the local host are passed up the stack with br_pass_frame_up().
To summarize, the complete receive path we just walked through is sketched below.
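(All functions below were analyzed above; the path is for a frame arriving on a bridged port.)

NIC driver
  -> __netif_receive_skb()                 net/core/dev.c
  -> rx_handler == br_handle_frame()       registered by br_add_if()
  -> NF_BR_PRE_ROUTING hook -> br_handle_frame_finish()
       |- br_fdb_update()                  learn the source MAC
       |- __br_fdb_get(dest)
       |    known unicast   -> br_forward(dst->dst, skb)
       |    unknown unicast -> br_flood_forward(br, skb)
       |    multicast       -> br_multicast_forward(...)
       '- local address     -> br_pass_frame_up(skb2)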