邻居子系统使IP层发包时无需感知MAC地址,即MAC头的封装由邻居子系统完成。MAC头信息包括:源MAC、目的MAC、协议类型。其中协议类型由上层指定(例如IPv4);源MAC地址是出口设备的MAC地址(出口设备在路由表中确定);目的MAC由邻居子系统提供。大致可以猜到,邻居子系统会主动发起ARP请求获取目的MAC地址,从而完成MAC封包。IP层发包最后会调用ip_finish_output2函数,我们从该函数入手分析邻居子系统。
ip_finish_output2函数
static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb)首先会根据出口设备和目的IP地址,查找是否已经存在邻居项,如果没有则创建邻居项,然后通过dst_neigh_output发包,本文分析假设没有邻居项。 先邻居项的查找函数:
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev; //出口设备
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
u32 nexthop;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (!skb2) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
consume_skb(skb);
skb = skb2;
}
rcu_read_lock_bh();
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);<span style="white-space:pre"> </span>//目的IP地址
neigh = __ipv4_neigh_lookup_noref(dev, nexthop); //根据目的IP查找邻居项是否存在
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); //如果不存在,则创建neigh项
if (!IS_ERR(neigh)) {
int res = dst_neigh_output(dst, neigh, skb); //调用邻居子系统封装MAC头,并且调用二层发包函数完成报文发送
rcu_read_unlock_bh();
return res;
}
rcu_read_unlock_bh();
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb(skb);
return -EINVAL;
}
__ipv4_neigh_lookup_noref函数
/*
 * __ipv4_neigh_lookup_noref - find the IPv4 neighbour entry for @key on @dev.
 * @dev: device the neighbour must be attached to
 * @key: 32-bit lookup key (the IPv4 address)
 *
 * Thin wrapper that searches arp_tbl through ___neigh_lookup_noref() with the
 * 32-bit key comparison helper and the ARP hash function.
 *
 * Return: whatever ___neigh_lookup_noref() returns.
 */
static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key)
{
	/* IPv4 lookups always go to arp_tbl. */
	return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev);
}

/* ___neigh_lookup_noref function */
static inline struct neighbour *___neigh_lookup_noref(邻居表项查找比较简单,就是在hash表中查找匹配设备和目的IP地址的邻居表项,该函数支持IPV6, 可扩展性通过参数实现,接下来看下创建邻居表项的实现:
struct neigh_table *tbl,
bool (*key_eq)(const struct neighbour *n, const void *pkey),
__u32 (*hash)(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd),
const void *pkey,
struct net_device *dev)
{
struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht); //hash表,邻居数量大时加速
struct neighbour *n;
u32 hash_val;
hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); //计算hash值
for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
n != NULL;
n = rcu_dereference_bh(n->next)) {
if (n->dev == dev && key_eq(n, pkey)) //dev相同并且pkey相同,这里pkey是IPV4地址
return n;
}
return NULL;
}
__neigh_create函数
/*
 * __neigh_create - create a neighbour entry and insert it into @tbl.
 * @tbl:      neighbour table that will own the entry
 * @pkey:     protocol key, tbl->key_len bytes long
 * @dev:      device the neighbour is reached through
 * @want_ref: when true, an extra reference is taken on the returned entry
 *
 * Allocates an entry with neigh_alloc(), runs the table, device and parms
 * setup hooks, then takes tbl->lock for writing and inserts the entry at the
 * head of its hash bucket.  If an entry with the same device and key is
 * already in the bucket, that existing entry is returned and the newly
 * allocated one is released.
 *
 * Return: the neighbour entry, or an ERR_PTR() value: -ENOBUFS when the
 * allocation fails, the hook's error code when a setup hook fails, or
 * -EINVAL when the entry's parms are marked dead.
 */
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
				 struct net_device *dev, bool want_ref)
{
	u32 hash_val;
	int key_len = tbl->key_len;
	int error;
	/* Allocate the neighbour entry object. */
	struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
	struct neigh_hash_table *nht;

	if (!n) {
		rc = ERR_PTR(-ENOBUFS);
		goto out;
	}

	memcpy(n->primary_key, pkey, key_len);
	n->dev = dev;
	dev_hold(dev);

	/*
	 * Protocol specific setup.  Per the article, for IPv4 this ends up in
	 * arp_constructor(), which installs the output handler.
	 */
	if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
		rc = ERR_PTR(error);
		goto out_neigh_release;
	}

	/* Per the article, most devices do not set this hook. */
	if (dev->netdev_ops->ndo_neigh_construct) {
		error = dev->netdev_ops->ndo_neigh_construct(n);
		if (error < 0) {
			rc = ERR_PTR(error);
			goto out_neigh_release;
		}
	}

	/* Device specific setup (per the article, not defined for IPv4). */
	if (n->parms->neigh_setup &&
	    (error = n->parms->neigh_setup(n)) < 0) {
		rc = ERR_PTR(error);
		goto out_neigh_release;
	}

	/* Start with "confirmed" two BASE_REACHABLE_TIME periods in the past. */
	n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);

	write_lock_bh(&tbl->lock);
	nht = rcu_dereference_protected(tbl->nht,
					lockdep_is_held(&tbl->lock));

	/* Grow the hash table once there are more entries than buckets. */
	if (atomic_read(&tbl->entries) > (1 << nht->hash_shift))
		nht = neigh_hash_grow(tbl, nht->hash_shift + 1);

	/* Bucket index; the hash function is supplied by the table. */
	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift);

	if (n->parms->dead) {
		rc = ERR_PTR(-EINVAL);
		goto out_tbl_unlock;
	}

	/* Walk the bucket chain looking for an already existing entry. */
	for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
					    lockdep_is_held(&tbl->lock));
	     n1 != NULL;
	     n1 = rcu_dereference_protected(n1->next,
					    lockdep_is_held(&tbl->lock))) {
		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
			if (want_ref)
				neigh_hold(n1);
			rc = n1;
			/* The new entry is dropped on the way out. */
			goto out_tbl_unlock;
		}
	}

	n->dead = 0;
	if (want_ref)
		neigh_hold(n);
	/* Link the new entry in at the head of the bucket chain. */
	rcu_assign_pointer(n->next,
			   rcu_dereference_protected(nht->hash_buckets[hash_val],
						     lockdep_is_held(&tbl->lock)));
	rcu_assign_pointer(nht->hash_buckets[hash_val], n);
	write_unlock_bh(&tbl->lock);
	neigh_dbg(2, "neigh %p is created\n", n);
	rc = n;
out:
	return rc;
out_tbl_unlock:
	write_unlock_bh(&tbl->lock);
out_neigh_release:
	neigh_release(n);
	goto out;
}

/* neigh_alloc function */
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)arp_constructor函数
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
int entries;
entries = atomic_inc_return(&tbl->entries) - 1;
if (entries >= tbl->gc_thresh3 ||
(entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) {
if (!neigh_forced_gc(tbl) &&
entries >= tbl->gc_thresh3)
goto out_entries;
}
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
__skb_queue_head_init(&n->arp_queue); //初始化arp_queue队列
rwlock_init(&n->lock);
seqlock_init(&n->ha_lock);
n->updated = n->used = now;
n->nud_state = NUD_NONE; //状态为不可用
n->output = neigh_blackhole; //直接丢弃报文
seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms); //拷贝neigh_table中的parms
setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); //注册定时器
NEIGH_CACHE_STAT_INC(tbl, allocs);
n->tbl = tbl;
atomic_set(&n->refcnt, 1);
n->dead = 1;
out:
return n;
out_entries:
atomic_dec(&tbl->entries);
goto out;
}
static int arp_constructor(struct neighbour *neigh)邻居表项创建后,output函数为neigh_resolve_output,此时邻居子系统还不具备发送IP报文的能力,因为目的MAC地址还未获取,我们来看下dst_neigh_output函数实现:
{
__be32 addr = *(__be32 *)neigh->primary_key;
struct net_device *dev = neigh->dev;
struct in_device *in_dev;
struct neigh_parms *parms;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev); //通过net_device得到in_device
if (!in_dev) {
rcu_read_unlock();
return -EINVAL;
}
neigh->type = inet_addr_type(dev_net(dev), addr); //设置地址类型
parms = in_dev->arp_parms;
__neigh_parms_put(neigh->parms);
neigh->parms = neigh_parms_clone(parms);
rcu_read_unlock();
if (!dev->header_ops) { //基本上的网卡都会设置该值
neigh->nud_state = NUD_NOARP;
neigh->ops = &arp_direct_ops;
neigh->output = neigh_direct_output;
} else {
/* Good devices (checked by reading texts, but only Ethernet is
tested)
ARPHRD_ETHER: (ethernet, apfddi)
ARPHRD_FDDI: (fddi)
ARPHRD_IEEE802: (tr)
ARPHRD_METRICOM: (strip)
ARPHRD_ARCNET:
etc. etc. etc.
ARPHRD_IPDDP will also work, if author repairs it.
I did not it, because this driver does not work even
in old paradigm.
*/
if (neigh->type == RTN_MULTICAST) { //组播地址不需要arp
neigh->nud_state = NUD_NOARP;
arp_mc_map(addr, neigh->ha, dev, 1);
} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { //设备明确不需要arp或本地回环设备,不需要arp
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
} else if (neigh->type == RTN_BROADCAST ||
(dev->flags & IFF_POINTOPOINT)) { //广播或点对点,也不需要arp
neigh->nud_state = NUD_NOARP;
memcpy(neigh->ha, dev->broadcast, dev->addr_len);
}
if (dev->header_ops->cache) //eth_header_ops包含cache
neigh->ops = &arp_hh_ops;
else
neigh->ops = &arp_generic_ops;
if (neigh->nud_state & NUD_VALID)
neigh->output = neigh->ops->connected_output;
else
neigh->output = neigh->ops->output; //初始阶段为该值,即arp_hh_ops的neigh_resolve_output函数
}
return 0;
}
dst_neigh_output函数
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,neigh_resolve_output函数
struct sk_buff *skb)
{
const struct hh_cache *hh;
if (dst->pending_confirm) {
unsigned long now = jiffies;
dst->pending_confirm = 0;
/* avoid dirtying neighbour */
if (n->confirmed != now)
n->confirmed = now;
}
hh = &n->hh;
if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) //如果neighbour已连接且hh已设置
return neigh_hh_output(hh, skb);
else
return n->output(n, skb); //初始阶段调用此函数,此时为neigh_resolve_output函数
}
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)neigh_event_send函数
{
int rc = 0;
if (!neigh_event_send(neigh, skb)) { //发送arp请求,第一次返回true
int err;
struct net_device *dev = neigh->dev;
unsigned int seq;
if (dev->header_ops->cache && !neigh->hh.hh_len)
neigh_hh_init(neigh); //初始化MAC缓存值,目的是加速
do {
__skb_pull(skb, skb_network_offset(skb)); //常见情况,skb指向network header
seq = read_seqbegin(&neigh->ha_lock);
err = dev_hard_header(skb, dev, ntohs(skb->protocol), //封装MAC头
neigh->ha, NULL, skb->len);
} while (read_seqretry(&neigh->ha_lock, seq));
if (err >= 0)
rc = dev_queue_xmit(skb); //二层发送报文
else
goto out_kfree_skb;
}
out:
return rc;
out_kfree_skb:
rc = -EINVAL;
kfree_skb(skb);
goto out;
}
static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)__neigh_event_send函数
{
unsigned long now = jiffies;
if (neigh->used != now)
neigh->used = now;
if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))
return __neigh_event_send(neigh, skb); //发送arp请求
return 0;
}
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)neigh_probe函数
{
int rc;
bool immediate_probe = false;
write_lock_bh(&neigh->lock);
rc = 0;
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
if (neigh->dead)
goto out_dead;
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { //初始阶段进入此分支
if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
NEIGH_VAR(neigh->parms, APP_PROBES)) {
unsigned long next, now = jiffies;
atomic_set(&neigh->probes,
NEIGH_VAR(neigh->parms, UCAST_PROBES));
neigh->nud_state = NUD_INCOMPLETE; //设置表项状态为incomplete
neigh->updated = now;
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
HZ/2);
neigh_add_timer(neigh, next); //触发定时器,期望刷新表项状态和output函数,500毫秒后执行
immediate_probe = true;
} else {
neigh->nud_state = NUD_FAILED;
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
kfree_skb(skb);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
neigh_dbg(2, "neigh %p is delayed\n", neigh);
neigh->nud_state = NUD_DELAY;
neigh->updated = jiffies;
neigh_add_timer(neigh, jiffies +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
}
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
while (neigh->arp_queue_len_bytes + skb->truesize >
NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { //如果等待发送的报文数量超过设定值,丢弃报文
struct sk_buff *buff;
buff = __skb_dequeue(&neigh->arp_queue);
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb); //报文放入arp_queue队列中
neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
if (immediate_probe) //初始阶段,邻居项设置状态设置为incomplete,同时设置该变量为true
neigh_probe(neigh); //探测邻居表项
else
write_unlock(&neigh->lock);
local_bh_enable();
return rc;
out_dead:
if (neigh->nud_state & NUD_STALE)
goto out_unlock_bh;
write_unlock_bh(&neigh->lock);
kfree_skb(skb);
return 1;
}
static void neigh_probe(struct neighbour *neigh)从上述函数可以看到,报文并没有被发送出去,做了3个事情:1)发送了arp请求, 2)缓存了报文,3)启动定时器500毫秒后执行。 报文被丢弃了? 没有,其实报文是在neigh_update函数中被发送的,该函数的一个调用者是arp处理函数。 调用neigh_update函数后,neigh的output函数被改变,在这个之前,ouput函数仍然是neigh_resolve_output,如果是同一个目的IP,不会再次发送arp请求,仅仅把报文缓存起来,下面我们来看下neigh_update函数:
__releases(neigh->lock)
{
struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); //取出报文
/* keep skb alive even if arp_queue overflows */
if (skb)
skb = skb_copy(skb, GFP_ATOMIC); //拷贝skb
write_unlock(&neigh->lock);
neigh->ops->solicit(neigh, skb); //实际调用arp_solicit函数,该函数会发送arp请求
atomic_inc(&neigh->probes);
kfree_skb(skb);
}
neigh_update函数
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
-- flags
NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
if it is different.
NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
lladdr instead of overriding it
if it is different.
It also allows to retain current state
if lladdr is unchanged.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
NTF_ROUTER flag.
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
a router.
Caller MUST hold reference count on the entry.

Returns 0 on success, -EPERM when a NOARP/PERMANENT entry is touched
without NEIGH_UPDATE_F_ADMIN or the entry is dead, and -EINVAL when no
lladdr is supplied and the entry holds no valid one.
*/
int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
u32 flags)
{
u8 old;
int err;
int notify = 0;
struct net_device *dev;
int update_isrouter = 0;
write_lock_bh(&neigh->lock);
dev = neigh->dev;
old = neigh->nud_state;
err = -EPERM;
/* Only administrative changes may touch NOARP / PERMANENT entries. */
if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
if (neigh->dead)
goto out;
/* New state is not a valid one: stop the timer and just record the state. */
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
neigh_suspect(neigh);
neigh->nud_state = new;
err = 0;
notify = old & NUD_VALID;
if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
(new & NUD_FAILED)) {
neigh_invalidate(neigh);
notify = 1;
}
goto out;
}
/* Compare new lladdr with cached one */
if (!dev->addr_len) {
/* First case: device needs no address. */
lladdr = neigh->ha;
} else if (lladdr) {
/* The second case: if something is already cached
and a new address is proposed:
- compare new & old
- if they are different, check override flag
*/
if ((old & NUD_VALID) &&
!memcmp(lladdr, neigh->ha, dev->addr_len))
lladdr = neigh->ha;
} else {
/* No address is supplied; if we know something,
use it, otherwise discard the request.
*/
err = -EINVAL;
if (!(old & NUD_VALID))
goto out;
lladdr = neigh->ha;
}
/* From here on, lladdr == neigh->ha means "address unchanged". */
if (new & NUD_CONNECTED)
neigh->confirmed = jiffies;
neigh->updated = jiffies;
/* If entry was valid and address is not changed,
do not change entry state, if new one is STALE.
*/
err = 0;
update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
if (old & NUD_VALID) {
if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) {
update_isrouter = 0;
if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
(old & NUD_CONNECTED)) {
lladdr = neigh->ha;
new = NUD_STALE;
} else
goto out;
} else {
if (lladdr == neigh->ha && new == NUD_STALE &&
((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
(old & NUD_CONNECTED))
)
new = old;
}
}
/* State change: re-arm the timer if the new state uses one. */
if (new != old) {
neigh_del_timer(neigh);
if (new & NUD_IN_TIMER)
neigh_add_timer(neigh, (jiffies +
((new & NUD_REACHABLE) ?
neigh->parms->reachable_time :
0)));
neigh->nud_state = new;
notify = 1;
}
/* Address change: copy it in under the ha_lock seqlock. */
if (lladdr != neigh->ha) {
write_seqlock(&neigh->ha_lock);
memcpy(&neigh->ha, lladdr, dev->addr_len);
write_sequnlock(&neigh->ha_lock);
neigh_update_hhs(neigh);
if (!(new & NUD_CONNECTED))
neigh->confirmed = jiffies -
(NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
notify = 1;
}
if (new == old)
goto out;
if (new & NUD_CONNECTED)
neigh_connect(neigh); //per the article, switches the output handler to neigh_connected_output
else
neigh_suspect(neigh);
if (!(old & NUD_VALID)) { //the old state was not valid, so send the packets queued during resolution
struct sk_buff *skb;
/* Again: avoid dead loop if something went wrong */
while (neigh->nud_state & NUD_VALID &&
(skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { //take one queued packet
struct dst_entry *dst = skb_dst(skb);
struct neighbour *n2, *n1 = neigh;
/* neigh->lock is dropped while the packet is output and re-taken afterwards. */
write_unlock_bh(&neigh->lock);
rcu_read_lock();
/* Why not just use 'neigh' as-is? The problem is that
* things such as shaper, eql, and sch_teql can end up
* using alternative, different, neigh objects to output
* the packet in the output path. So what we need to do
* here is re-lookup the top-level neigh in the path so
* we can reinject the packet there.
*/
n2 = NULL;
if (dst) {
n2 = dst_neigh_lookup_skb(dst, skb);
if (n2)
n1 = n2;
}
n1->output(n1, skb); //call the neighbour's output handler; per the article it is the connected handler by now
if (n2)
neigh_release(n2);
rcu_read_unlock();
write_lock_bh(&neigh->lock);
}
__skb_queue_purge(&neigh->arp_queue); //discard whatever is still queued
neigh->arp_queue_len_bytes = 0;
}
out:
if (update_isrouter) {
neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
(neigh->flags | NTF_ROUTER) :
(neigh->flags & ~NTF_ROUTER);
}
write_unlock_bh(&neigh->lock);
if (notify)
neigh_update_notify(neigh);
return err;
}
/*
 * Article note: at this point the overall ARP flow is basically clear.  Some
 * details still remain to be sorted out, for example why neigh_update()
 * re-looks-up the neighbour entry when sending the queued packets instead of
 * using the current neigh.
 */