DPVS Source Code Analysis (Continued)

Date: 2022-06-06

Preface

In the previous article (https://cloud.tencent.com/developer/article/1180256?s=original-sharing) we covered some of the initialization done during DPVS startup, and how those initialization steps are chained together in the data-plane threads. In this article I will focus on the main logic of the data-plane thread: what the netif_loop thread actually does.

The Data-Plane Thread

As mentioned before, the job functions registered in netif_loop are invoked in the following order: lcore_job_recv_fwd -> lcore_job_xmit -> lcore_job_timer_manage -> slave_lcore_loop_func -> ipv4_frag_job -> neigh_process_ring

Let's start with lcore_job_recv_fwd; this article mainly digs into what happens beneath this function.

1. lcore_job_recv_fwd

static void lcore_job_recv_fwd(void *arg)
{
    int i, j;
    portid_t pid;
    lcoreid_t cid;
    struct netif_queue_conf *qconf;

    cid = rte_lcore_id();
    assert(LCORE_ID_ANY != cid);

    for (i = 0; i < lcore_conf[lcore2index[cid]].nports; i++) {
        pid = lcore_conf[lcore2index[cid]].pqs[i].id;
        assert(pid < rte_eth_dev_count());

        for (j = 0; j < lcore_conf[lcore2index[cid]].pqs[i].nrxq; j++) {
            qconf = &lcore_conf[lcore2index[cid]].pqs[i].rxqs[j];

            // fetch ARP packets queued on the per-lcore arp_ring
            lcore_process_arp_ring(qconf,cid);
            qconf->len = netif_rx_burst(pid, qconf);

            lcore_stats_burst(&lcore_stats[cid], qconf->len);

            lcore_process_packets(qconf, qconf->mbufs, cid, qconf->len, 1);
            kni_send2kern_loop(pid, qconf);
        }
    }
}

Looking at this function, we should first get familiar with the lcore_conf variable, which is defined as follows:

/* worker configuration array */
static struct netif_lcore_conf lcore_conf[NETIF_MAX_LCORES + 1];
/*
 *  lcore conf
 *  Multiple ports may be processed by a lcore.
 *  (i.e. one lcore may handle multiple NICs)
 */
struct netif_lcore_conf
{
    lcoreid_t id; 
    /* nic number of this lcore to process */
    int nports;
    /* port list of this lcore to process */
    struct netif_port_conf pqs[NETIF_MAX_PORTS];
} __rte_cache_aligned;
/*
 * RX/TX port conf for lcore.
 * Multiple queues of a port may be processed by a lcore.
 * (i.e. one lcore may also handle multiple queues of a single NIC)
 */
struct netif_port_conf
{
    portid_t id; 
    /* rx/tx queues for this lcore to process*/
    int nrxq;
    int ntxq;
    /* rx/tx queue list for this lcore to process */
    struct netif_queue_conf rxqs[NETIF_MAX_QUEUES];
    struct netif_queue_conf txqs[NETIF_MAX_QUEUES];
} __rte_cache_aligned;

The definition of netif_lcore_conf can look a bit odd at first: "port" would normally mean a TCP/UDP port, but in DPVS a port usually refers to a NIC. This structure is populated by parsing the dpvs.conf configuration file. One lcore can handle multiple RX queues on multiple NICs, which is why lcore_job_recv_fwd uses a two-level loop to read data from the NIC queues; see the configuration sketch below. Once packets are pulled in, lcore_process_packets is called to process them.
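
For reference, the lcore-to-port/queue mapping comes from the worker_defs section of dpvs.conf. The snippet below is a simplified excerpt modeled on the dpvs.conf.sample files shipped with DPVS; the exact field names and layout vary between versions, so treat it as illustrative only:

worker_defs {
    <init> worker cpu0 {
        type    master
        cpu_id  0
    }
    <init> worker cpu1 {
        type    slave
        cpu_id  1
        port    dpdk0 {
            rx_queue_ids     0
            tx_queue_ids     0
        }
    }
}

Conceptually, each slave worker block fills one slot of lcore_conf: the port sub-blocks become pqs[] entries, and the rx/tx queue id lists become the rxqs[]/txqs[] arrays iterated by the two-level loop above.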

2. lcore_process_packets

static void lcore_process_packets(struct netif_queue_conf *qconf, struct rte_mbuf **mbufs,
                      lcoreid_t cid, uint16_t count, bool pretetch)
{
  
    .........................
    /* prefetch packets */
    /* prefetch from memory into cache ahead of time to speed up processing */
    if (pretetch) {
        for (t = 0; t < qconf->len && t < NETIF_PKT_PREFETCH_OFFSET; t++)
            rte_prefetch0(rte_pktmbuf_mtod(qconf->mbufs[t], void *));
    }

    /* L2 filter */
    for (i = 0; i < count; i++) {
        struct rte_mbuf *mbuf = mbufs[i];
        struct netif_port *dev = netif_port_get(mbuf->port);

        if (unlikely(!dev)) {
            rte_pktmbuf_free(mbuf);
            lcore_stats[cid].dropped++;
            continue;
        }
        if (dev->type == PORT_TYPE_BOND_SLAVE) {
            dev = dev->bond->slave.master;
            mbuf->port = dev->id;
        }

        if (pretetch && (t < qconf->len)) {
            rte_prefetch0(rte_pktmbuf_mtod(qconf->mbufs[t], void *));
            t++;
        }

        eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
        /* reuse mbuf.packet_type, it was RTE_PTYPE_XXX */
        mbuf->packet_type = eth_type_parse(eth_hdr, dev);

        /*
         * In NETIF_PORT_FLAG_FORWARD2KNI mode.
         * All packets received are deep copied and sent to  KNI
         * for the purpose of capturing forwarding packets.Since the
         * rte_mbuf will be modified in the following procedure,
         * we should use mbuf_copy instead of rte_pktmbuf_clone.
         */
        if (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) {
            if (likely(NULL != (mbuf_copied = mbuf_copy(mbuf,
                                pktmbuf_pool[dev->socket]))))
                kni_ingress(mbuf_copied, dev, qconf);
            else
                RTE_LOG(WARNING, NETIF, "%s: Failed to copy mbuf\n",
                        __func__);
        }

        /*
         * do not drop pkt to other hosts (ETH_PKT_OTHERHOST)
         * since virtual devices may have different MAC with
         * underlying device.
         */

        /*
         * handle VLAN
         * if HW offload vlan strip, it's still need vlan module
         * to act as VLAN filter.
         */
        if (eth_hdr->ether_type == htons(ETH_P_8021Q) ||
            mbuf->ol_flags & PKT_RX_VLAN_STRIPPED) {

            if (vlan_rcv(mbuf, netif_port_get(mbuf->port)) != EDPVS_OK) {
                rte_pktmbuf_free(mbuf);
                lcore_stats[cid].dropped++;
                continue;
            }

            dev = netif_port_get(mbuf->port);
            if (unlikely(!dev)) {
                rte_pktmbuf_free(mbuf);
                lcore_stats[cid].dropped++;
                continue;
            }

            eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
        }
        /* handler should free mbuf */
        netif_deliver_mbuf(mbuf, eth_hdr->ether_type, dev, qconf,
                           (dev->flag & NETIF_PORT_FLAG_FORWARD2KNI) ? true:false,
                           cid, pkts_from_ring);

        lcore_stats[cid].ibytes += mbuf->pkt_len;
        lcore_stats[cid].ipackets++;
    }
}

This function is the layer-2 filter, i.e. link-layer filtering. Once the link-layer handling is done, it calls netif_deliver_mbuf, and from there we are at the IP layer.
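
A quick aside on the "L2 filter" part: classifying a frame essentially means looking at its destination MAC (addressed to us, broadcast, multicast, or some other host), which is what eth_type_parse decides based on the receiving device. The code below is not DPVS code, just a minimal self-contained C sketch of that kind of classification:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum pkt_kind { PKT_HOST, PKT_BROADCAST, PKT_MULTICAST, PKT_OTHERHOST };

/* Classify a frame by destination MAC, the way an L2 filter typically does. */
static enum pkt_kind classify_dst_mac(const uint8_t dst[6], const uint8_t dev_mac[6])
{
    static const uint8_t bcast[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};

    if (memcmp(dst, dev_mac, 6) == 0)
        return PKT_HOST;                 /* addressed to this device */
    if (memcmp(dst, bcast, 6) == 0)
        return PKT_BROADCAST;
    if (dst[0] & 0x01)                   /* group bit set => multicast */
        return PKT_MULTICAST;
    return PKT_OTHERHOST;                /* unicast meant for someone else */
}

int main(void)
{
    const uint8_t dev_mac[6] = {0x52, 0x54, 0x00, 0x12, 0x34, 0x56};
    const uint8_t dst[6]     = {0x01, 0x00, 0x5e, 0x00, 0x00, 0x01}; /* IPv4 multicast MAC */

    printf("kind = %d\n", classify_dst_mac(dst, dev_mac));
    return 0;
}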

3. netif_deliver_mbuf

static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf,
                                     uint16_t eth_type,
                                     struct netif_port *dev,
                                     struct netif_queue_conf *qconf,
                                     bool forward2kni,
                                     lcoreid_t cid,
                                     bool pkts_from_ring)
{
    struct pkt_type *pt;
    int err;
    uint16_t data_off;

    assert(mbuf->port <= NETIF_MAX_PORTS);
    assert(dev != NULL);

    pt = pkt_type_get(eth_type, dev);

   ...............................

    /* hand off to the upper-layer protocol handler (e.g. ipv4_rcv); only IP and ARP handlers are registered */
    err = pt->func(mbuf, dev);


    return EDPVS_OK;
}

The key call in this function is pt->func(mbuf, dev). At the IP layer DPVS registers only two pkt_type handlers: ip4_pkt_type and arp_pkt_type. ip4_pkt_type is registered in ipv4_init; as mentioned in the previous article, ipv4_init also calls ipv4_frag_init to register a job of type NETIF_LCORE_JOB_SLOW. arp_pkt_type is registered in arp_init, which, as also mentioned in the previous article, registers a NETIF_LCORE_JOB_SLOW job as well.

static struct pkt_type ip4_pkt_type = {
    //.type       = rte_cpu_to_be_16(ETHER_TYPE_IPv4),
    .func       = ipv4_rcv,
    .port       = NULL,
};

static struct pkt_type arp_pkt_type = {
    //.type       = rte_cpu_to_be_16(ETHER_TYPE_ARP),
    .func       = neigh_resolve_input,
    .port       = NULL,
};
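
For completeness, here is roughly how such a handler gets hooked in. This is a simplified sketch based on my reading of ipv4_init in the DPVS tree (error handling trimmed), so the exact lines may differ from your version:

/* inside ipv4_init(): fill in the ether type and register the handler with netif */
ip4_pkt_type.type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
err = netif_register_pkt(&ip4_pkt_type);   /* netif will now dispatch IPv4 frames to ipv4_rcv */
if (err != EDPVS_OK)
    return err;

arp_init does the same for arp_pkt_type with ETHER_TYPE_ARP, so netif_deliver_mbuf can later look the handler up by ether type via pkt_type_get.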

So for IPv4 packets, pt->func actually resolves to ipv4_rcv. Next, let's see what ipv4_rcv does.

4. ipv4_rcv

static int ipv4_rcv(struct rte_mbuf *mbuf, struct netif_port *port)
{
  
    /* mainly error checks on the header and the like */
    ......................
    return INET_HOOK(INET_HOOK_PRE_ROUTING, mbuf, port, NULL, ipv4_rcv_fin);

csum_error:
    IP4_INC_STATS(csumerrors);
inhdr_error:
    IP4_INC_STATS(inhdrerrors);
drop:
    rte_pktmbuf_free(mbuf);
    return EDPVS_INVPKT;
}

After a few error checks, ipv4_rcv calls INET_HOOK. Let's see what this hook does.

5. INET_HOOK

int INET_HOOK(unsigned int hook, struct rte_mbuf *mbuf,
        struct netif_port *in, struct netif_port *out,
        int (*okfn)(struct rte_mbuf *mbuf))
{
    struct list_head *hook_list;
    struct inet_hook_ops *ops;
    struct inet_hook_state state;
    int verdict = INET_ACCEPT;

    state.hook = hook;
    hook_list = &inet_hooks[hook];

    ......................

    ops = list_entry(hook_list, struct inet_hook_ops, list);

    if (!list_empty(hook_list)) {
        verdict = INET_ACCEPT;
        list_for_each_entry_continue(ops, hook_list, list) {
repeat:
            verdict = ops->hook(ops->priv, mbuf, &state); /* for PRE_ROUTING this runs dp_vs_pre_routing, then dp_vs_in */
            if (verdict != INET_ACCEPT) {
                if (verdict == INET_REPEAT)
                    goto repeat;
                break;
            }
        }
    }

    ..............
}

The inet_hooks lists are registered in dp_vs_init. For the INET_HOOK_PRE_ROUTING hook number there are two entries:

static struct inet_hook_ops dp_vs_ops[] = {
    {
        .hook       = dp_vs_in,
        .hooknum    = INET_HOOK_PRE_ROUTING,
        .priority   = 100,
    },
    {
        .hook       = dp_vs_pre_routing,
        .hooknum    = INET_HOOK_PRE_ROUTING,
        .priority   = 99,
    },
};
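
How do these two entries end up on the inet_hooks list? Roughly, dp_vs_init registers the dp_vs_ops array through the IPv4 hook API, and hooks are kept sorted by ascending priority. The sketch below reflects my reading of the source (I am assuming the helper is named ipv4_register_hooks); exact names and error handling may differ in your version:

/* sketch of the registration done in dp_vs_init */
err = ipv4_register_hooks(dp_vs_ops, sizeof(dp_vs_ops) / sizeof(dp_vs_ops[0]));
if (err != EDPVS_OK) {
    RTE_LOG(ERR, IPVS, "%s: fail to register PRE_ROUTING hooks\n", __func__);
    return err;
}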

At this point we know that, for INET_HOOK_PRE_ROUTING, INET_HOOK walks the hook list in ascending priority order, so dp_vs_pre_routing (priority 99) runs before dp_vs_in (priority 100). Let's now trace into dp_vs_in and take a look.

6. dp_vs_in

static int dp_vs_in(void *priv, struct rte_mbuf *mbuf, 
                    const struct inet_hook_state *state)
{
   ..........................................................

    /* packet belongs to existing connection ? */
    conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false);

    /* for TCP, for example, conn_sched resolves to tcp_conn_sched */
    if (unlikely(!conn)) {
        /* try schedule RS and create new connection */
        if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
            /* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
            return verdict;
        }

        /* only SNAT triggers connection by inside-outside traffic. */
        if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
            dir = DPVS_CONN_DIR_OUTBOUND;
        else
            dir = DPVS_CONN_DIR_INBOUND;
    }

  ...................
    /* xmit_inbound forwards the packet to the RS; xmit_outbound sends the reply back out */
    /* holding the conn, need a "put" later. */
    if (dir == DPVS_CONN_DIR_INBOUND)
        return xmit_inbound(mbuf, prot, conn);
    else
        return xmit_outbound(mbuf, prot, conn);
}

I have also trimmed a lot out of dp_vs_in. Checks aside, its core logic is to decide whether the IP packet belongs to an existing connection. If not, prot->conn_sched schedules an RS and creates a new connection; if it does, the packet is forwarded directly: xmit_inbound forwards it to the RS, while xmit_outbound sends it back out. Let's dig deeper into prot->conn_sched.

The DPVS source registers three struct dp_vs_proto instances: TCP, UDP and ICMP. They are registered in dp_vs_proto_init (main -> dpvs_init -> dp_vs_proto_init). Let's take a look at dp_vs_proto_tcp:

struct dp_vs_proto dp_vs_proto_tcp = {
    .name               = "TCP",
    .proto              = IPPROTO_TCP,
    .init               = tcp_init,
    .exit               = tcp_exit,
    .conn_sched         = tcp_conn_sched,
    .conn_lookup        = tcp_conn_lookup,
    .conn_expire        = tcp_conn_expire,
    .fnat_in_handler    = tcp_fnat_in_handler,
    .fnat_out_handler   = tcp_fnat_out_handler,
    .snat_in_handler    = tcp_snat_in_handler,
    .snat_out_handler   = tcp_snat_out_handler,
    .state_trans        = tcp_state_trans,
};

The conn_sched of dp_vs_proto_tcp is therefore tcp_conn_sched, which is what actually establishes the connection. Inside it, dp_vs_schedule is called, and that is where the so-called full-NAT vs. SNAT choice is made. That about covers layer-3 forwarding; if I get the chance, I will introduce some of the other modules later.

Conclusion

In these two articles we have walked through the main logic of DPVS from startup to layer-3 forwarding. I wrote them because I will soon be working on the GBN project, and before joining it I needed to study the DPVS source, so these notes record that learning process. Personally, I find the DPVS source quite interesting to read. To improve performance it relies on many instruction-level optimizations; for instance, the code quoted above is full of prefetch calls and the likely/unlikely branch hints. Prefetching and branch-prediction hints show up all over DPVS.
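
To make that concrete, here is a tiny self-contained example (not DPVS code) of those two techniques: likely()/unlikely() branch hints built on __builtin_expect, as DPDK's rte_branch_prediction.h does, and software prefetching in the spirit of rte_prefetch0():

#include <stdio.h>

/* same idea as the likely()/unlikely() macros used throughout DPVS/DPDK */
#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

static long sum_valid(const int *v, int n)
{
    long sum = 0;
    for (int i = 0; i < n; i++) {
        /* prefetch a later element while working on the current one,
         * the same pattern as rte_prefetch0() in lcore_process_packets */
        if (i + 8 < n)
            __builtin_prefetch(&v[i + 8]);
        if (unlikely(v[i] < 0))   /* tell the compiler the error path is rare */
            continue;
        sum += v[i];
    }
    return sum;
}

int main(void)
{
    int v[16];
    for (int i = 0; i < 16; i++)
        v[i] = i;
    printf("sum = %ld\n", sum_valid(v, 16));
    return 0;
}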