/*
 * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 * Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include <config.h>
#include "dpif-doca.h"
#include "dpif-doca-private.h"
#include "dpif-doca-private-dfc.h"

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <net/if.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>

#include "batch.h"
#include "bitmap.h"
#include "ccmap.h"
#include "cmap.h"
#include "conntrack.h"
#include "conntrack-offload.h"
#include "conntrack-tp.h"
#include "coverage.h"
#include "ct-dpif.h"
#include "csum.h"
#include "dp-packet.h"
#include "dpif.h"
#include "dpif-doca-lookup.h"
#include "dpif-doca-private-extract.h"
#include "dpif-netdev-perf.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "fat-rwlock.h"
#include "flow.h"
#include "histogram.h"
#include "hmapx.h"
#include "id-pool.h"
#include "id-fpool.h"
#include "ipf.h"
#include "mov-avg.h"
#include "mpsc-queue.h"
#include "netdev.h"
#include "netdev-offload.h"
#include "netdev-offload-doca.h"
#include "netdev-offload-provider.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netdev-doca.h"
#include "netlink.h"
#include "odp-execute.h"
#include "odp-util.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-parse.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/ofpbuf.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-doca.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "packets.h"
#include "openvswitch/poll-loop.h"
#include "pvector.h"
#include "random.h"
#include "seq.h"
#include "smap.h"
#include "sset.h"
#include "timeval.h"
#include "tnl-neigh-cache.h"
#include "tnl-ports.h"
#include "unixctl.h"
#include "util.h"
#include "uuid.h"

VLOG_DEFINE_THIS_MODULE(dpif_doca_ext);

extern struct odp_support dp_doca_support;

/* Packet dump switch: written by dp_doca_dump_packets_toggle() and read by
 * dp_doca_read_dump_packets_enabled(), both using relaxed atomics. */
static atomic_bool dump_packets_enabled = false;
/* Core count cached by dpif_doca_set_n_pmd_threads() and returned by
 * dpif_doca_get_n_pmd_threads(). */
static unsigned int n_pmd_threads;

/* A value strictly greater than MAX_OFFLOAD_THREAD_NB. */
#define INVALID_OFFLOAD_THREAD_NB (MAX_OFFLOAD_THREAD_NB + 1)

/* Coverage counters named after offload latency thresholds.  They are not
 * incremented in the part of the file visible here. */
COVERAGE_DEFINE_WARN(flow_offload_200ms_latency);
COVERAGE_DEFINE(ct_offload_30us_latency);
COVERAGE_DEFINE(ct_offload_50us_latency);
COVERAGE_DEFINE_WARN(ct_offload_100us_latency);

#define DP_OFFLOAD_UPKEEP_PERIOD_MS (256)
/* Number of max-backoff to roughly reach the upkeep period. */
#define DP_OFFLOAD_UPKEEP_N_BACKOFF \
    (DP_OFFLOAD_UPKEEP_PERIOD_MS / DP_NETDEV_OFFLOAD_BACKOFF_MAX)
/* The backoff count above must be a power of two; enforced at build time. */
BUILD_ASSERT_DECL(IS_POW2(DP_OFFLOAD_UPKEEP_N_BACKOFF));

/* Initial value of 'max_recirc_depth'. */
#define DEFAULT_MAX_RECIRC_DEPTH 8
unsigned int max_recirc_depth = DEFAULT_MAX_RECIRC_DEPTH;

/* Returns the core dump describing where PMD threads should run.
 *
 * A non-empty 'pmd_cmask' is passed to ovs_numa_dump_cores_with_cmask().
 * A NULL or empty 'pmd_cmask' selects the default obtained from
 * ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS).
 *
 * The dump is handed to the caller; dpif_doca_set_n_pmd_threads() releases
 * its copy with ovs_numa_dump_destroy(). */
struct ovs_numa_dump *
dp_doca_pmd_cmask2cores(const char *pmd_cmask)
{
    bool has_mask = pmd_cmask != NULL && pmd_cmask[0] != '\0';

    return has_mask
           ? ovs_numa_dump_cores_with_cmask(pmd_cmask)
           : ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
}

/* Returns the PMD core count last stored by dpif_doca_set_n_pmd_threads().
 * The value is 0 until that function has been called, as 'n_pmd_threads' is
 * a zero-initialized static. */
unsigned int
dpif_doca_get_n_pmd_threads(void)
{
    return n_pmd_threads;
}

/* Recomputes the cached PMD thread count from 'pmd_cmask'.
 *
 * The core set is resolved with dp_doca_pmd_cmask2cores(), counted, then
 * released; only the count is kept. */
void
dpif_doca_set_n_pmd_threads(const char *pmd_cmask)
{
    struct ovs_numa_dump *cores = dp_doca_pmd_cmask2cores(pmd_cmask);

    n_pmd_threads = ovs_numa_dump_count(cores);
    ovs_numa_dump_destroy(cores);
}

/* unixctl handler controlling the packet dump switch.
 *
 * Without an argument (argc == 1) the dump is turned on.  Otherwise
 * 'argv[1]' must be "on" or "off"; any other value is answered with an
 * error and leaves the switch untouched.  On success the reply is "ON" or
 * "OFF" according to the new state. */
void
dp_doca_dump_packets_toggle(struct unixctl_conn *conn, int argc,
                            const char *argv[], void *aux OVS_UNUSED)
{
    bool enable;

    if (argc == 1 || !strcmp(argv[1], "on")) {
        enable = true;
    } else if (!strcmp(argv[1], "off")) {
        enable = false;
    } else {
        unixctl_command_reply_error(conn, "Invalid parameters");
        return;
    }

    atomic_store_relaxed(&dump_packets_enabled, enable);
    unixctl_command_reply(conn, enable ? "ON" : "OFF");
}

/* Stores the current state of the packet dump switch in '*flag'.
 * The read uses relaxed atomic ordering. */
void
dp_doca_read_dump_packets_enabled(bool *flag)
{
    atomic_read_relaxed(&dump_packets_enabled, flag);
}

/* Iterator step over the active entries of 'dp_offload_threads'.
 *
 * With 'start' NULL the scan begins at NETDEV_OFFLOAD_THREAD_MAIN, or at the
 * entry right after it when 'include_main' is false.  Otherwise the scan
 * begins at the entry following 'start' ('include_main' is then ignored).
 *
 * Returns the first entry found whose 'active' flag is set, or NULL when the
 * end of the array is reached.  If 'tid' is non-NULL it receives the array
 * index of the returned entry; it is left untouched when NULL is returned. */
struct dp_offload_thread *
dp_doca_offload_thread_next(struct dp_offload_thread *start,
                            unsigned int *tid, bool include_main)
{
    struct dp_offload_thread *const end =
        &dp_offload_threads[ARRAY_SIZE(dp_offload_threads)];
    struct dp_offload_thread *it;

    if (start) {
        it = start + 1;
    } else {
        unsigned int first = NETDEV_OFFLOAD_THREAD_MAIN;

        first += include_main ? 0 : 1;
        it = &dp_offload_threads[first];
    }

    for (; it != end; it++) {
        bool active;

        atomic_read(&it->active, &active);
        if (!active) {
            continue;
        }
        if (tid) {
            *tid = (unsigned int) (ptrdiff_t) (it - dp_offload_threads);
        }
        return it;
    }

    return NULL;
}

/* Queues 'offload' on 'thread' and increments the thread's count of
 * enqueued requests (decremented again when a request is popped in
 * pmd_thread_offload_process()).
 *
 * dp_doca_offload_init() runs before every insertion; presumably it is a
 * cheap no-op once offloading has been set up -- TODO confirm. */
void
dp_doca_offload_thread_enqueue(struct dp_offload_thread *thread,
                               struct dp_offload_thread_item *offload)
{
    dp_doca_offload_init();

    /* Insert first, count second: the counter is only a statistic. */
    mpsc_queue_insert(&thread->offload_queue, &offload->node);
    atomic_count_inc64(&thread->enqueued_offload);
}

/* Default for 'offload_ct_add_queue_size', used when the
 * "hw-offload-ct-add-queue-size" option is absent or set to 0. */
#define CT_ADD_DEFAULT_QUEUE_SIZE 200000
/* Configured by dpif_doca_set_static_config_ct_add_queue_size(). */
static unsigned int offload_ct_add_queue_size = CT_ADD_DEFAULT_QUEUE_SIZE;

/* Records the association between 'packet' and 'flow', together with the
 * packet's 'tcp_flags', in entry 'index' of the 'flow_map' array.
 *
 * The caller must make sure 'index' is within the bounds of 'flow_map'. */
void
packet_enqueue_to_flow_map(struct dp_packet *packet,
                           struct dp_doca_flow *flow,
                           uint16_t tcp_flags,
                           struct dp_packet_flow_map *flow_map,
                           size_t index)
{
    struct dp_packet_flow_map *slot = flow_map + index;

    slot->packet = packet;
    slot->flow = flow;
    slot->tcp_flags = tcp_flags;
}

static void
pmd_thread_offload_netdevs(struct dp_doca_pmd_thread *pmd, odp_port_t port_nos[CT_DIR_NUM],
                           struct netdev *netdevs[CT_DIR_NUM]);

/* Offloads the connections in 'conns' through the flow API of 'netdevs'.
 *
 * 'netdevs[CT_DIR_INIT]' is mandatory; 'netdevs[CT_DIR_REP]' may be NULL.
 *
 * Returns:
 *   - ENODEV if the initiator netdev is missing, or if either netdev
 *     currently has no flow API (e.g. the port is being torn down);
 *   - EOPNOTSUPP if the flow API has no 'conns_add' callback;
 *   - EINVAL if both netdevs have a flow API but not the same one;
 *   - otherwise, the value returned by the 'conns_add' callback. */
static int
dp_doca_netdev_conns_add(struct netdev *netdevs[CT_DIR_NUM], struct batch *conns)
{
    const struct netdev_flow_api *flow_apis[CT_DIR_NUM] = { NULL, NULL };

    if (!netdevs[CT_DIR_INIT]) {
        return ENODEV;
    }

    flow_apis[CT_DIR_INIT] = ovsrcu_get(const struct netdev_flow_api *,
                                        &netdevs[CT_DIR_INIT]->flow_api);
    if (!flow_apis[CT_DIR_INIT]) {
        return ENODEV;
    }

    if (!flow_apis[CT_DIR_INIT]->conns_add) {
        return EOPNOTSUPP;
    }

    if (netdevs[CT_DIR_REP]) {
        flow_apis[CT_DIR_REP] = ovsrcu_get(const struct netdev_flow_api *,
                                           &netdevs[CT_DIR_REP]->flow_api);
        /* Do NULL-check before doing diff-check, so that an EINVAL truly
         * signals that a connection offload was attempted on two very
         * different ports, which is a logical error that should not have
         * happened.
         * The 'flow_api' is set NULL during the normal lifecycle of a port,
         * and it is expected that at some point we will read differing
         * values as one port is destroyed while the other is not.
         */
        if (!flow_apis[CT_DIR_REP]) {
            return ENODEV;
        }
        if (flow_apis[CT_DIR_INIT] != flow_apis[CT_DIR_REP]) {
            return EINVAL;
        }
    }

    /* Without DOCA, run the offload upkeep of each port before inserting. */
    if (!ovs_doca_initialized()) {
        netdev_offload_upkeep(netdevs[CT_DIR_INIT], false);
        if (netdevs[CT_DIR_REP]) {
            netdev_offload_upkeep(netdevs[CT_DIR_REP], false);
        }
    }

    return flow_apis[CT_DIR_INIT]->conns_add(netdevs, conns);
}

static int
dp_doca_netdev_conns_del(struct netdev *netdevs[CT_DIR_NUM], struct batch *conns)
{
    const struct netdev_flow_api *flow_apis[CT_DIR_NUM] = { NULL, NULL };

    if (!netdevs[CT_DIR_INIT] && !netdevs[CT_DIR_REP]) {
        /* No port remains, assume deletion to be successful. */
        return 0;
    }

    for (int dir = 0; dir < CT_DIR_NUM; dir++) {
        if (netdevs[dir]) {
            flow_apis[dir] = ovsrcu_get(const struct netdev_flow_api *,
                                        &netdevs[dir]->flow_api);
        }
    }

    if (flow_apis[CT_DIR_INIT] && flow_apis[CT_DIR_REP] &&
        flow_apis[CT_DIR_INIT] != flow_apis[CT_DIR_REP]) {
        return EINVAL;
    }

    if (!flow_apis[CT_DIR_INIT] && flow_apis[CT_DIR_REP]) {
        flow_apis[CT_DIR_INIT] = flow_apis[CT_DIR_REP];
    }

    if (!flow_apis[CT_DIR_INIT]) {
        return ENODEV;
    }

    if (!flow_apis[CT_DIR_INIT]->conns_del) {
        return EOPNOTSUPP;
    }

    if (!ovs_doca_initialized()) {
        for (int dir = 0; dir < CT_DIR_NUM; dir++) {
            if (netdevs[dir]) {
                netdev_offload_upkeep(netdevs[dir], false);
            }
        }
    }

    return flow_apis[CT_DIR_INIT]->conns_del(netdevs, conns);
}

/* Queries the offload statistics of 'conn' through the flow API of the
 * first entry of 'netdevs' that currently has one.
 *
 * Returns ENODEV if no netdev or no flow API is available, EOPNOTSUPP if
 * the flow API has no 'conn_stats' callback, and otherwise the value
 * returned by that callback. */
static int
dp_doca_netdev_conn_stats(struct netdev *netdevs[CT_DIR_NUM], struct conn *conn,
                          struct dpif_flow_stats *stats, struct dpif_flow_attrs *attrs,
                          long long int now)
{
    const struct netdev_flow_api *flow_api = NULL;
    int dir;

    if (!netdevs[CT_DIR_INIT] && !netdevs[CT_DIR_REP]) {
        /* No port remains, connection is already removed. */
        return ENODEV;
    }

    /* Full connection offload already enforced that both netdevs use the
     * same offload API, so either one will do.  A netdev that is NULL or
     * whose API was cleared (port being destroyed) is skipped. */
    for (dir = 0; dir < CT_DIR_NUM && !flow_api; dir++) {
        if (netdevs[dir]) {
            flow_api = ovsrcu_get(const struct netdev_flow_api *,
                                  &netdevs[dir]->flow_api);
        }
    }

    if (!flow_api) {
        return ENODEV;
    }

    return flow_api->conn_stats
           ? flow_api->conn_stats(netdevs, conn, stats, attrs, now)
           : EOPNOTSUPP;
}

/* Adds or deletes the hardware offload of every connection in 'conns',
 * according to 'op' (DP_NETDEV_FLOW_OFFLOAD_OP_ADD or _DEL), and logs the
 * outcome.
 *
 * The ports are taken from the first connection of the batch, so all
 * connections in 'conns' are expected to share the same port pair.
 *
 * 'pmd' is the polling thread context of the caller, or NULL.  With a
 * 'pmd', netdevs come from its rx port cache (falling back on
 * netdev_ports_get_short()) and no reference or lock is taken.  Without
 * one, netdev references are taken and the datapath port lock is
 * read-held for the duration of the operation.
 *
 * After a successful call, batch_size(conns) is read back as the number of
 * connections that were handled; the difference with the initial size is
 * reported as failures. */
static void
dp_doca_offload_conns(struct dp_doca_pmd_thread *pmd,
                        struct conntrack *ct, struct batch *conns, int op)
    OVS_NO_THREAD_SAFETY_ANALYSIS
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
    struct dp_doca *dp = conntrack_datapath(ct);
    struct conn *first = batch_first(conns);
    enum vlog_level vlog_level = VLL_ERR;
    struct netdev *netdevs[CT_DIR_NUM];
    odp_port_t ports[CT_DIR_NUM];
    int n = batch_size(conns);
    const char *desc = NULL;
    int ret = 0;

    conntrack_offload_get_ports(first, ports);
    if (pmd) {
        /* No need to lock the datapath port_rwlock here: we are in the context
         * of a polling thread, which will safely pause itself whenever a thread
         * has to modify the port map.
         *
         * Similarly, the netdev references are not kept, as they are
         * already held within the thread poll_list. */
        memset(netdevs, 0, sizeof netdevs);
        pmd_thread_offload_netdevs(pmd, ports, netdevs);
        for (int i = CT_DIR_INIT; i < CT_DIR_NUM; i++) {
            if (!netdevs[i]) {
                netdevs[i] = netdev_ports_get_short(ports[i], dpif_normalize_type(dp->class->type));
            }
        }
    } else {
        for (int i = CT_DIR_INIT; i < CT_DIR_NUM; i++) {
            netdevs[i] = netdev_ports_get(ports[i], dpif_normalize_type(dp->class->type));
        }
        dp_doca_port_rdlock_limit(dp, 10);
    }

    if (op == DP_NETDEV_FLOW_OFFLOAD_OP_ADD) {
        desc = "add";
        ret = dp_doca_netdev_conns_add(netdevs, conns);
        if (ret == EOPNOTSUPP) {
            /* Missing support is not worth an error-level message. */
            vlog_level = VLL_DBG;
        }
        conntrack_stats_add(ct, CT_DPIF_CONN_STATE_OFFLOAD_ADD_PROCESSED, n);
    } else if (op == DP_NETDEV_FLOW_OFFLOAD_OP_DEL) {
        desc = "delete";
        ret = dp_doca_netdev_conns_del(netdevs, conns);
        conntrack_stats_add(ct, CT_DPIF_CONN_STATE_OFFLOAD_DEL_PROCESSED, n);
    }

    if (!pmd) {
        ovs_rwlock_unlock(&dp->port_rwlock);
    }

    if (ret == ENODEV) {
        VLOG_DBG_RL(&rl, "%s(%d)/%s(%d): failed to handle %s %d connection offloads",
                    netdevs[0] ? netdev_get_name(netdevs[0]) : "(nil)", ports[0],
                    netdevs[1] ? netdev_get_name(netdevs[1]) : "(nil)", ports[1],
                    desc, n);
    } else if (ret) {
        VLOG_RL(&rl, vlog_level, "failed to %s %d connection offloads", desc, n);
    } else {
        int nb_ok = batch_size(conns);
        int nb_fail = n - nb_ok;

        if (nb_fail > 0) {
            VLOG_ERR_RL(&rl, "failed to %s %d connection offloads", desc, nb_fail);
        }
        if (nb_ok > 0 && VLOG_IS_DBG_ENABLED()) {
            VLOG_DBG_RL(&rl, "succeeded to %s %d connection offloads", desc, nb_ok);
        }
    }

    if (!pmd) {
        /* Drop the references only once the netdev names are no longer
         * needed: closing may release the last reference to a netdev whose
         * port was removed in the meantime. */
        netdev_close(netdevs[CT_DIR_INIT]);
        netdev_close(netdevs[CT_DIR_REP]);
    }
}

static void
dp_doca_offload_enqueue_one_conn(struct conntrack *ct,
                                 struct conn *conn,
                                 int op,
                                 unsigned int tid,
                                 bool unidir_update)
{
    struct dp_doca *dp = conntrack_datapath(ct);
    struct dp_offload_thread_item *msg;

    msg = xzalloc(sizeof *msg + sizeof(struct dp_offload_conn_item));
    msg->timestamp = time_usec();
    msg->type = DP_OFFLOAD_CONN;
    msg->dp = dp;
    msg->data->conn.op = op;
    msg->data->conn.ct = ct;
    msg->data->conn.conn = conn;
    msg->data->conn.unidir_update = unidir_update;
    dp_doca_offload_thread_enqueue(&dp_offload_threads[tid], msg);
}

/* 'conns_add' callback of 'dpif_ct_offload_class'.
 *
 * Drains 'conns', handing the connections to dp_doca_offload_conns() in
 * sub-batches whose members all report the same port pair, in the same
 * order, from conntrack_offload_get_ports().
 *
 * 'conns->md' is read as a 'struct conn_batch_md'.  A connection flagged
 * there as 'unidir_update' whose insertion thread is not the calling
 * offload thread is not processed here: it is queued to its insertion
 * thread instead. */
static void
dp_doca_offload_conns_add(struct conntrack *ct, struct batch *conns)
{
    struct dp_doca *dp = conntrack_datapath(ct);
    struct dp_doca_pmd_thread *pmd;

    /* Presumably NULL when not running in a polling thread; the value is
     * passed on as-is to dp_doca_offload_conns(), which handles NULL. */
    pmd = ovsthread_getspecific(dp->per_pmd_key);

    /* Each pass extracts the connections sharing the port pair of the
     * current first connection. */
    while (!batch_is_empty(conns)) {
        struct conn_batch_md *md = (struct conn_batch_md *) conns->md, batch_md;
        struct conn *first = batch_first(conns);
        odp_port_t ports[CT_DIR_NUM];
        struct batch same_ports;
        int same_ports_idx;
        struct conn *conn;

        /* 'same_ports' gets its own metadata: same 'tid', and the
         * 'unidir_update' flags re-indexed to match its content. */
        batch_init(&same_ports);
        same_ports_idx = 0;
        memset(&batch_md, 0, sizeof batch_md);
        batch_md.tid = md->tid;
        same_ports.md = &batch_md;
        conntrack_offload_get_ports(first, ports);
        /* NOTE(review): this relies on BATCH_FOREACH_POP visiting only the
         * entries present when the pass starts, so that the connections
         * re-added to 'conns' below are left for the next pass of the outer
         * loop -- confirm against the macro definition. */
        BATCH_FOREACH_POP (idx, conn, conns) {
            odp_port_t it_ports[CT_DIR_NUM];

            if (md->unidir_update[idx]) {
                unsigned int insertion_tid;

                insertion_tid = conntrack_offload_get_insertion_tid(conn);
                if (insertion_tid != netdev_offload_thread_id()) {
                    /* Owned by another offload thread: forward it there. */
                    dp_doca_offload_enqueue_one_conn(ct, conn, DP_NETDEV_FLOW_OFFLOAD_OP_ADD,
                                                     insertion_tid, true);
                    continue;
                }
            }
            /* All connection handles in 'conns' must share the exact same ports,
             * in the same order. */
            conntrack_offload_get_ports(conn, it_ports);
            if (it_ports[CT_DIR_INIT] == ports[CT_DIR_INIT] &&
                it_ports[CT_DIR_REP] == ports[CT_DIR_REP]) {
                batch_md.unidir_update[same_ports_idx++] = md->unidir_update[idx];
                batch_add(&same_ports, conn);
            } else {
                /* Different port pair: put it back for a later pass. */
                batch_add(conns, conn);
            }
        }
        /* Can be empty when every popped connection was forwarded. */
        if (!batch_is_empty(&same_ports)) {
            dp_doca_offload_conns(pmd, ct, &same_ports, DP_NETDEV_FLOW_OFFLOAD_OP_ADD);
        }
    }
}

/* 'conn_del' callback of 'dpif_ct_offload_class'.
 *
 * When the calling offload thread is the one that inserted 'conn', the
 * offload is deleted right away.  Otherwise a deletion request holding a
 * reference on 'conn' is queued to the inserting thread; the reference is
 * released in dp_offload_conn() once the request has been handled. */
static void
dp_doca_offload_conn_del(struct conntrack *ct, struct conn *conn)
{
    unsigned int owner_tid = conntrack_offload_get_insertion_tid(conn);

    if (owner_tid == netdev_offload_thread_id()) {
        struct dp_doca *dp = conntrack_datapath(ct);
        struct dp_doca_pmd_thread *pmd = ovsthread_getspecific(dp->per_pmd_key);
        struct batch conns = batch_init_one(conn);

        dp_doca_offload_conns(pmd, ct, &conns, DP_NETDEV_FLOW_OFFLOAD_OP_DEL);
        return;
    }

    /* If we cannot take a reference, the connection has already been
     * fully destroyed (potentially during a flush). No point in
     * enqueueing a request. */
    if (conntrack_conn_ref(conn)) {
        dp_doca_offload_enqueue_one_conn(ct, conn, DP_NETDEV_FLOW_OFFLOAD_OP_DEL,
                                         owner_tid, false);
    }
}

static int
dp_doca_offload_conn_active(struct conntrack *ct, struct conn *conn,
                            long long now, long long prev_now)
{
    struct netdev *netdevs[CT_DIR_NUM];
    odp_port_t ports[CT_DIR_NUM];
    struct dpif_flow_stats stats;
    const struct dp_doca *dp;
    int ret = 0;

    if (!conntrack_offload_is_enabled()) {
        return EINVAL;
    }

    dp = conntrack_datapath(ct);
    conntrack_offload_get_ports(conn, ports);
    for (int i = CT_DIR_INIT; i < CT_DIR_NUM; i++) {
        netdevs[i] = netdev_ports_get(ports[i], dpif_normalize_type(dp->class->type));
    }
    ret = dp_doca_netdev_conn_stats(netdevs, conn, &stats, NULL, now);
    netdev_close(netdevs[CT_DIR_INIT]);
    netdev_close(netdevs[CT_DIR_REP]);

    if (ret) {
        return ret;
    }

    return stats.used > prev_now ? 0 : EINVAL;
}

/* Connection offload callbacks of this datapath, registered on the
 * conntrack instance by dp_doca_ct_offload_init(). */
static struct conntrack_offload_class dpif_ct_offload_class = {
    .conns_add = dp_doca_offload_conns_add,
    .conn_del = dp_doca_offload_conn_del,
    .conn_active = dp_doca_offload_conn_active,
};

/* Creates the conntrack instance of 'dp', links it back to 'dp' and
 * registers this datapath's connection offload callbacks on it. */
void
dp_doca_ct_offload_init(struct dp_doca *dp)
{
    struct conntrack *ct = conntrack_init();

    dp->conntrack = ct;
    conntrack_set_datapath(ct, dp);
    conntrack_set_offload_class(ct, &dpif_ct_offload_class);
}

/* Destroys the conntrack instance of 'dp'.
 *
 * The offload class is cleared first: a NULL offload class marks the
 * datapath as being destroyed. */
void
dp_doca_ct_offload_uninit(struct dp_doca *dp)
{
    struct conntrack *ct = dp->conntrack;

    conntrack_set_offload_class(ct, NULL);
    conntrack_destroy(ct);
}

/* Returns the number of offload threads visited by
 * DP_NETDEV_OFFLOAD_FOREACH_THREAD, which is presumably the number of
 * currently active offload threads. */
unsigned int
dp_doca_offload_thread_nb(void)
{
    struct dp_offload_thread *cursor;
    unsigned int n_active = 0;

    DP_NETDEV_OFFLOAD_FOREACH_THREAD (cursor) {
        n_active += 1;
    }

    return n_active;
}

/* Puts the statistics of 'thread' back to their initial state: both
 * enqueue counters are zeroed and both latency moving averages are
 * re-initialized. */
static void
dp_doca_offload_thread_reset(struct dp_offload_thread *thread)
{
    /* Latency averages. */
    mov_avg_cma_init(&thread->cma);
    mov_avg_ema_init(&thread->ema, 100);

    /* Queue counters. */
    atomic_init(&thread->enqueued_offload, 0);
    atomic_init(&thread->enqueued_ct_add, 0);
}

/* Tears down the state of an active offload thread slot.
 *
 * Does nothing when 'thread' is not marked active.  Otherwise the slot is
 * marked inactive first, then 'mark_ref' and 'mark_to_flow_lock' are
 * destroyed and the counters and latency averages are reset.
 *
 * NOTE(review): dp_doca_offload_thread_init() also initializes
 * 'offload_queue', 'megaflow_to_mark' and 'mark_to_flow', none of which is
 * destroyed here -- confirm whether that is intentional. */
void
dp_doca_offload_thread_uninit(struct dp_offload_thread *thread)
{
    bool active;

    atomic_read(&thread->active, &active);
    if (!active) {
        return;
    }

    /* Hide the slot from dp_doca_offload_thread_next() before releasing
     * anything. */
    atomic_store_relaxed(&thread->active, false);
    ccmap_destroy(&thread->mark_ref);
    ovs_mutex_destroy(&thread->mark_to_flow_lock);
    dp_doca_offload_thread_reset(thread);
}

/* Prepares the offload thread slot 'thread' for use.
 *
 * Sets up the request queue, the mark maps and their lock, the latency
 * histograms (overall and per offload type), resets the statistics and
 * finally marks the slot active. */
void
dp_doca_offload_thread_init(struct dp_offload_thread *thread)
{
    int type;

    mpsc_queue_init(&thread->offload_queue);
    cmap_init(&thread->megaflow_to_mark);
    ovs_mutex_init(&thread->mark_to_flow_lock);
    cmap_init(&thread->mark_to_flow);
    ccmap_init(&thread->mark_ref);
    histogram_walls_set_log(&thread->latency, 1, 2000);

    for (type = 0; type < DP_OFFLOAD_TYPE_NUM; type++) {
        struct dp_offload_queue_metrics *metrics = &thread->queue_metrics[type];

        histogram_walls_set_log(&metrics->wait_time, 1, 2000);
        histogram_walls_set_log(&metrics->service_time, 1, 10000);
        histogram_walls_set_log(&metrics->sojourn_time, 1, 2000);
    }

    dp_doca_offload_thread_reset(thread);

    /* Publish the slot last, once it is fully set up. */
    atomic_store_relaxed(&thread->active, true);
}

/* Fills 'stats' with the hardware offload statistics of the datapath.
 *
 * 'stats->counters' is allocated here with xcalloc(); releasing it is left
 * to the caller.  Its layout is:
 *   - one block of totals, one counter per entry of 'hwol_stats';
 *   - then one such block per active offload thread, in iteration order.
 *     Each counter name carries the ID of the thread it belongs to.
 *
 * For the latency counters, the "total" is the mean of the per-thread
 * values; for the others it is their sum.
 *
 * Per-thread blocks are indexed by position in the iteration rather than by
 * thread ID, so that the array, which is sized after the number of active
 * threads, cannot be overrun when the active thread IDs are not contiguous.
 *
 * Returns 0 on success, EINVAL if the flow API is not enabled. */
int
dpif_doca_offload_stats_get(struct dpif *dpif,
                            struct netdev_custom_stats *stats)
{
    enum {
        DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED_OFFLOADS,
        DP_NETDEV_HW_OFFLOADS_STATS_INSERTED,
        DP_NETDEV_HW_OFFLOADS_STATS_CT_BI_DIR_CONNS,
        DP_NETDEV_HW_OFFLOADS_STATS_CT_UNI_DIR_CONNS,
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN,
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV,
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MAX,
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MIN,
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN,
        DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV,
        DP_NETDEV_HW_OFFLOADS_STATS_LAST,
    };
    struct {
        const char *name;
        uint64_t total;
    } hwol_stats[] = {
        [DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED_OFFLOADS] =
            { "                Enqueued offloads", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] =
            { "                Inserted offloads", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_CT_BI_DIR_CONNS] =
            { "            CT bi-dir Connections", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_CT_UNI_DIR_CONNS] =
            { "           CT uni-dir Connections", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] =
            { "  Cumulative Average latency (us)", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] =
            { "   Cumulative Latency stddev (us)", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MAX] =
            { "      Cumulative Latency max (us)", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MIN] =
            { "      Cumulative Latency min (us)", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] =
            { " Exponential Average latency (us)", 0 },
        [DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] =
            { "  Exponential Latency stddev (us)", 0 },
    }, *cur_stats;

    struct netdev_offload_stats per_port_nos[MAX_OFFLOAD_THREAD_NB];
    struct netdev_offload_stats total_nos[MAX_OFFLOAD_THREAD_NB];
    struct dp_doca *dp = get_dp_doca(dpif);
    struct dp_offload_thread *thread;
    struct dp_doca_port *port;
    unsigned int nb_thread;
    unsigned int nb_counts;
    unsigned int slot = 0;
    unsigned int tid;
    size_t i;
#define DP_NETDEV_STATS_TOTAL_COUNTS (ARRAY_SIZE(hwol_stats))

    if (!netdev_is_flow_api_enabled()) {
        return EINVAL;
    }

    /* /!\ NOTE:
     * It is safe-ish to read nb_thread here and allocate
     * counters, before iterating on the active threads.
     * This function executes in the main thread. A change in
     * active offload threads could only result from a configuration
     * change executing in the main thread. As such, it is impossible
     * for this thread number to change between the two reads.
     *
     * This is only valid if this function executes in the main thread.
     */
    nb_thread = dp_doca_offload_thread_nb();

    ovs_assert(nb_thread > 0);
    /* nb_thread counters for the overall total as well. */
    nb_counts = ARRAY_SIZE(hwol_stats);
    stats->size = (nb_thread + 1) * nb_counts;
    stats->counters = xcalloc(stats->size, sizeof *stats->counters);

    memset(total_nos, 0, sizeof total_nos);

    dp_doca_port_rdlock(dp);
    HMAP_FOR_EACH (port, node, &dp->ports) {
        memset(per_port_nos, 0, sizeof per_port_nos);
        /* Do not abort on read error from a port, just report 0. */
        if (!netdev_offload_get_stats(port->netdev, per_port_nos)) {
            /* Accumulate every thread ID: 'total_nos' is read by thread ID
             * below, and active IDs may exceed the active thread count. */
            for (i = 0; i < ARRAY_SIZE(total_nos); i++) {
                netdev_offload_stats_add(&total_nos[i], per_port_nos[i]);
            }
        }
    }
    ovs_rwlock_unlock(&dp->port_rwlock);

    DP_NETDEV_OFFLOAD_FOREACH_THREAD (thread, tid) {
        uint64_t counts[DP_NETDEV_STATS_TOTAL_COUNTS];
        size_t idx;

        if (slot >= nb_thread) {
            /* Never write past the counters allocated above. */
            break;
        }
        /* Block 0 holds the totals, per-thread blocks follow. */
        idx = (size_t) (slot + 1) * nb_counts;
        slot++;

        memset(counts, 0, sizeof counts);
        counts[DP_NETDEV_HW_OFFLOADS_STATS_INSERTED] =
            total_nos[tid].n_inserted;

        atomic_read_relaxed(&thread->enqueued_offload,
                            &counts[DP_NETDEV_HW_OFFLOADS_STATS_ENQUEUED_OFFLOADS]);
        counts[DP_NETDEV_HW_OFFLOADS_STATS_CT_BI_DIR_CONNS] =
            total_nos[tid].n_conns / 2;
        counts[DP_NETDEV_HW_OFFLOADS_STATS_CT_UNI_DIR_CONNS] =
            total_nos[tid].n_unidir_conns;
        counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN] =
            mov_avg_cma(&thread->cma);
        counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV] =
            mov_avg_cma_std_dev(&thread->cma);
        counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MAX] =
            mov_avg_cma_max(&thread->cma);
        counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MIN] =
            mov_avg_cma_min(&thread->cma);
        counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN] =
            mov_avg_ema(&thread->ema);
        counts[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV] =
            mov_avg_ema_std_dev(&thread->ema);

        for (i = 0; i < nb_counts; i++) {
            cur_stats = &hwol_stats[i];
            snprintf(stats->counters[idx + i].name,
                     sizeof(stats->counters[idx + i].name),
                     "  [%3u] %s", tid, cur_stats->name);
            stats->counters[idx + i].value = counts[i];
            cur_stats->total += counts[i];
        }
    }

    /* Do an average of the average for the aggregate. */
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MEAN].total /= nb_thread;
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_STDDEV].total /= nb_thread;
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MAX].total /= nb_thread;
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_CMA_MIN].total /= nb_thread;
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_MEAN].total /= nb_thread;
    hwol_stats[DP_NETDEV_HW_OFFLOADS_STATS_LAT_EMA_STDDEV].total /= nb_thread;

    for (i = 0; i < nb_counts; i++) {
        cur_stats = &hwol_stats[i];
        snprintf(stats->counters[i].name, sizeof(stats->counters[i].name),
                 "  Total %s", cur_stats->name);
        stats->counters[i].value = cur_stats->total;
    }

    return 0;
}

int
dpif_doca_offload_stats_clear(struct dpif *dpif OVS_UNUSED)
{
    struct dp_doca *dp = get_dp_doca(dpif);
    struct dp_offload_thread *thread;

    if (!netdev_is_flow_api_enabled()) {
        return EINVAL;
    }

    DP_NETDEV_OFFLOAD_FOREACH_THREAD_NO_MAIN (thread) {
        struct dp_offload_thread_item *item;

        item = xmalloc(sizeof *item);
        item->type = DP_OFFLOAD_STATS_CLEAR;
        item->dp = dp;
        item->timestamp = time_usec();

        dp_doca_offload_thread_enqueue(thread, item);
    }

    return 0;
}

/* Configures the connection-add queue size from 'other_config'.
 *
 * With connection offload enabled, the size is read from
 * "hw-offload-ct-add-queue-size" (default CT_ADD_DEFAULT_QUEUE_SIZE).
 * A value of 0 is rejected with a warning and replaced by the default;
 * a value above conntrack_offload_size() is capped to it.
 * With connection offload disabled, the size is forced to 0. */
void
dpif_doca_set_static_config_ct_add_queue_size(const struct smap *other_config)
{
    if (conntrack_offload_is_enabled()) {
        offload_ct_add_queue_size =
            smap_get_uint(other_config, "hw-offload-ct-add-queue-size",
                          CT_ADD_DEFAULT_QUEUE_SIZE);
        if (offload_ct_add_queue_size == 0) {
            offload_ct_add_queue_size = CT_ADD_DEFAULT_QUEUE_SIZE;
            VLOG_WARN("The size of hw-offload-ct-add-queue-size must be "
                      "greater than 0");
        } else if (conntrack_offload_size() < offload_ct_add_queue_size) {
            offload_ct_add_queue_size = conntrack_offload_size();
            VLOG_INFO("Limiting hw-offload-ct-add-queue-size to the "
                      "conntrack offload size %u",
                      offload_ct_add_queue_size);
        }
        /* The size is an unsigned int: print it with an unsigned
         * conversion. */
        VLOG_INFO("hw-offload-ct-add-queue-size = %u",
                  offload_ct_add_queue_size);
    } else {
        offload_ct_add_queue_size = 0;
    }
}

/* Executes the connection offload request carried by 'msg', as built by
 * dp_doca_offload_enqueue_one_conn().
 *
 * The connection is wrapped in a single-element batch and handed to
 * dp_doca_offload_conns().  For an addition, the batch is given metadata
 * holding the insertion thread ID and the request's 'unidir_update' flag.
 * For a deletion, the connection reference taken when the request was
 * queued is released afterwards. */
static void
dp_offload_conn(struct dp_offload_thread_item *msg)
{
    struct dp_offload_conn_item *doci = &msg->data->conn;
    struct dp_doca *dp = conntrack_datapath(doci->ct);
    struct dp_doca_pmd_thread *pmd = ovsthread_getspecific(dp->per_pmd_key);
    struct batch conns = batch_init_one(doci->conn);
    struct conn_batch_md batch_md;

    if (doci->op == DP_NETDEV_FLOW_OFFLOAD_OP_ADD) {
        memset(&batch_md, 0, sizeof batch_md);
        batch_md.tid = conntrack_offload_get_insertion_tid(doci->conn);
        batch_md.unidir_update[0] = doci->unidir_update;
        /* Batch metadata is read back as a 'struct conn_batch_md', so it
         * must point to the structure itself, not to one of its members. */
        conns.md = &batch_md;
    }

    dp_doca_offload_conns(pmd, doci->ct, &conns, doci->op);

    if (doci->op == DP_NETDEV_FLOW_OFFLOAD_OP_DEL) {
        /* Remove the reference held by the connection deletion request
         * kept in the offload queue. */
        conntrack_conn_unref(doci->conn);
    }
}

/* Takes the datapath port lock for reading, on behalf of source location
 * 'where'.
 *
 * The lock is first tried without blocking.  If that fails, the call
 * blocks until the lock is obtained, and a rate-limited warning naming the
 * previously recorded holder is logged when the whole operation took more
 * than 'limit_ms' milliseconds. */
void
dp_doca_port_rdlock_at(struct dp_doca *dp, unsigned long long int limit_ms,
                         const char *where)
    OVS_ACQ_RDLOCK(dp->port_rwlock)
{
    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
    unsigned long long int t0 = time_msec();
    unsigned long long int waited;
    const char *prev_holder;

    if (!ovs_rwlock_tryrdlock(&dp->port_rwlock)) {
        /* Uncontended: lock obtained right away. */
        return;
    }

    /* Sample the holder before blocking: it will be overwritten once the
     * lock changes hands. */
    prev_holder = dp->port_rwlock.where;

    ovs_rwlock_rdlock(&dp->port_rwlock);
    waited = time_msec() - t0;
    if (waited > limit_ms) {
        VLOG_WARN_RL(&rl, "%s: Unreasonably long %llums port_rwlock wait, "
                     "held from %s", where, waited, prev_holder);
    }
}

/* Resolves 'port_nos' into netdevs using the rx port cache of 'pmd'.
 *
 * For each direction whose entry in 'netdevs' is still NULL, the entry is
 * set to the netdev of the first cached port that has the matching port
 * number and does not have offloading disabled.  Entries that are already
 * set, or for which no such port exists, are left untouched.  No netdev
 * reference is taken. */
static void
pmd_thread_offload_netdevs(struct dp_doca_pmd_thread *pmd, odp_port_t port_nos[CT_DIR_NUM],
                           struct netdev *netdevs[CT_DIR_NUM])
{
    for (int idx = 0; idx < pmd->rx_port_count; idx++) {
        struct dp_doca_port *cached = pmd->rx_port_cache[idx];

        for (int dir = 0; dir < CT_DIR_NUM; dir++) {
            if (netdevs[dir]
                || cached->port_no != port_nos[dir]
                || cached->offload_disabled) {
                continue;
            }
            netdevs[dir] = cached->netdev;
        }
    }
}

/* Marks offloading as disabled on the first port of the rx port cache of
 * 'pmd' that uses 'netdev'.  Does nothing if no cached port uses it. */
void
pmd_thread_offload_disable(struct dp_doca_pmd_thread *pmd, struct netdev *netdev)
{
    for (int idx = 0; idx < pmd->rx_port_count; idx++) {
        struct dp_doca_port *candidate = pmd->rx_port_cache[idx];

        if (candidate->netdev != netdev) {
            continue;
        }
        candidate->offload_disabled = true;
        break;
    }
}

void
pmd_thread_offload_upkeep(struct dp_doca_pmd_thread *pmd)
{
    dp_doca_pmd_idle_end(pmd);
    for (int i = 0; i < pmd->rx_port_count; i++) {
        netdev_offload_upkeep(pmd->rx_port_cache[i]->netdev, true);
    }
}

/* Pops and processes requests queued on the offload thread owned by 'pmd'.
 *
 * Returns the number of requests processed, or 0 right away when 'pmd' has
 * no offload thread ID assigned.  At most 'limit' requests are handled per
 * call; a 'limit' of 0 processes everything the queue yields.
 *
 * Eschew safety analysis as the PMD thread implicitly took ownership
 * of the offload thread by allocating its netdev-offload thread ID.
 * This allocation cannot be expressed in a way safety analysis understands.
 *
 * NOTE(review): the function is annotated with OVS_REQUIRES on the queue
 * read lock rather than with OVS_NO_THREAD_SAFETY_ANALYSIS -- confirm that
 * the paragraph above still describes the intent.
 */
unsigned int
pmd_thread_offload_process(struct dp_doca_pmd_thread *pmd, unsigned int limit)
    OVS_REQUIRES(dp_offload_threads[pmd->offload_thread_id].offload_queue.read_lock)
{
    struct dp_offload_thread *offload_thread;
    struct dp_offload_thread_item *offload;
    struct mpsc_queue_node *node;
    unsigned int n_msgs = 0;

    if (pmd->offload_thread_id == OVSTHREAD_ID_UNSET) {
        return 0;
    }

    offload_thread = &dp_offload_threads[pmd->offload_thread_id];
    MPSC_QUEUE_FOR_EACH_POP (node, &dp_offload_threads[pmd->offload_thread_id].offload_queue) {
        /* There is work to do: the thread is no longer idle. */
        dp_doca_pmd_idle_end(pmd);
        n_msgs++;
        /* Mirrors the increment done in dp_doca_offload_thread_enqueue(). */
        atomic_count_dec64(&offload_thread->enqueued_offload);
        offload = CONTAINER_OF(node, struct dp_offload_thread_item, node);
        dp_offload_process(offload_thread, offload);
        /* Using limit == 0 means that every message still in the queue
         * must be processed. It is used when terminating the polling thread
         * to clean up requests. */
        if (limit != 0 && n_msgs >= limit) {
            break;
        }
    }

    return n_msgs;
}

/* Sets up the offload context used by 'pmd'.
 *
 * Stores the value returned by netdev_offload_thread_id() in
 * 'pmd->offload_thread_id', initializes the matching entry of
 * 'dp_offload_threads' and acquires that entry's offload queue, as declared
 * by the OVS_ACQUIRES annotation.  pmd_thread_offload_uninit() performs the
 * matching release. */
void
pmd_thread_offload_init(struct dp_doca_pmd_thread *pmd)
    OVS_ACQUIRES(dp_offload_threads[pmd->offload_thread_id].offload_queue.read_lock)
{
    pmd->offload_thread_id = netdev_offload_thread_id();
    if (pmd->core_id == NON_PMD_CORE_ID) {
        /* The non-PMD thread is expected to have been given the main
         * netdev-offload thread ID. */
        ovs_assert(pmd->offload_thread_id == NETDEV_OFFLOAD_THREAD_MAIN);
    }
    dp_doca_offload_thread_init(&dp_offload_threads[pmd->offload_thread_id]);
    mpsc_queue_acquire(&dp_offload_threads[pmd->offload_thread_id].offload_queue);
}

/* Tears down the offload context of 'pmd'.
 *
 * Remaining queued messages are processed and one upkeep pass is run while
 * the offload queue is still held; the queue is then released, the
 * 'dp_offload_threads' entry is uninitialized, 'pmd->offload_thread_id' is
 * reset to OVSTHREAD_ID_UNSET and netdev_offload_thread_uninit() is called.
 * The statement order matters: everything that indexes 'dp_offload_threads'
 * has to run before the ID is reset. */
void
pmd_thread_offload_uninit(struct dp_doca_pmd_thread *pmd)
    OVS_RELEASES(dp_offload_threads[pmd->offload_thread_id].offload_queue.read_lock)
{
    /* A limit of 0 asks for every message still in the queue to be
     * processed. */
    pmd_thread_offload_process(pmd, 0);
    pmd_thread_offload_upkeep(pmd);
    mpsc_queue_release(&dp_offload_threads[pmd->offload_thread_id].offload_queue);

    dp_doca_offload_thread_uninit(&dp_offload_threads[pmd->offload_thread_id]);
    /* From here on pmd_thread_offload_process() returns 0 for this PMD. */
    pmd->offload_thread_id = OVSTHREAD_ID_UNSET;
    netdev_offload_thread_uninit();
}

/* Puts 'pmd' in the idle state: starts an RCU quiescent period and sets
 * 'pmd->idle'.  No-op for the non-PMD thread and for a PMD that is already
 * idle.  dp_doca_pmd_idle_end() reverts it. */
void
dp_doca_pmd_idle_begin(struct dp_doca_pmd_thread *pmd)
{
    if (pmd->core_id == NON_PMD_CORE_ID || pmd->idle) {
        return;
    }

    ovsrcu_quiesce_start();
    pmd->idle = true;
}

/* Takes 'pmd' out of the idle state, if it is in it: ends the RCU quiescent
 * period, clears 'pmd->idle' and schedules the next RCU quiesce
 * PMD_RCU_QUIESCE_INTERVAL after 'pmd->ctx.now'.  No-op when 'pmd' is not
 * idle. */
void
dp_doca_pmd_idle_end(struct dp_doca_pmd_thread *pmd)
{
    if (!pmd->idle) {
        return;
    }

    ovsrcu_quiesce_end();
    pmd->idle = false;
    pmd->next_rcu_quiesce = pmd->ctx.now + PMD_RCU_QUIESCE_INTERVAL;
}

/* Formats 'dp_flow' into 's' as
 *
 *     "<prefix>: <ufid> mega_<mega ufid> <flow>, actions:<actions>"
 *
 * where <actions> is "(nil)" when the flow has no actions attached.
 *
 * 's' is initialized here with ds_init() before anything is appended, so
 * whatever it held on entry is not preserved. */
void
dp_doca_flow_format(const char *prefix,
                    struct ds *s,
                    const struct dp_doca_flow *dp_flow)
{
    struct dp_doca_actions *acts;

    ds_init(s);

    /* Identification: prefix and both UFIDs. */
    ds_put_format(s, "%s: ", prefix);
    odp_format_ufid(&dp_flow->ufid, s);
    ds_put_cstr(s, " mega_");
    odp_format_ufid(&dp_flow->mega_ufid, s);
    ds_put_cstr(s, " ");

    /* Match. */
    flow_format(s, &dp_flow->flow, NULL);

    /* Actions. */
    ds_put_cstr(s, ", actions:");
    acts = dp_doca_flow_get_actions(dp_flow);
    if (!acts) {
        ds_put_cstr(s, "(nil)");
        return;
    }
    format_odp_actions(s, acts->actions, acts->size, NULL);
}

/* Blocks until a message can be popped from the offload queue of
 * 'ofl_thread' and stores it in '*offload_item'.
 *
 * While the queue is empty the function sleeps between attempts, starting at
 * DP_NETDEV_OFFLOAD_BACKOFF_MIN and doubling the delay after each sleep for
 * as long as it is below DP_NETDEV_OFFLOAD_BACKOFF_MAX, and runs
 * netdev_ports_upkeep() once every DP_OFFLOAD_UPKEEP_N_BACKOFF attempts.
 * The function only returns once a message has been obtained; the
 * 'enqueued_offload' counter is decremented for that message. */
void
dp_doca_offload_poll_queues(struct dp_offload_thread *ofl_thread,
                            struct dp_offload_thread_item **offload_item)
    OVS_REQUIRES(ofl_thread->offload_queue.read_lock)
{
    uint64_t delay = DP_NETDEV_OFFLOAD_BACKOFF_MIN;
    struct mpsc_queue_node *popped;
    unsigned int n_waits = 0;

    *offload_item = NULL;

    for (;;) {
        popped = mpsc_queue_pop(&ofl_thread->offload_queue);
        if (popped != NULL) {
            break;
        }

        /* Execute upkeep if
         *
         *   + we are waiting for work for the first time
         *     -> We have just stopped a streak of offloading,
         *        some remaining things might need cleanup.
         *
         *   + we have waited roughly the amount of time
         *     between upkeep period.
         *
         * The mask test is true once every DP_OFFLOAD_UPKEEP_N_BACKOFF
         * waits, assuming that constant is a power of two.
         */
        if (!(n_waits & (DP_OFFLOAD_UPKEEP_N_BACKOFF - 1))) {
            /* Signal 'quiescing' only on the first backoff. */
            netdev_ports_upkeep(n_waits == 0);
        }
        n_waits++;

        /* The thread is flagged as quiescent during xnanosleep(). */
        xnanosleep(delay * 1E6);
        if (delay < DP_NETDEV_OFFLOAD_BACKOFF_MAX) {
            delay <<= 1;
        }
    }

    *offload_item = CONTAINER_OF(popped, struct dp_offload_thread_item, node);
    atomic_count_dec64(&ofl_thread->enqueued_offload);
}

/* Records the latency of one offload request in the statistics of 'thread'.
 *
 * The latency is 'finish_time_us' - 'enqueue_time_us'.  It is fed as-is to
 * both moving averages ('cma' and 'ema') and, divided by 1000, to the
 * 'latency' histogram.
 *
 * Returns the latency, in the same unit as the two timestamps. */
long long int
dp_offload_measure_latency(struct dp_offload_thread *thread,
                           long long int enqueue_time_us,
                           long long int finish_time_us)
{
    const long long int elapsed_us = finish_time_us - enqueue_time_us;

    mov_avg_cma_update(&thread->cma, elapsed_us);
    mov_avg_ema_update(&thread->ema, elapsed_us);
    histogram_add_sample(&thread->latency, elapsed_us / 1000);

    return elapsed_us;
}

/* Executes one offload message on behalf of 'thread', accounts for it in the
 * thread statistics and frees it.
 *
 * Three durations are derived from 'msg->timestamp' (enqueue time), the time
 * at which processing starts and the time at which it finishes:
 *
 *   - wait:    enqueue -> dequeue, recorded divided by 1000;
 *   - service: dequeue -> finish, recorded as-is;
 *   - sojourn: enqueue -> finish, recorded divided by 1000.
 *
 * They go into the per-message-type queue metrics; the sojourn time is also
 * recorded in the thread-wide statistics by dp_offload_measure_latency().
 *
 * High-latency coverage counters are bumped only while
 * 'thread->high_latency_event' is clear; bumping one sets the flag.  Nothing
 * in this function clears a flag that was already set on entry.
 *
 * 'msg' must not be used by the caller after this function returns. */
void
dp_offload_process(struct dp_offload_thread *thread,
                   struct dp_offload_thread_item *msg)
{
    const long long int enqueued_us = msg->timestamp;
    const long long int dequeued_us = time_usec();
    struct dp_offload_queue_metrics *metrics;
    long long int finished_us;
    long long int sojourn_us;

    /* Dispatch on the message type. */
    switch (msg->type) {
    case DP_OFFLOAD_FLOW:
        dpif_doca_dp_offload_flow(msg);
        break;
    case DP_OFFLOAD_CONN:
        dp_offload_conn(msg);
        break;
    case DP_OFFLOAD_STATS_CLEAR:
        /* The clear request itself is still accounted for below, so the
         * averages restart with this message's latency as first sample. */
        mov_avg_cma_init(&thread->cma);
        mov_avg_ema_init(&thread->ema, 100);
        break;
    case DP_OFFLOAD_FLUSH:
        dpif_doca_dp_offload_flush(msg);
        break;
    case DP_OFFLOAD_TYPE_NUM:
        /* Fallthrough */
    default:
        OVS_NOT_REACHED();
    }

    /* Thread-wide latency statistics. */
    finished_us = time_usec();
    sojourn_us = dp_offload_measure_latency(thread, enqueued_us, finished_us);

    /* Per-type queue metrics. */
    metrics = &thread->queue_metrics[msg->type];
    histogram_add_sample(&metrics->wait_time,
                         (dequeued_us - enqueued_us) / 1000);
    histogram_add_sample(&metrics->service_time, finished_us - dequeued_us);
    histogram_add_sample(&metrics->sojourn_time, sojourn_us / 1000);

    /* High-latency events.  Only flow and connection offloads are tracked. */
    if (msg->type == DP_OFFLOAD_FLOW) {
        if (!thread->high_latency_event && sojourn_us >= 200000) {
            thread->high_latency_event = true;
            COVERAGE_INC(flow_offload_200ms_latency);
        }
    } else if (msg->type == DP_OFFLOAD_CONN) {
        if (!thread->high_latency_event) {
            /* Assume an event and revert if no threshold is reached. */
            thread->high_latency_event = true;
            if (sojourn_us >= 100) {
                COVERAGE_INC(ct_offload_100us_latency);
            } else if (sojourn_us >= 50) {
                COVERAGE_INC(ct_offload_50us_latency);
            } else if (sojourn_us > 30) {
                /* NOTE(review): strict '>' here while the other thresholds
                 * use '>='; confirm whether exactly 30 us should count. */
                COVERAGE_INC(ct_offload_30us_latency);
            } else {
                thread->high_latency_event = false;
            }
        }
    }

    dp_doca_free_offload(msg);
}

static void
dpif_doca_set_static_config(const struct smap *other_config)
{
    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;

    if (!ovsthread_once_start(&once)) {
        return;
    }

    conntrack_offload_config(other_config);
    dpif_doca_set_static_config_ct_add_queue_size(other_config);

    ovsthread_once_done(&once);
}

/* Applies the DOCA datapath specific settings found in 'other_config'.
 *
 *   - "max-recirc-depth": when present, updates 'max_recirc_depth'.  Values
 *     below DEFAULT_MAX_RECIRC_DEPTH are raised to that default.
 *   - PMD max sleeps: delegated to dpif_doca_set_all_pmd_max_sleeps(); the
 *     resulting sleeps are logged when they changed or on the first call.
 *   - "pmd-quiet-idle" (default false): stored in 'dp->pmd_quiet_idle' and
 *     logged when it changed or on the first call.
 *   - conntrack and one-time static settings.
 *
 * 'first_set_config' forces the values to be stored and logged even when
 * they are equal to the current ones. */
void
dpif_doca_set_ext_config(struct dp_doca *dp, const struct smap *other_config,
                         bool first_set_config)
{
    bool want_quiet_idle, have_quiet_idle;
    bool sleeps_updated;

    if (smap_get_node(other_config, "max-recirc-depth")) {
        unsigned int depth = smap_get_uint(other_config, "max-recirc-depth",
                                           DEFAULT_MAX_RECIRC_DEPTH);

        /* The default doubles as the lowest accepted value. */
        if (depth < DEFAULT_MAX_RECIRC_DEPTH) {
            depth = DEFAULT_MAX_RECIRC_DEPTH;
        }

        if (max_recirc_depth != depth) {
            max_recirc_depth = depth;
            VLOG_INFO("max recirc depth set to %u", depth);
        }
    }

    /* Always evaluated, also on the first call, so that the sleeps are
     * applied before they are logged. */
    sleeps_updated = dpif_doca_set_all_pmd_max_sleeps(dp, other_config);
    if (sleeps_updated || first_set_config) {
        dpif_doca_log_all_pmd_sleeps(dp);
    }

    want_quiet_idle = smap_get_bool(other_config, "pmd-quiet-idle", false);
    atomic_read_relaxed(&dp->pmd_quiet_idle, &have_quiet_idle);
    if (first_set_config || want_quiet_idle != have_quiet_idle) {
        atomic_store_relaxed(&dp->pmd_quiet_idle, want_quiet_idle);
        VLOG_INFO("PMD quiescent idling mode %s.",
                  want_quiet_idle ? "enabled" : "disabled");
    }

    conntrack_config(dp->conntrack, other_config);
    dpif_doca_set_static_config(other_config);
}

int
dpif_doca_ct_get_stats(struct dpif *dpif,
                       struct ct_dpif_stats *stats)
{
    struct dp_doca *dp = get_dp_doca(dpif);

    return conntrack_get_stats(dp->conntrack, stats);
}

/* Configures the meter identified by 'meter_id_' with 'config' through
 * netdev_doca_meter_set() and returns its result. */
int
dp_doca_offload_netdev_meter_set(uint32_t meter_id_,
                                 struct ofputil_meter_config *config)
{
    /* 'meter_id_' is one less than the ID the netdev layer expects: undo
     * the (meter_ID - 1) adjustment made by ovs-ofctl. */
    ofproto_meter_id netdev_meter_id = { .uint32 = meter_id_ + 1 };

    return netdev_doca_meter_set(netdev_meter_id, config);
}

/* Queries the meter identified by 'meter_id_' through
 * netdev_doca_meter_get(), passing 'stats' and 'n_bands' along unchanged,
 * and returns its result. */
int
dp_doca_offload_netdev_meter_get(uint32_t meter_id_,
                                 struct ofputil_meter_stats *stats,
                                 uint16_t n_bands)
{
    /* 'meter_id_' is one less than the ID the netdev layer expects: undo
     * the (meter_ID - 1) adjustment made by ovs-ofctl. */
    ofproto_meter_id netdev_meter_id = { .uint32 = meter_id_ + 1 };

    return netdev_doca_meter_get(netdev_meter_id, stats, n_bands);
}

/* Deletes the meter identified by 'meter_id_' through
 * netdev_doca_meter_del(), passing 'stats' and 'n_bands' along unchanged,
 * and returns its result. */
int
dp_doca_offload_netdev_meter_del(uint32_t meter_id_,
                                 struct ofputil_meter_stats *stats,
                                 uint16_t n_bands)
{
    /* 'meter_id_' is one less than the ID the netdev layer expects: undo
     * the (meter_ID - 1) adjustment made by ovs-ofctl. */
    ofproto_meter_id netdev_meter_id = { .uint32 = meter_id_ + 1 };

    return netdev_doca_meter_del(netdev_meter_id, stats, n_bands);
}
