/*
 * SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
 * Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: LicenseRef-NvidiaProprietary
 *
 * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
 * property and proprietary rights in and to this material, related
 * documentation and any modifications thereto. Any use, reproduction,
 * disclosure or distribution of this material and related documentation
 * without an express license agreement from NVIDIA CORPORATION or
 * its affiliates is strictly prohibited.
 */

#include <config.h>
#include "netdev-doca.h"

#include <dirent.h>
#include <errno.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/virtio_net.h>
#include <sys/socket.h>
#include <linux/if.h>

#include <rte_bus.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_dev.h>
#include <rte_errno.h>
#include <rte_ethdev.h>
#include <rte_flow.h>
#include <rte_malloc.h>
#include <rte_mbuf.h>
#include <rte_meter.h>
#include <rte_mtr.h>
#include <rte_pci.h>
#include <rte_version.h>
#include <rte_vhost.h>

#include "cmap.h"
#include "coverage.h"
#include "dirs.h"
#include "dp-packet.h"
#include "dpdk.h"
#include "dpif-doca.h"
#include "fatal-signal.h"
#include "if-notifier.h"
#include "netdev-offload-doca.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-util.h"
#include "openflow/openflow.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/list.h"
#include "openvswitch/match.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/shash.h"
#include "openvswitch/vlog.h"
#include "ovs-doca.h"
#include "ovs-numa.h"
#include "ovs-rcu.h"
#include "ovs-thread.h"
#include "packets.h"
#include "rtnetlink.h"
#include "smap.h"
#include "sset.h"
#include "timeval.h"
#include "unaligned.h"
#include "unixctl.h"
#include "userspace-tso.h"
#include "util.h"
#include "uuid.h"

VLOG_DEFINE_THIS_MODULE(netdev_doca_ext);

#ifndef DOCA_OFFLOAD
struct doca_flow_resource_meter_cfg {
};
#endif

/* DPDK library uses uint16_t for port_id. */
typedef uint16_t dpdk_port_t;
#define DPDK_PORT_ID_FMT "%"PRIu16

#define MAX_PHYS_ITEM_ID_LEN 32

/* DPDK interfaces software meter definitions.
 *
 * Netdev-DPDK ports can protect their software ingress against floods
 * by configuring a sw rate-limit. This structure holds the complete
 * representation of such configuration.
 */

/* The default sw-meter describes the common template applied
 * to all meters. With its rate set to 0 it is considered
 * disabled and no special rule is installed until a different
 * meter is configured. */
static struct netdev_doca_sw_meter default_sw_meter = {
    .config = {
        .meter_id = -1,
        .flags = OFPMF13_BURST | OFPMF13_STATS,
        .n_bands = 1,
        .bands = &default_sw_meter.band,
    },
    .band = {
        .type = OFPMBT13_DROP,
        .prec_level = 0,
        .rate = 0,
        .burst_size = 0,
    },
};

#define SW_METER_FMT "%s:%" PRIu32 ":%" PRIu32
#define SW_METER_ARG(m) \
    (m)->config.flags & OFPMF13_PKTPS ? "pps" : \
    (m)->config.flags & OFPMF13_KBPS ? "bps" : "xxx", \
    (m)->band.rate * (((m)->config.flags & OFPMF13_KBPS) ? 1000 : 1), \
    (m)->band.burst_size * (((m)->config.flags & OFPMF13_KBPS) ? 1000 : 1)

int
netdev_doca_sw_meter_parse(const char *s,
                           struct netdev_doca_sw_meter *m,
                           const char **errp)
{
    unsigned long long int rate, burst;
    enum ofp13_meter_flags type;
    char burst_str[20];
    char rate_str[20];
    char type_str[4];
    int ret;

    if (!ovs_scan(s, "%3[bps]:%19[a-zA-Z0-9]:%19[a-zA-Z0-9]",
                  type_str, rate_str, burst_str)) {
        *errp = "Invalid format.";
        return EINVAL;
    }

    /* Parse the meter type: packet or (kilo)bits. */
    if (!strncmp(type_str, "pps", sizeof type_str)) {
        type = OFPMF13_PKTPS;
    } else if (!strncmp(type_str, "bps", sizeof type_str)) {
        type = OFPMF13_KBPS;
    } else {
        *errp = "Unrecognized type (pps|bps).";
        return EINVAL;
    }

    /* Parse the rate & burst size. */
    ret = parse_ull_scale(rate_str, &rate, errp);
    if (ret) {
        return ret;
    }

    ret = parse_ull_scale(burst_str, &burst, errp);
    if (ret) {
        return ret;
    }

    if (rate == 0) {
        *errp = "rate cannot be 0.";
        return ERANGE;
    }

    if (rate > UINT32_MAX) {
        *errp = "rate cannot exceed 2**32.";
        return ERANGE;
    }

    if (burst == 0) {
        *errp = "burst size cannot be 0.";
        return ERANGE;
    }

    if (burst > UINT32_MAX) {
        *errp = "burst size cannot exceed 2**32.";
        return ERANGE;
    }

    if (type == OFPMF13_KBPS) {
        /* The rate is set in bit per second,
         * while ofproto expects it in kilo-bit per second. */
        rate /= 1000;
        if (rate == 0) {
            rate = 1;
        }
        burst /= 1000;
        if (burst == 0) {
            burst = 1;
        }
    }

    netdev_doca_sw_meter_copy(m, &default_sw_meter);
    m->config.flags |= type;
    m->band.rate = rate;
    m->band.burst_size = burst;

    return 0;
}

void
netdev_doca_sw_meter_copy(struct netdev_doca_sw_meter *dst,
                          const struct netdev_doca_sw_meter *src)
{
    memcpy(dst, src, sizeof *dst);
    dst->config.bands = &dst->band;
}

bool
netdev_doca_sw_meter_equal(const struct netdev_doca_sw_meter *m1,
                           const struct netdev_doca_sw_meter *m2)
{
    if (m1->config.flags != m2->config.flags ||
        m1->band.rate != m2->band.rate ||
        m1->band.burst_size != m2->band.burst_size ||
        m1->dry_run != m2->dry_run) {
        return false;
    }
    return true;
}

void
netdev_doca_add_meter_policy(dpdk_port_t port_id, uint32_t policy_id)
{
    const struct rte_flow_action end = { RTE_FLOW_ACTION_TYPE_DROP, NULL };
    struct rte_mtr_meter_policy_params policy = {
        .actions[RTE_COLOR_GREEN] = NULL,
        .actions[RTE_COLOR_YELLOW] = NULL,
        .actions[RTE_COLOR_RED] = &end,
    };
    struct rte_mtr_error error;
    int rv;

    VLOG_INFO("Creating meter policy %u on port "DPDK_PORT_ID_FMT,
              policy_id, port_id);
    rv = rte_mtr_meter_policy_add(port_id, policy_id, &policy, &error);
    if (rv) {
        VLOG_WARN("cannot add meter policy, error: %s\n", error.message);
    }
}

bool
netdev_doca_is_esw_mgr(const struct netdev *netdev)
{
    return netdev_doca_get_esw_mgr_port_id(netdev) ==
           netdev_doca_get_port_id(netdev) &&
           netdev_doca_get_esw_mgr_port_id(netdev) != -1;
}

void
netdev_doca_get_config_sw_meter(struct smap *args,
                                struct netdev_doca_sw_meter *requested_sw_meter,
                                struct netdev_doca_sw_meter *sw_meter)
{
    if (sw_meter->band.rate) {
        smap_add_format(args, "sw_meter", SW_METER_FMT,
                        SW_METER_ARG(sw_meter));
    }

    if (requested_sw_meter->band.rate) {
        smap_add_format(args, "requested_sw_meter", SW_METER_FMT,
                        SW_METER_ARG(requested_sw_meter));
    }
}

struct rte_flow_action_handle *
netdev_doca_indirect_action_create(struct netdev *netdev,
                                   int port_id,
                                   const struct rte_flow_action *action,
                                   struct rte_flow_error *error)
{
    struct rte_flow_indir_action_conf conf = { .transfer = 1, };
    struct rte_flow_action_handle *act_hdl;

    if (!is_doca_class(netdev->netdev_class)) {
        return NULL;
    }

    netdev_doca_dev_lock(netdev);
    act_hdl = rte_flow_action_handle_create(port_id, &conf, action, error);
    netdev_doca_dev_unlock(netdev);
    return act_hdl;
}

int
netdev_doca_indirect_action_destroy(int port_id,
                                    struct rte_flow_action_handle *act_hdl,
                                    struct rte_flow_error *error)
{
    int ret;

    ret = rte_flow_action_handle_destroy(port_id, act_hdl, error);
    return ret;
}

int
netdev_doca_indirect_action_query(int port_id,
                                  struct rte_flow_action_handle *act_hdl,
                                  void *data,
                                  struct rte_flow_error *error)
{
    int ret;

    ret = rte_flow_action_handle_query(port_id, act_hdl, data, error);
    return ret;
}

static void
fill_meter_profile(struct rte_mtr_meter_profile *profile,
                   struct ofputil_meter_config *config)
{
    profile->alg = RTE_MTR_SRTCM_RFC2697;
    profile->srtcm_rfc2697.ebs = 0;

    if (config->flags & OFPMF13_PKTPS) {
        profile->packet_mode = 1;
        profile->srtcm_rfc2697.cir = config->bands[0].rate;
        profile->srtcm_rfc2697.cbs = config->bands[0].burst_size;
    } else {
        profile->packet_mode = 0;
        /* Convert from kilobits per second to bytes per second */
        profile->srtcm_rfc2697.cir = ((uint64_t) config->bands[0].rate) * 125;
        profile->srtcm_rfc2697.cbs =
            ((uint64_t) config->bands[0].burst_size) * 125;
    }
}

static int
add_meter_profile(uint32_t profile_id,
                  struct ofputil_meter_config *config,
                  struct rte_mtr_error *error)
{
    struct rte_mtr_meter_profile profile;

    if (config->n_bands != 1) {
        return -1;
    }

    memset(&profile, 0, sizeof(profile));
    fill_meter_profile(&profile, config);
    return rte_mtr_meter_profile_add(NETDEV_DPDK_METER_PORT_ID, profile_id,
                                     &profile, error);
}

static int
create_meter(uint32_t mtr_id, uint32_t profile_id,
             struct rte_mtr_error *error)
{
    struct rte_mtr_params params;

    memset(&params, 0, sizeof params);
    params.meter_enable = 1;
    params.use_prev_mtr_color = 0;
    params.meter_policy_id = NETDEV_DPDK_METER_POLICY_ID;
    params.meter_profile_id = profile_id;
    params.dscp_table = NULL; /* no input color. */
    /* Enable all stats. */
    params.stats_mask = RTE_MTR_STATS_N_BYTES_GREEN |
                        RTE_MTR_STATS_N_BYTES_YELLOW |
                        RTE_MTR_STATS_N_BYTES_RED |
                        RTE_MTR_STATS_N_BYTES_DROPPED |
                        RTE_MTR_STATS_N_PKTS_GREEN |
                        RTE_MTR_STATS_N_PKTS_YELLOW |
                        RTE_MTR_STATS_N_PKTS_RED |
                        RTE_MTR_STATS_N_PKTS_DROPPED;

    return rte_mtr_create(NETDEV_DPDK_METER_PORT_ID, mtr_id,
                          &params, true, error);
}

struct netdev_doca_meter {
    struct ovs_mutex mutex;
    uint32_t refcnt;
    bool valid;
    struct doca_flow_resource_meter_cfg cfg;
};

/* meter with index [0] is never used */
static struct netdev_doca_meter netdev_doca_meters[NETDEV_DPDK_MAX_METERS];

int
netdev_doca_meter_set(ofproto_meter_id meter_id,
                      struct ofputil_meter_config *config)
{
    uint32_t mtr_id = meter_id.uint32;
    struct netdev_doca_meter *meter;
    struct rte_mtr_error mtr_error;
    uint32_t profile_id = mtr_id;
    int ret;

    if (!netdev_is_flow_api_enabled()) {
        return 0;
    }

    if (config->n_bands != 1) {
        VLOG_WARN("cannot create meter with more than one band");
        return -1;
    }

    if (mtr_id < 1 || mtr_id >= NETDEV_DPDK_MAX_METERS) {
        return -1;
    }

    meter = &netdev_doca_meters[mtr_id];
    if (ovs_doca_enabled()) {
        ovs_mutex_lock(&meter->mutex);
        ret = ovs_doca_create_meter(mtr_id, config, &meter->cfg, &mtr_error);
        ovs_mutex_unlock(&meter->mutex);
        if (ret) {
            VLOG_WARN("cannot create meter: %u, error: %s", mtr_id,
                      mtr_error.message);
            return ret;
        }
    } else {
        ret = add_meter_profile(profile_id, config, &mtr_error);
        if (ret) {
            VLOG_WARN("cannot add meter profile, error: %s",
                      mtr_error.message);
            return ret;
        }
        ret = create_meter(mtr_id, profile_id, &mtr_error);
        if (ret) {
            VLOG_WARN("cannot create meter: %u with profile: %u, error: %s",
                      mtr_id, profile_id, mtr_error.message);
            return ret;
        }
    }

    ovs_mutex_lock(&meter->mutex);
    meter->valid = true;
    ovs_mutex_unlock(&meter->mutex);

    return ret;
}

int
netdev_doca_meter_get(ofproto_meter_id meter_id,
                      struct ofputil_meter_stats *stats,
                      uint16_t n_bands OVS_UNUSED)
{
    uint16_t port_id = NETDEV_DPDK_METER_PORT_ID;
    uint32_t mtr_id = meter_id.uint32;
    struct netdev_doca_meter *meter;
    struct rte_mtr_stats mtr_stats;
    struct rte_mtr_error error;
    uint64_t stats_mask;
    int ret;

    if (!netdev_is_flow_api_enabled()) {
        return 0;
    }

    if (mtr_id < 1 || mtr_id >= NETDEV_DPDK_MAX_METERS) {
        return -1;
    }
    /* If a meter's DPDK object was not created, do not attempt to query it.
     */
    meter = &netdev_doca_meters[mtr_id];
    if (!meter->valid) {
        return EOPNOTSUPP;
    }

    memset(&mtr_stats, 0x0, sizeof mtr_stats);
    if (ovs_doca_enabled()) {
        ret = ovs_doca_mtr_stats_read(port_id, mtr_id, &mtr_stats, &error);
    } else {
        ret = rte_mtr_stats_read(port_id, mtr_id, &mtr_stats, &stats_mask, 0,
                                 &error);
    }
    if (ret) {
        return -1;
    }
    stats->packet_in_count += (mtr_stats.n_pkts[RTE_COLOR_GREEN] +
                               mtr_stats.n_pkts[RTE_COLOR_YELLOW] +
                               mtr_stats.n_pkts[RTE_COLOR_RED]);
    stats->byte_in_count += (mtr_stats.n_bytes[RTE_COLOR_GREEN] +
                             mtr_stats.n_bytes[RTE_COLOR_YELLOW] +
                             mtr_stats.n_bytes[RTE_COLOR_RED]);

    stats->bands[0].packet_count += mtr_stats.n_pkts_dropped;
    stats->bands[0].byte_count += mtr_stats.n_bytes_dropped;
    return 0;
}

int
netdev_doca_meter_get_cfg(uint32_t of_meter_id, void *meter_cfg_)
{
    struct doca_flow_resource_meter_cfg *meter_cfg = meter_cfg_;
    struct netdev_doca_meter *meter;

    if (of_meter_id < 1 || of_meter_id >= NETDEV_DPDK_MAX_METERS) {
        VLOG_WARN("Failed to get configuration for meter ID %u: out of bound", of_meter_id);
        return -1;
    }

    meter = &netdev_doca_meters[of_meter_id];
    ovs_mutex_lock(&meter->mutex);
    if (meter->valid) {
        memcpy(meter_cfg, &meter->cfg, sizeof *meter_cfg);
        ovs_mutex_unlock(&meter->mutex);
        return 0;
    }
    ovs_mutex_unlock(&meter->mutex);
    return -1;
}

static int
netdev_doca_meter_del_(uint16_t port_id, uint32_t mtr_id, uint32_t profile_id)
{
    struct rte_mtr_error error;

    if (ovs_doca_enabled()) {
        return ovs_doca_delete_meter(port_id, mtr_id, &error);
    }

    if (rte_mtr_destroy(port_id, mtr_id, &error)) {
        return -1;
    }
    return rte_mtr_meter_profile_delete(port_id, profile_id, &error);
}

int
netdev_doca_meter_del(ofproto_meter_id meter_id,
                      struct ofputil_meter_stats *stats OVS_UNUSED,
                      uint16_t n_bands OVS_UNUSED)
{
    uint16_t port_id = NETDEV_DPDK_METER_PORT_ID;
    uint32_t mtr_id = meter_id.uint32;
    struct netdev_doca_meter *meter;
    uint32_t profile_id = mtr_id;
    int ret = 0;

    if (!netdev_is_flow_api_enabled()) {
        return 0;
    }

    if (mtr_id < 1 || mtr_id >= NETDEV_DPDK_MAX_METERS) {
        return -1;
    }
    /* If a meter's DPDK object was not created, do not attempt to destroy it.
     */
    meter = &netdev_doca_meters[mtr_id];
    if (!meter->valid) {
        return EOPNOTSUPP;
    }

    ovs_mutex_lock(&meter->mutex);
    meter->valid = false;
    if (meter->refcnt == 0) {
        ret = netdev_doca_meter_del_(port_id, mtr_id, profile_id);
    }
    ovs_mutex_unlock(&meter->mutex);
    return ret;
}

void
netdev_doca_meters_init(void)
{
    struct netdev_doca_meter *meter;
    uint32_t meter_id;

    for (meter_id = 0, meter = netdev_doca_meters;
         meter_id < NETDEV_DPDK_MAX_METERS; meter_id++, meter++) {
        ovs_mutex_init(&meter->mutex);
        meter->refcnt = 0;
        meter->valid = false;
    }
}

bool
netdev_doca_meter_ref(uint32_t meter_id)
{
    struct netdev_doca_meter *meter;
    bool ret = false;

    if (meter_id < 1 || meter_id >= NETDEV_DPDK_MAX_METERS) {
        return false;
    }

    meter = &netdev_doca_meters[meter_id];
    ovs_mutex_lock(&meter->mutex);
    if (meter->valid) {
        meter->refcnt++;
        ret = true;
    }
    ovs_mutex_unlock(&meter->mutex);
    return ret;
}

void
netdev_doca_meter_unref(uint32_t meter_id)
{
    struct netdev_doca_meter *meter;

    if (meter_id >= NETDEV_DPDK_MAX_METERS) {
        return;
    }

    meter = &netdev_doca_meters[meter_id];
    ovs_mutex_lock(&meter->mutex);
    meter->refcnt--;
    if (!meter->valid && meter->refcnt == 0) {
        uint16_t port_id = NETDEV_DPDK_METER_PORT_ID;
        uint32_t profile_id;

        profile_id = meter_id;
        netdev_doca_meter_del_(port_id, meter_id, profile_id);
    }
    ovs_mutex_unlock(&meter->mutex);
}

int
netdev_doca_set_config_sw_meter(struct netdev *netdev, const struct smap *args,
                                struct netdev_doca_sw_meter *requested_sw_meter,
                                bool dev_started,
                                char **errp)
{
    struct netdev_doca_sw_meter sw_meter;
    const char *sw_meter_str;
    const char *error;
    int err = 0;

    error = NULL;
    sw_meter_str = smap_get(args, "sw-meter");
    if (!sw_meter_str ||
        netdev_doca_sw_meter_parse(sw_meter_str, &sw_meter, &error)) {
        if (error) {
            if (dev_started) {
                VLOG_WARN("'%s': failed to parse software meter "
                          "'%s': %s", netdev_get_name(netdev), sw_meter_str,
                          error);
            } else {
                VLOG_WARN_BUF(errp, "'%s': failed to parse software meter "
                              "'%s': %s", netdev_get_name(netdev), sw_meter_str,
                              error);
                /* If this is the first configuration attempt for this port,
                 * this invalid sw-meter will be ignored and the port probed with
                 * no metering, while the user expected at least 'some' metering.
                 *
                 * As this is a flood protection feature, do not attach this port
                 * to the datapath. */
                err = EINVAL;
                goto out;
            }
        }
        ovs_doca_sw_meter_uninit(netdev);
        /* Reset the meter to its default value. */
        netdev_doca_sw_meter_copy(&sw_meter, &default_sw_meter);
        /* Set the default meter to packet-based. */
        sw_meter.config.flags |= OFPMF13_PKTPS;
    }

    sw_meter.dry_run = smap_get_bool(args, "sw-meter-dry-run", false);

    /* If this configuration is valid update the requested values and trigger a reconfigure. */
    if (!!sw_meter.band.rate) {
        if (ovs_doca_enabled()) {
            netdev_doca_sw_meter_copy(requested_sw_meter, &sw_meter);
            netdev_request_reconfigure(netdev);
        } else {
            VLOG_WARN("interface '%s': software metering is not "
                      "supported in OVS-DPDK mode. Configuration ignored",
                      netdev_get_name(netdev));
        }
    }

out:
    return err;
}

void
netdev_doca_set_config_port_dir(const char *devargs,
                                enum netdev_doca_port_dir *port_dir)
{
    char *rep;

    rep = strstr(devargs, "representor=");
    if (!rep) {
        *port_dir = NETDEV_DOCA_PORT_DIR_RX;
    } else {
        int len = strlen("representor=pf1");

        rep = strstr(devargs, "representor=pf1");
        if (rep && (rep[len] == '\0' || rep[len] == ',')) {
            *port_dir = NETDEV_DOCA_PORT_DIR_RX;
        } else {
            *port_dir = NETDEV_DOCA_PORT_DIR_TX;
        }
    }
}

static int
get_sys(const char *prefix, const char *devname, const char *suffix,
        char *outp, size_t maxlen)
{
    char str[PATH_MAX];
    size_t len;
    FILE *fp;
    char *p;
    int n;

    n = snprintf(str, sizeof str, "/sys/%s/%s/%s", prefix, devname, suffix);
    if (!(n >= 0 && n < sizeof str)) {
        return -1;
    }

    fp = fopen(str, "r");
    if (fp == NULL) {
        return -1;
    }

    p = fgets(str, sizeof str, fp);
    fclose(fp);

    if (p == NULL) {
        return -1;
    }

    /* The string is terminated by \n. Drop it. */
    if (outp) {
        len = strnlen(str, maxlen);
        if (maxlen <= len) {
            return -1;
        }
        ovs_strlcpy(outp, str, len);
    }

    return 0;
}

int
netdev_doca_get_phys_port_name(const char *devname, char *outp, size_t maxlen)
{
    return get_sys("class/net", devname, "phys_port_name", outp, maxlen);
}

static int
get_phys_switch_id(const char *devname, char *outp, size_t maxlen)
{
    return get_sys("class/net", devname, "phys_switch_id", outp, maxlen);
}

static int
get_bonding_slaves(const char *devname, char *outp, size_t maxlen)
{
    return get_sys("class/net", devname, "bonding/slaves", outp, maxlen);
}

static bool
is_mpesw(const char *pci)
{
    char tmp[PATH_MAX];
    DIR *dirp;

    /* Reading /sys/kernel/debug/mlx5/0000:08:00.0/lag/type requires
     * openvswitch to run as root. Instead check if we are in shared
     * fdb mode which means lag mode by checking the infiniband port
     * doesn't exists.
     */
    if (snprintf(tmp, sizeof tmp, "/sys/bus/pci/devices/%s/infiniband", pci) < 0) {
        return false;
    }
    if ((dirp = opendir(tmp)) == NULL) {
        return true;
    }
    closedir(dirp);

    return false;
}

/* PCI string size as appearing in sysfs */
#define PCI_DEV_STR_LEN sizeof("0000:08:00.0")

static int
get_pci(const char *name, char *pci, size_t maxlen)
{
    char device[PATH_MAX];
    char tmp[PATH_MAX];
    int len;

    if (maxlen <= PCI_DEV_STR_LEN) {
        return EINVAL;
    }

    if (snprintf(tmp, sizeof tmp, "/sys/class/net/%s/device", name) < 0) {
        return EINVAL;
    }
    len = readlink(tmp, device, sizeof device);
    if (len == 0) {
        return ENODEV;
    } else if (len < PCI_DEV_STR_LEN || len >= sizeof device) {
        return E2BIG;
    }
    /* The result is like this: "../../../0000:08:00.0".
     * Take the last 12 chars as the PCI address.
     */
    ovs_strlcpy(pci, &device[len - PCI_DEV_STR_LEN + 1], PCI_DEV_STR_LEN);
    return 0;
}

static int
get_primary_pci(const char *sw_id, char *pci, size_t maxlen)
{
    struct dirent *de;
    int err = EINVAL;
    DIR *dirp;

    if ((dirp = opendir("/sys/class/net")) == NULL) {
        goto out;
    }

    pci[0] = '\0';
    while ((de = readdir(dirp)) != NULL) {
        char p_switch_id[MAX_PHYS_ITEM_ID_LEN];
        char p_port_name[IFNAMSIZ];

        if (netdev_doca_get_phys_port_name(de->d_name, p_port_name, sizeof p_port_name)) {
            continue;
        }
        if (strcmp(p_port_name, "p0")) {
            continue;
        }

        if (get_phys_switch_id(de->d_name, p_switch_id, sizeof p_switch_id)) {
            continue;
        }
        if (strcmp(p_switch_id, sw_id)) {
            continue;
        }

        err = get_pci(de->d_name, pci, maxlen);
        break;
    }
    closedir(dirp);
out:
    return err;
}

static int
get_dpdk_iface_name(const char *name, char iface[IFNAMSIZ])
{
    char phys_port_name[IFNAMSIZ];
    char slaves[PATH_MAX];
    char *save_ptr;
    char *lower;

    /* In case the device is a bond, there is a lower_p0 symbolic link, with
     * the format of ../../.../<lower-dev>. Extract the lower device.
     */

    if (get_bonding_slaves(name, slaves, sizeof slaves)) {
        goto fallback;
    }

    lower = strtok_r(slaves, " ", &save_ptr);
    while (lower) {
        if (!netdev_doca_get_phys_port_name(lower, phys_port_name, sizeof phys_port_name) &&
            !strcmp(phys_port_name, "p0")) {
            break;
        }
        lower = strtok_r(NULL, " ", &save_ptr);
    }

    if (!lower) {
        goto fallback;
    }

    /* Reached here if found a lower device p0. */
    ovs_strlcpy(iface, lower, IFNAMSIZ);
    goto out;

fallback:
    ovs_strlcpy(iface, name, IFNAMSIZ);
out:
    return 0;
}

char *
netdev_doca_generate_devargs(const char *name, char *devargs, size_t maxlen,
                             char iface[IFNAMSIZ])
{
    char phys_port_name_[IFNAMSIZ], *phys_port_name = phys_port_name_;
    char phys_sw_id[MAX_PHYS_ITEM_ID_LEN];
    char iface_tmp[IFNAMSIZ];
    char device[PATH_MAX];
    char *mlx5_devargs;
    bool mpesw;
    bool is_pf;
    char *pci;
    int port;
    int len;

    if (get_dpdk_iface_name(name, iface_tmp)) {
        return NULL;
    }
    name = iface_tmp;
    ovs_strlcpy(iface, name, IFNAMSIZ);

    if (get_pci(name, device, sizeof device)) {
        return NULL;
    }
    pci = device;

    if (netdev_doca_get_phys_port_name(name, phys_port_name_, sizeof phys_port_name_)) {
        return NULL;
    }
    /* In some kernels, there is a controller prefix. e.g. "c1". Ignore it. */
    if (sscanf(phys_port_name, "c%d", &port) == 1) {
        phys_port_name += 2;
    }

    if (get_phys_switch_id(name, phys_sw_id, sizeof phys_sw_id)) {
        return NULL;
    }

    is_pf = false;

    if (sscanf(phys_port_name, "p%d", &port) == 1) {
        is_pf =  true;
    } else if (sscanf(phys_port_name, "pf%d", &port) != 1) {
        return NULL;
    }

    mpesw = is_mpesw(pci);
    if (port > 0 && mpesw) {
        if (get_primary_pci(phys_sw_id, device, sizeof device)) {
            return NULL;
        }
    }

    if (ovs_doca_enabled()) {
        mlx5_devargs =
            "dv_xmeta_en=4,"
            "dv_flow_en=2,"
            "reuse_tag=1,"
            "fdb_def_rule_en=0,"
            "repr_matching_en=0,"
            "vport_match=1,"
            "mpesw_suppress_pf_rep=1";
    } else {
        mlx5_devargs = "dv_xmeta_en=1";
    }

    len = strlen(phys_port_name);
    /* hpf's phys_port_name is pf0/pf1. */
    if (len == 3 && !strncmp(phys_port_name, "pf", 2)) {
        if (snprintf(devargs, maxlen, "%s,%s,representor=pf%dvf65535", pci,
                     mlx5_devargs, port) < 0) {
            return NULL;
        }
        return devargs;
    }

    /* pf ports */
    if (is_pf) {
        if (!mpesw || port == 0) {
            len = snprintf(devargs, maxlen, "%s,%s", pci, mlx5_devargs);
        } else {
            len = snprintf(devargs, maxlen, "%s,%s,representor=pf%d", pci,
                           mlx5_devargs, port);
        }
        if (len < 0) {
            return NULL;
        }
        return devargs;
    }

    /* representors */
    if (snprintf(devargs, maxlen, "%s,%s,representor=%s", pci, mlx5_devargs,
                 phys_port_name) < 0) {
        return NULL;
    }
    return devargs;
}

bool
netdev_doca_is_ethdev(struct netdev *netdev)
{
    return netdev_doca_get_port_id(netdev) != -1;
}

int
netdev_doca_setup_hairpin_queues(struct netdev *netdev, dpdk_port_t port_id,
                                 int rxq_size, int txq_size,
                                 int n_rxq, int n_txq,
                                 struct rte_eth_hairpin_conf *hairpin_conf)
{
    int diag = 0;
    int i;

    for (i = 0; i < NR_HAIRPINQ; i++) {
         hairpin_conf->peers[0].port = port_id;
         hairpin_conf->peers[0].queue = i + n_txq;
         diag = rte_eth_rx_hairpin_queue_setup
                   (port_id, n_rxq + i, rxq_size, hairpin_conf);
         if (diag) {
             VLOG_ERR("Interface %s unable to setup rx hairpinq(%d): %s",
                      netdev_get_name(netdev), n_rxq + i, rte_strerror(-diag));
             return diag;
         }

         hairpin_conf->peers[0].queue = i + n_rxq;
         diag = rte_eth_tx_hairpin_queue_setup
                   (port_id, n_txq + i, txq_size, hairpin_conf);
         if (diag) {
             VLOG_ERR("Interface %s unable to setup tx hairpinq(%d): %s",
                      netdev_get_name(netdev), n_txq + i, rte_strerror(-diag));
             return diag;
         }
    }

    return diag;
}

const char *
netdev_doca_adapt_doca_mlx5_devargs(const char *devargs, char *new_devargs, size_t maxlen)
{
    struct {
        const char *key;
        const char *value;
    } args[] = {
        { "reuse_tag", "1", },
        { "fdb_def_rule_en", "0", },
        { "repr_matching_en", "0", },
        { "vport_match", "1", },
        { "mpesw_suppress_pf_rep", "1", },
    };
    char *tail = new_devargs;

    if (!devargs || !new_devargs || maxlen == 0) {
        return devargs;
    }

    ovs_strcat(new_devargs, maxlen, &tail, devargs);

    for (int k = 0; k < ARRAY_SIZE(args); k++) {
        if (!strstr(devargs, args[k].key)) {
            ovs_strcat(new_devargs, maxlen, &tail, ",");
            ovs_strcat(new_devargs, maxlen, &tail, args[k].key);
            ovs_strcat(new_devargs, maxlen, &tail, "=");
            ovs_strcat(new_devargs, maxlen, &tail, args[k].value);
        }
    }

    return new_devargs;
}

void
netdev_doca_get_custom_stats_sw_meter(const struct netdev *netdev,
                                      struct netdev_custom_stats *custom_stats)
{
    struct rte_flow_query_count query;
    int sw_stats_size;

    memset(&query, 0, sizeof query);
    if (!netdev_offload_doca_sw_meter_drops(netdev, &query)) {
        struct netdev_custom_counter *counter;
        enum {
            QUERY_PACKETS,
            QUERY_BYTES,
            QUERY_NUM_STATS,
        };

        sw_stats_size = custom_stats->size;
        custom_stats->size += QUERY_NUM_STATS;
        custom_stats->counters = xrealloc(custom_stats->counters,
                                          custom_stats->size *
                                          sizeof *custom_stats->counters);
        counter = &custom_stats->counters[sw_stats_size];

        counter[QUERY_PACKETS].value = query.hits;
        ovs_strlcpy(counter[QUERY_PACKETS].name,
                    "sw_meter_dropped_packets",
                    NETDEV_CUSTOM_STATS_NAME_SIZE);

        counter[QUERY_BYTES].value = query.bytes;
        ovs_strlcpy(counter[QUERY_BYTES].name,
                    "sw_meter_dropped_bytes",
                    NETDEV_CUSTOM_STATS_NAME_SIZE);
    }
}

static int
get_first_dir_entry(const char *dir, char *peer, size_t maxlen)
{
    struct dirent *de;
    int rv = 0;
    DIR *dirp;
    int n;

    if ((dirp = opendir(dir)) == NULL) {
        return -1;
    }
    while ((de = readdir(dirp)) != NULL) {
        if (de->d_name[0] == '.') {
            continue;
        }

        n = strlen(de->d_name);
        if (n >= maxlen) {
            rv = -1;
            break;
        }
        ovs_strlcpy(peer, de->d_name, maxlen);
        break;
    }
    closedir(dirp);

    return rv;
}

static int
get_rep_peer_vf(const char *name, const char *phys_port_name, char *peer, size_t maxlen)
{
    char str[PATH_MAX];
    int vf_num;
    int n;

    if (sscanf(phys_port_name + strlen("pfX"), "vf%d", &vf_num) != 1) {
        return -1;
    }

    n = snprintf(str, sizeof str, "/sys/class/net/%s/device/virtfn%d/net", name, vf_num);
    if (!(n >= 0 && n < sizeof str)) {
        return -1;
    }

    if (get_first_dir_entry(str, peer, maxlen)) {
        return -1;
    }

    return 0;
}

static int
get_rep_peer_sf(const char *name, const char *phys_port_name, char *peer, size_t maxlen)
{
    char str[PATH_MAX];
    struct dirent *de;
    int sf_num;
    int rv = 0;
    DIR *dirp;
    int n;

    if (sscanf(phys_port_name + strlen("pfX"), "sf%d", &sf_num) != 1) {
        return -1;
    }

    n = snprintf(str, sizeof str, "/sys/class/net/%s/device", name);
    if (!(n >= 0 && n < sizeof str)) {
        return -1;
    }

    if ((dirp = opendir(str)) == NULL) {
        return -1;
    }
    while ((de = readdir(dirp)) != NULL) {
        char sf_num_name[IFNAMSIZ];

        if (!strstr(de->d_name, "mlx5_core.sf")) {
            continue;
        }

        /* Check SF number. */
        n = snprintf(str, sizeof str, "device/%s/sfnum", de->d_name);
        if (!(n >= 0 && n < sizeof str)) {
            rv = -1;
            break;
        }
        if (get_sys("class/net", name, str, sf_num_name, sizeof sf_num_name)) {
            rv = -1;
            break;
        }
        if (sf_num != atoi(sf_num_name)) {
            continue;
        }

        /* Get the device name. */
        n = snprintf(str, sizeof str, "/sys/class/net/%s/device/%s/net", name, de->d_name);
        if (!(n >= 0 && n < sizeof str)) {
            rv = -1;
            break;
        }
        if (get_first_dir_entry(str, peer, maxlen)) {
            rv = -1;
        }
        break;
    }
    closedir(dirp);

    return rv;
}

int
get_rep_peer(const char name[], char *peer, size_t maxlen)
{
    char phys_port_name_[IFNAMSIZ], *phys_port_name = phys_port_name_;
    int port;

    if (netdev_doca_get_phys_port_name(name, phys_port_name_, sizeof phys_port_name_)) {
        return -1;
    }
    /* In some kernels, there is a controller prefix. e.g. "c1". Ignore it. */
    if (sscanf(phys_port_name, "c%d", &port) == 1) {
        phys_port_name += 2;
    }

    if (strstr(phys_port_name, "vf")) {
        return get_rep_peer_vf(name, phys_port_name, peer, maxlen);
    }
    if (strstr(phys_port_name, "sf")) {
        return get_rep_peer_sf(name, phys_port_name, peer, maxlen);
    }

    return -1;
}

struct get_rep_aux {
    const char *dev;
    char *rep;
    size_t maxlen;
    int rv;
};

static bool
get_rep_cb(struct netdev *netdev,
           odp_port_t odp_port OVS_UNUSED,
           void *aux_)
{
    const char *name = netdev_get_name(netdev);
    struct get_rep_aux *aux = aux_;
    char peer_[IFNAMSIZ], *peer;
    bool allocated = false;
    bool rv = false;

    if (sizeof peer_ < aux->maxlen) {
        peer = xmalloc(aux->maxlen);
        allocated = true;
    } else {
        peer = peer_;
    }

    if (get_rep_peer(name, peer, aux->maxlen)) {
        goto out;
    }
    if (strncmp(aux->dev, peer, aux->maxlen)) {
        goto out;
    }

    ovs_strlcpy(aux->rep, name, aux->maxlen);
    aux->rv = 0;
    rv = true;

out:
    if (allocated) {
        free(peer);
    }
    return rv;
}

int
netdev_doca_get_rep_name(const char dev[], char *rep, size_t maxlen)
{
    struct get_rep_aux aux = {
        .dev = dev,
        .rep = rep,
        .maxlen = maxlen,
        .rv = -1,
    };

    netdev_ports_traverse("doca", get_rep_cb, &aux);
    return aux.rv;
}

static bool
netdev_doca_can_use_port(const char *name)
{
    char generated[PATH_MAX];
    char iface[IFNAMSIZ];
    char *new_devargs;

    new_devargs = netdev_doca_generate_devargs(name, generated,
                                               sizeof generated, iface);
    return !!new_devargs;
}

uint32_t
netdev_doca_get_datapath_priority(const char *netdev_name, const char *dpif_type)
{
    if (!strcmp(dpif_type, "doca") && netdev_doca_can_use_port(netdev_name)) {
        return INT_MAX;
    }

    return 0;
}
