#!/bin/bash
# ex:ts=4:sw=4:sts=4:et
# -*- tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*-
#
# Copyright (c) 2020 Mellanox Technologies. All rights reserved.
#
# This Software is licensed under one of the following licenses:
#
# 1) under the terms of the "Common Public License 1.0" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/cpl.php.
#
# 2) under the terms of the "The BSD License" a copy of which is
#    available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/bsd-license.php.
#
# 3) under the terms of the "GNU General Public License (GPL) Version 2" a
#    copy of which is available from the Open Source Initiative, see
#    http://www.opensource.org/licenses/gpl-license.php.
#
# Licensee has the right to choose one of the above licenses.
#
# Redistributions of source code must retain the above copyright
# notice and one of the license notices.
#
# Redistributions in binary form must reproduce both the above copyright
# notice, one of the license notices in the documentation
# and/or other materials provided with the distribution.

# Search the Mellanox-provided iproute2/ethtool directories before the
# standard system locations.
PATH=/opt/mellanox/iproute2/sbin:/opt/mellanox/ethtool/sbin:/bin:/sbin:/usr/bin:/usr/sbin
# Timeouts (seconds); both can be overridden from the environment.
MLXCONFIG_TIMEOUT=${MLXCONFIG_TIMEOUT:-60}
RDMA_SET_NETNS_TIMEOUT=${RDMA_SET_NETNS_TIMEOUT:-3}

# Accumulated failure count of eswitch mode changes; used as the exit status.
RC=0

# Only continue when PCI device 00:00.0 is reported by lspci as a Mellanox
# PCI bridge; on any other system the script is a silent no-op.
is_bf=`lspci -s 00:00.0 2> /dev/null | grep -wq "PCI bridge: Mellanox Technologies" && echo 1 || echo 0`
if [ $is_bf -ne 1 ]; then
	exit 0
fi

# Script name; used as the syslog tag and in the single-instance check below.
prog=`basename $0`

# Single-instance guard: look up the oldest process (-o) whose full command
# line (-f) exactly matches (-x) one of the known bash/script path
# combinations. If such a process exists and it is not this one, exit.
PID=$(pgrep -oxf "/bin/bash /sbin/$prog" \
        || pgrep -oxf "/bin/bash /usr/sbin/$prog" \
        || pgrep -oxf "/usr/bin/bash /sbin/$prog" \
        || pgrep -oxf "/usr/bin/bash /usr/sbin/$prog")
if [[ -n $PID && $$ -ne $PID ]]; then
	# $prog is running already with PID: $PID
	exit 0
fi

# Log an informational message to syslog.
# Globals:   prog (read) - used as the syslog tag
# Arguments: all arguments are joined with spaces into one message
info()
{
	# Quote the tag so a script name containing whitespace (or an empty
	# $prog) cannot be split into, or swallow, other logger arguments.
	logger -t "$prog" -i "INFO: $*"
}

# Log an error message to syslog.
# Globals:   prog (read) - used as the syslog tag
# Arguments: all arguments are joined with spaces into one message
error()
{
	# Quote the tag so a script name containing whitespace (or an empty
	# $prog) cannot be split into, or swallow, other logger arguments.
	logger -t "$prog" -i "ERR: $*"
}


# Print the current flow steering mode of a PCI device.
# Arguments: $1 - PCI address of the device
# Outputs:   the mode on stdout, read from the compat sysfs attribute when
#            one exists under the device's netdev(s), otherwise taken from
#            the last field of the last line of the devlink parameter dump
get_steering_mode()
{
	pci_dev=$1

	compat=$(/bin/ls -1 /sys/bus/pci/devices/${pci_dev}/net/*/compat/devlink/steering_mode 2> /dev/null)
	if [ -z "$compat" ]; then
		# No compat attribute: ask devlink directly.
		devlink dev param show pci/${pci_dev} name flow_steering_mode | tail -1 | awk '{print $NF}'
		return
	fi

	cat ${compat} 2> /dev/null
}

# Set the flow steering mode of a PCI device and log the outcome.
# Arguments: $1 - PCI address of the device
#            $2 - steering mode to apply
# Returns:   exit status of the write to the compat sysfs attribute (when
#            one exists) or of the devlink command otherwise
set_steering_mode()
{
	pci_dev=$1
	mode=$2

	compat=$(/bin/ls -1 /sys/bus/pci/devices/${pci_dev}/net/*/compat/devlink/steering_mode 2> /dev/null)
	if [ -z "$compat" ]; then
		devlink dev param set pci/${pci_dev} name flow_steering_mode value "${mode}" cmode runtime
	else
		echo ${mode} > /sys/bus/pci/devices/${pci_dev}/net/*/compat/devlink/steering_mode
	fi
	rc=$?

	case $rc in
		0)
			info "Configured mode steering ${mode} for ${pci_dev}"
			;;
		*)
			error "Failed to configure steering mode ${mode} for ${pci_dev}"
			;;
	esac

	return $rc
}

# Print the eswitch mode of a PCI device.
# Arguments: $1 - PCI address of the device
# Outputs:   the third space-separated field of the
#            "devlink dev eswitch show" output (devlink errors are hidden)
get_eswitch_mode()
{
	pci_dev=$1

	devlink dev eswitch show pci/${pci_dev} 2> /dev/null \
		| cut -d ' ' -f 3
}

# Switch the eswitch of a PCI device to the requested mode and log the result.
# Arguments: $1 - PCI address of the device
#            $2 - eswitch mode to apply
# Returns:   exit status of the write to the compat sysfs attribute (when
#            one exists) or of the devlink command otherwise
set_eswitch_mode()
{
	pci_dev=$1
	mode=$2

	compat=$(/bin/ls -1 /sys/bus/pci/devices/${pci_dev}/net/*/compat/devlink/mode 2> /dev/null)
	if [ -z "$compat" ]; then
		devlink dev eswitch set pci/${pci_dev} mode ${mode}
	else
		echo ${mode} > ${compat}
	fi
	rc=$?

	case $rc in
		0)
			info "Configured ${mode} mode for ${pci_dev}"
			;;
		*)
			error "Failed to configure ${mode} mode for ${pci_dev}"
			;;
	esac

	return $rc
}

# Print the value of a device parameter.
# Arguments: $1 - PCI address of the device
#            $2 - parameter name
# Outputs:   the value on stdout, read from the compat sysfs attribute of
#            that name when one exists, otherwise the last field of the last
#            line printed by "devlink dev param show" (errors are hidden)
get_dev_param()
{
	pci_dev=$1
	name=$2

	compat=$(/bin/ls -1 /sys/bus/pci/devices/${pci_dev}/net/*/compat/devlink/${name} 2> /dev/null)
	if [ -z "$compat" ]; then
		# No compat attribute: ask devlink directly.
		devlink dev param show pci/${pci_dev} name ${name} 2> /dev/null | tail -1 | awk '{print $NF}'
		return
	fi

	cat ${compat} 2> /dev/null
}

# Set a device parameter and log the outcome.
# Arguments: $1 - PCI address of the device
#            $2 - parameter name
#            $3 - value to apply
#            $4 - optional prefix placed in front of the log message
# Returns:   exit status of the write to the compat sysfs attribute (when
#            one exists) or of the devlink command otherwise
set_dev_param()
{
	pci_dev=$1
	name=$2
	value=$3
	msg=$4

	compat=$(/bin/ls -1 /sys/bus/pci/devices/${pci_dev}/net/*/compat/devlink/${name} 2> /dev/null)
	if [ -z "$compat" ]; then
		devlink dev param set pci/${pci_dev} name ${name} value ${value} cmode runtime
	else
		echo ${value} > ${compat}
	fi
	rc=$?

	case $rc in
		0)
			info "${msg} Set ${name} parameter to ${value} value for ${pci_dev}"
			;;
		*)
			error "${msg} Failed to set parameter ${name} to ${value} value for ${pci_dev}"
			;;
	esac

	return $rc
}

# Run a command, retrying once per second while its combined stdout/stderr
# contains "Failed to open".
# Globals:   MLXCONFIG_TIMEOUT (read) - retry budget in seconds
# Arguments: the command line to run (joined into one string and re-split)
# Outputs:   the output of the first attempt that does not contain
#            "Failed to open"
# Exits:     with status 1 (after logging an error) once the retry budget is
#            exceeded; callers in this file invoke it through $(...), so
#            only that subshell terminates
run_mlxconfig()
{
	cmd="$*"
	SECONDS=0

	while :; do
		mft_output="$($cmd 2>&1)"
		if ! (echo "$mft_output" | grep -q "Failed to open"); then
			break
		fi
		if [ $SECONDS -gt $MLXCONFIG_TIMEOUT ]; then
			error "Failed to run $cmd"
			exit 1
		fi
		sleep 1
	done

	echo "$mft_output"
}

# Block until a network device shows up under /sys/class/net or the wait
# budget is exhausted.
# Globals:   OVS_BR_PORTS_TIMEOUT (read) - wait budget in seconds. It is
#            only given a default on the bridge-creation path of this
#            script, so fall back to 30 here as well; with an unset value
#            the numeric test errors out, the break is never taken and the
#            loop would spin forever for a netdev that never appears.
# Arguments: $1 - netdev name
# Returns:   0 in all cases; callers re-check the device themselves
wait_for_netdev()
{
	port=$1
	local timeout=${OVS_BR_PORTS_TIMEOUT:-30}

	SECONDS=0
	while ! [ -d "/sys/class/net/$port" ]
	do
		if [ "$SECONDS" -gt "$timeout" ]; then
			break
		fi
		sleep 1
	done
}

# Build the name of the shell variable that holds one saved OVS attribute.
# OVS bridge and port names may contain characters that are not valid in a
# shell identifier (for example "br-int" or "vxlan-1"), so every such
# character is mapped to "_" and a fixed prefix is added.
# Arguments: $1 - bridge or port name
#            $2 - attribute suffix (ports, options or type)
_ovs_saved_var()
{
	printf '_ovs_saved_%s_%s' "${1//[^a-zA-Z0-9_]/_}" "$2"
}

# Remember the current OVS layout so it can be re-created later.
# Globals:   ovs_brs (written) - whitespace separated list of bridges
#            _ovs_saved_* (written) - per bridge port list and, per port,
#            the interface options and type with the braces, commas and
#            double quotes of the ovs-vsctl output removed
ovs_save_conf()
{
	local ovs_br port ports
	local -a port_list

	ovs_brs=$(ovs-vsctl list-br)

	for ovs_br in $ovs_brs
	do
		# Split on whitespace without pathname expansion.
		read -r -a port_list <<< "$(ovs-vsctl list-ports "$ovs_br" | tr -s '\n' ' ')"
		ports="${port_list[*]}"
		# printf -v stores the value verbatim; nothing read from
		# ovs-vsctl is ever evaluated as shell code.
		printf -v "$(_ovs_saved_var "$ovs_br" ports)" '%s' "$ports"
		for port in "${port_list[@]}"
		do
			printf -v "$(_ovs_saved_var "$port" options)" '%s' \
				"$(ovs-vsctl get Interface "$port" options | sed 's/[{},"]//g')"
			printf -v "$(_ovs_saved_var "$port" type)" '%s' \
				"$(ovs-vsctl get Interface "$port" type | sed 's/[{},"]//g')"
		done
		info "Saved $ovs_br: $ports"
	done
}

# Delete every bridge recorded in ovs_brs.
# Globals:   ovs_brs (read)
# Returns:   0 when all bridges were removed (or there were none),
#            otherwise the exit status of the last failed removal
ovs_delete_conf()
{
	local ovs_br status rc=0

	for ovs_br in $ovs_brs
	do
		ovs-vsctl -t 3 del-br "$ovs_br"
		status=$?
		if [ $status -eq 0 ]; then
			info "Removed $ovs_br"
		else
			# Keep the failure even if a later bridge is removed fine.
			rc=$status
			error "Failed to remove $ovs_br"
		fi
	done

	return $rc
}

# Re-create the bridges and ports recorded by ovs_save_conf.
# Globals:   ovs_brs, _ovs_saved_* (read); vsctl (read) - ovs-vsctl command
# Ports saved without a type are waited for with wait_for_netdev before
# they are added; ports with a type are added right away.
ovs_restore_conf()
{
	local ovs_br port ref ovs_br_ports port_options port_type opt
	local -a opts args

	for ovs_br in $ovs_brs
	do
		ref=$(_ovs_saved_var "$ovs_br" ports)
		ovs_br_ports=${!ref}
		$vsctl add-br "$ovs_br"
		info "Created bridge: $ovs_br"
		for port in $ovs_br_ports
		do
			# The command is assembled as an argument array so that
			# saved values are passed literally instead of being
			# re-parsed by eval.
			args=(add-port "$ovs_br" "$port")

			ref=$(_ovs_saved_var "$port" options)
			read -r -a opts <<< "${!ref}"
			port_options=""
			for opt in "${opts[@]}"
			do
				port_options="${port_options:+$port_options }options:$opt"
			done

			ref=$(_ovs_saved_var "$port" type)
			port_type=${!ref}
			if [ -n "$port_type" ]; then
				port_type="type=$port_type"
			else
				wait_for_netdev "$port"
			fi

			if [[ -n "$port_type" || -n "$port_options" ]]; then
				args+=(-- set interface "$port")
				if [ -n "$port_type" ]; then
					args+=("$port_type")
				fi
				for opt in "${opts[@]}"
				do
					args+=("options:$opt")
				done
			fi

			$vsctl "${args[@]}"
			info "bridge $ovs_br: added port $port $port_type $port_options"
		done
		ip link set "$ovs_br" up
	done
}

# Path of the ovs-vsctl binary; stays empty when OVS is not installed.
vsctl=$(which ovs-vsctl 2> /dev/null)

# Check whether a port of a device is configured with the IB link type.
# Globals:   mftconfig (read) - configuration tool to invoke
# Arguments: $1 - PCI address of the device
#            $2 - port number used in the LINK_TYPE_P<n> parameter name
# Returns:   0 when the third field of the LINK_TYPE_P<n> line of the
#            "-e q" query output is IB(1), 1 otherwise
is_port_ib()
{
	local dev=$1
	local port=$2
	local mft_output

	mft_output=$(run_mlxconfig "$mftconfig -d ${dev} -e q LINK_TYPE_P$port")

	# The pattern is quoted: left bare, the shell would replace it with any
	# file in the working directory that matches the glob LINK_TYPE_P.*
	if echo "$mft_output" | grep -o 'LINK_TYPE_P.*' | awk '{print $3}' | grep -q "IB(1)"; then
		return 0
	fi
	return 1
}

# Check whether a device reports INTERNAL_CPU_MODEL as EMBEDDED_CPU(1) and
# log which mode was found.
# Globals:   mftconfig (read) - configuration tool to invoke
# Arguments: $1 - PCI address of the device
#            $2 - port number (accepted for symmetry with is_port_ib; unused)
# Returns:   0 when the third field of the INTERNAL_CPU_MODEL line of the
#            "-e q" query output is EMBEDDED_CPU(1), 1 otherwise
is_smartnic_mode()
{
	local dev=$1
	local port=$2
	local mft_output

	mft_output=$(run_mlxconfig "$mftconfig -d ${dev} -e q INTERNAL_CPU_MODEL")

	# Both the tool output and the pattern are quoted: left bare, the shell
	# would glob-expand a "*" in the output and could replace the pattern
	# with a file name matching INTERNAL_CPU_MODEL.* in the working directory.
	if echo "$mft_output" | grep -o 'INTERNAL_CPU_MODEL.*' | awk '{print $3}' | grep -q "EMBEDDED_CPU(1)"; then
		info "Device ${dev} is in SmartNIC mode"
		return 0
	fi
	info "Device ${dev} is in SEPERATED_HOST mode"
	return 1
}

# Detect Secure Boot from the mokutil state report.
is_SecureBoot=0
if (mokutil --sb-state 2>&1 | grep -q "SecureBoot enabled"); then
	is_SecureBoot=1
fi

# With Secure Boot enabled, run "mst start" unless an mst pciconf0 device
# node is already present under /dev/mst.
if [ $is_SecureBoot -eq 1 ]; then
	mst_dev=`/bin/ls -1 /dev/mst/mt*_pciconf0 2> /dev/null`
	if [ ! -n "${mst_dev}" ]; then
		mst start > /dev/null 2>&1
	fi
fi

# Firmware configuration tool: mlxconfig when installed in /usr/bin,
# mstconfig otherwise.
mftconfig=mstconfig
if [ -x /usr/bin/mlxconfig ]; then
	mftconfig=mlxconfig
fi

# Optional device settings; values from the file win over the defaults below.
if [ -f /etc/mellanox/mlnx-bf.conf ]; then
	. /etc/mellanox/mlnx-bf.conf
fi
IPSEC_FULL_OFFLOAD=${IPSEC_FULL_OFFLOAD:-"no"}
LAG_HASH_MODE=${LAG_HASH_MODE:-"yes"}
ENABLE_ESWITCH_MULTIPORT=${ENABLE_ESWITCH_MULTIPORT:-"no"}

# Optional OVS settings (may define the OVS_* variables used further down).
if [ -f /etc/mellanox/mlnx-ovs.conf ]; then
	. /etc/mellanox/mlnx-ovs.conf
fi

# OVS_CONF_SAVED becomes 1 once an existing OVS layout has been recorded;
# OVS_CONF_RESTORE controls whether that layout is re-created afterwards.
OVS_CONF_SAVED=0
OVS_CONF_RESTORE=${OVS_CONF_RESTORE:-"yes"}

# When ovs-vsctl is available: record the existing bridges (if any), delete
# them, and remove the ingress tc filters from every network device. This
# happens before the eswitch mode of the devices is changed below.
if [ -n "$vsctl" ]; then
	ovsbr_number=`$vsctl list-br | wc -l`
	if [ $ovsbr_number -gt 0 ]; then
		ovs_save_conf
		# ovs_save_conf fills ovs_brs; only flag the configuration as
		# saved when it actually found bridges.
		if [ -n "$ovs_brs" ]; then
			OVS_CONF_SAVED=1
		fi
	fi

	# Iterates over ovs_brs, so this is a no-op when nothing was saved.
	ovs_delete_conf
	# Best effort: failures (e.g. no ingress qdisc on a device) are ignored.
	for i in $(/bin/ls -1 /sys/class/net/); do tc filter del dev $i ingress > /dev/null 2>&1; done
fi

# Put the RDMA subsystem into network-namespace exclusive mode unless
# RDMA_SET_NETNS_EXCLUSIVE is set to something other than "yes".
RDMA_SET_NETNS_EXCLUSIVE=${RDMA_SET_NETNS_EXCLUSIVE:-"yes"}
if [ "X${RDMA_SET_NETNS_EXCLUSIVE}" == "Xyes" ]; then
	# Only attempt the change when the mode is not exclusive already.
	if ! (rdma system show netns 2>&1 | grep -q exclusive); then
		SECONDS=0
		output=$(rdma system set netns exclusive 2>&1)
		rc=$?
		# Retry once per second until the command succeeds or
		# RDMA_SET_NETNS_TIMEOUT seconds have elapsed.
		while [ $rc -ne 0 ]; do
			if [ $SECONDS -gt $RDMA_SET_NETNS_TIMEOUT ]; then
				break
			fi
			sleep 1
			output=$(rdma system set netns exclusive 2>&1)
			rc=$?
		done
	fi

	# Report based on the mode actually in effect, not on the exit status
	# of the set command; a failure here is logged but does not touch RC.
	if (rdma system show netns 2>&1 | grep -q exclusive); then
		info "The RDMA subsystem is set in network namespace exclusive mode."
	else
		error "Failed to set rdma exclusive mode: $output"
	fi
fi

# Configure every matching device. num_of_devs counts the devices that end
# up in switchdev mode; RC accumulates the exit statuses of failed eswitch
# mode changes.
num_of_devs=0
# Mellanox (vendor 15b3) PCI functions whose lspci line matches a2d2, a2d6
# or a2dc; the first field is the full PCI address.
for dev in `lspci -nD -d 15b3: | grep 'a2d[26c]' | cut -d ' ' -f 1`
do
	# Wait until the configuration tool can query the device. On timeout
	# the error is logged and processing of this device still continues.
	SECONDS=0
	while ! ($mftconfig -d ${dev} -e q INTERNAL_CPU_MODEL > /dev/null 2>&1); do
		if [ $SECONDS -gt $MLXCONFIG_TIMEOUT ]; then
			error "Failed to run $mftconfig query on $dev"
			break
		fi
		sleep 1
	done

	# Port number = last character of the PCI address (the function
	# number) plus one.
	port=$(( ${dev: -1} + 1 ))
	if (is_port_ib "$dev" "$port"); then
		info "Link type is IB for ${dev}. Skipping mode confiugration."
		continue
	fi

	# Everything below applies only to devices reporting EMBEDDED_CPU(1).
	if (is_smartnic_mode "$dev" "$port"); then
		# LAG port selection: queue_affinity when hash mode is disabled,
		# otherwise multiport_esw when LAG_MULTIPORT_ESW_MODE is "yes".
		if [ "X${LAG_HASH_MODE}" == "Xno" ]; then
			set_dev_param ${dev} lag_port_select_mode queue_affinity
		elif [ "X${LAG_MULTIPORT_ESW_MODE}" == "Xyes" ]; then
			# The eswitch is moved to legacy mode before the parameter
			# is changed (same pattern in the sections below).
			eswitch_mode=`get_eswitch_mode ${dev}`
			if [ "${eswitch_mode}" != "legacy" ]; then
				set_eswitch_mode ${dev} legacy
				RC=$((RC+$?))
			fi

			set_dev_param ${dev} lag_port_select_mode multiport_esw
		fi

		# Optionally disable encapsulation.
		if [ "X${ENCAP_NONE_MODE}" == "Xyes" ]; then
			eswitch_mode=`get_eswitch_mode ${dev}`
			if [ "${eswitch_mode}" != "legacy" ]; then
				set_eswitch_mode ${dev} legacy
				RC=$((RC+$?))
			fi

			set_dev_param ${dev} encap none
		fi

		# Switch the steering mode from dmfs to smfs.
		steering_mode=`get_steering_mode ${dev}`
		if [ "${steering_mode}" == "dmfs" ]; then
			eswitch_mode=`get_eswitch_mode ${dev}`
			if [ "${eswitch_mode}" != "legacy" ]; then
				set_eswitch_mode ${dev} legacy
				RC=$((RC+$?))
			fi

			set_steering_mode ${dev} smfs
		fi

		# IPsec full offload: requires the lscpu Flags line to list sha1,
		# sha2 and aes; in that case the steering mode is moved back from
		# smfs to dmfs.
		if [ "${IPSEC_FULL_OFFLOAD}" == "yes" ]; then
			lscpu | grep Flags | grep sha1 | grep sha2 | grep -q aes
			if [ $? -eq 0 ]; then
				eswitch_mode=`get_eswitch_mode ${dev}`
				if [ "${eswitch_mode}" != "legacy" ]; then
					set_eswitch_mode ${dev} legacy
					RC=$((RC+$?))
				fi
				steering_mode=`get_steering_mode ${dev}`
				if [ "${steering_mode}" == "smfs" ]; then
					set_steering_mode ${dev} dmfs
				fi
			else
				info "Crypto disabled on this devide. Skipping IPsec mode configuration."
			fi
		fi

		# Finally move the eswitch to switchdev mode, optionally setting
		# the vport match mode to legacy first.
		eswitch_mode=`get_eswitch_mode ${dev}`
		if [ "${eswitch_mode}" != "switchdev" ]; then
			if [ "X${LEGACY_METADATA_MATCH_MODE}" == "Xyes" ]; then
				set_dev_param ${dev} vport_match_mode legacy
			fi

			set_eswitch_mode ${dev} switchdev
			RC=$((RC+$?))
		fi
		# Re-read the mode: only devices that really are in switchdev mode
		# are counted and get the multiport eswitch parameter.
		eswitch_mode=`get_eswitch_mode ${dev}`
		if [ "${eswitch_mode}" == "switchdev" ]; then
			if [ "X${ENABLE_ESWITCH_MULTIPORT}" == "Xyes" ]; then
				set_dev_param ${dev} esw_multiport  1 "ESW Multiport:"
			fi
			num_of_devs=$((num_of_devs+1))
		fi
	fi
done

# Chroot detection: compare device:inode of "/" with that of the root
# directory of PID 1. When they differ, stop here with success.
if [ "$(stat -c %d:%i /)" != "$(stat -c %d:%i /proc/1/root/.)" ]; then
	info "Running in chroot environment. Exiting..."
	exit 0
fi

# Any failed eswitch mode change above aborts the OVS setup.
if [ $RC -ne 0 ]; then
	error "Exiting due to failures. RC=$RC"
	exit $RC
fi

# Nothing to bridge when no device reached switchdev mode.
if [ $num_of_devs -eq 0 ]; then
	info "No devices configured to switchdev mod. Skipping SF/Bridges configuration."
	exit 0
fi

if [ ! -n "$vsctl" ]; then
	info "OVS is not installed. Skipping OVS bridges creation."
	exit 0
fi

# Configure the default OVS bridge
# (names, member ports and options can be overridden from the environment
# or from the configuration files sourced earlier)
OVS_BRIDGE1=${OVS_BRIDGE1:-"ovsbr1"}
OVS_BRIDGE1_PORTS=${OVS_BRIDGE1_PORTS:-"p0 pf0hpf en3f0pf0sf0"}
OVS_BRIDGE2=${OVS_BRIDGE2:-"ovsbr2"}
OVS_BRIDGE2_PORTS=${OVS_BRIDGE2_PORTS:-"p1 pf1hpf en3f1pf1sf0"}
OVS_HW_OFFLOAD=${OVS_HW_OFFLOAD:-"yes"}
OVS_TIMEOUT=${OVS_TIMEOUT:-30}

# Work out the systemd unit name of the OVS service: the presence of the
# init script or of the unit file decides; otherwise ask systemd.
ovs_service=""
if [ -e /etc/init.d/openvswitch-switch ]; then
	ovs_service="openvswitch-switch.service"
elif [ -e /usr/lib/systemd/system/openvswitch.service ]; then
	ovs_service="openvswitch.service"
else
	ovs_service=`systemctl list-unit-files 2> /dev/null | grep -E "openvswitch.service|openvswitch-switch.service" | awk '{print $1}'`
fi

if ! (systemctl is-enabled $ovs_service 2> /dev/null | grep -wq enabled); then
	# OVS service is not enabled
	info "$ovs_service is not enabled. Exiting..."
	exit 0
fi

# Command line used by the ovs_restart function (the variable and the
# function share the same name).
ovs_restart="systemctl restart $ovs_service"

# Restart the OVS service and re-apply per-port settings afterwards.
# Globals:   ovs_service (read) - unit name; no restart when empty
#            ovs_restart (read) - restart command line
#            OVS_BRIDGE1_PORTS, OVS_BRIDGE2_PORTS (read)
ovs_restart()
{
	[ -z "$ovs_service" ] || {
		info "Restarting $ovs_service"
		$ovs_restart
	}

	# Re-apply udev settings removed by OVS
	if [ -x /lib/udev/mlnx_bf_udev ]; then
		for p in $(cd /sys/class/net; /bin/ls -d p* e*)
		do
			/lib/udev/mlnx_bf_udev $p > /dev/null 2>&1
		done
	fi

	# Turn hw-tc-offload on for every default bridge port that exists.
	for p in $OVS_BRIDGE1_PORTS $OVS_BRIDGE2_PORTS
	do
		[ -d /sys/class/net/$p ] || continue
		ethtool -K $p hw-tc-offload on
	done
}

# The OVS bridge setup runs as a background job (see the closing "} &"), so
# the script itself can return without waiting for NetworkManager or OVS.
# Every "exit" inside this group therefore ends only the background
# subshell and does not change the exit status of the script.
{
if [ -f /etc/mellanox/mlnx-sf.conf ]; then
    # Wait until NetworkManager is active to avoid race condition
    NETWORK_MANAGER_TIMEOUT=${NETWORK_MANAGER_TIMEOUT:-120}
    SECONDS=0
    while ! (systemctl is-active NetworkManager)
    do
        if [ $SECONDS -gt ${NETWORK_MANAGER_TIMEOUT} ]; then
            info "NetworkManager timeout: ${NETWORK_MANAGER_TIMEOUT}"
            break;
        fi
        sleep 0.1
    done
    # Sourced even when the wait above timed out.
    . /etc/mellanox/mlnx-sf.conf
fi

# Wait for the OVS database to answer; give up after OVS_TIMEOUT seconds.
SECONDS=0
while ! ($vsctl show > /dev/null 2>&1)
do
	if [ $SECONDS -gt $OVS_TIMEOUT ]; then
		info "$ovs_service is not up. Exiting..."
		exit 1
	fi
	sleep 1
done

if [[ $OVS_CONF_SAVED -eq 1 && "$OVS_CONF_RESTORE" == "yes" ]]; then
	# Re-create the bridges that were recorded and deleted earlier.
	info "Restoring original OVS configuration"
	ovs_restore_conf
else
	# Bridges already exist: leave them alone, restart OVS only when
	# hw-offload is already enabled, and stop.
	ovsbr_number=`$vsctl list-br | wc -l`
	if [ $ovsbr_number -gt 0 ]; then
		if ($vsctl get Open_vSwitch . Other_config 2> /dev/null | grep -q 'hw-offload="true"'); then
			ovs_restart
		fi
		exit $RC
	fi

	CREATE_OVS_BRIDGES=${CREATE_OVS_BRIDGES:-"yes"}
	if [ "X${CREATE_OVS_BRIDGES}" != "Xyes" ]; then
		exit $RC
	fi

	# NOTE(review): this default is only applied on this branch, yet
	# wait_for_netdev reads the variable on the restore path too.
	OVS_BR_PORTS_TIMEOUT=${OVS_BR_PORTS_TIMEOUT:-30}
	# One default bridge per device in switchdev mode: OVS_BRIDGE<i> with
	# the ports listed in OVS_BRIDGE<i>_PORTS (resolved by indirection).
	for i in `seq $num_of_devs`
	do
		br_name=OVS_BRIDGE${i}
		br_name=${!br_name}
		br_ports=OVS_BRIDGE${i}_PORTS
		br_ports=${!br_ports}

		if ($vsctl br-exists $br_name); then
			info "bridge $br_name exist already."
			ip link set $br_name up
			continue
		fi

		# Collect the ports that exist. A missing port matching pf*sf*
		# is tolerated; any other missing port skips the whole bridge.
		missing_port=0
		ovs_br_ports=""
		for port in $br_ports
		do
			wait_for_netdev $port

			if [ -d /sys/class/net/$port ]; then
				ovs_br_ports="$ovs_br_ports $port"
			else
				info "port device $port for bridge $br_name is missing."
				case $port in
					pf*sf*)
						info "RDMA functionality is not expected to work without $port in $br_name"
					;;
					*)
						missing_port=$((missing_port+1))
					;;
				esac
			fi
		done

		if [ $missing_port -gt 0 ]; then
			info "Skipping $br_name configuration."
			continue
		fi

		$vsctl add-br $br_name
		info "Created bridge: $br_name"
		for port in $ovs_br_ports
		do
			$vsctl add-port $br_name $port
			info "bridge $br_name: added port $port"
		done
		ip link set $br_name up

	done
fi

# Enable hardware offload and restart OVS so the setting takes effect.
if [ "X${OVS_HW_OFFLOAD}" == "Xyes" ]; then
	$vsctl set Open_vSwitch . Other_config:hw-offload=true
	if [ $? -eq 0 ]; then
		info "OVS HW offload is set"
		info "Going to restart $ovs_service to activate hw-offload"
		ovs_restart
	fi
fi
} &

# The foreground script does not wait for the background job above.
sync

exit $RC
