From patchwork Wed Mar 7 01:12:13 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [RFC,v3,net-next,01/18] sock: Fix SO_ZEROCOPY switch case X-Patchwork-Submitter: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> X-Patchwork-Id: 882329 X-Patchwork-Delegate: davem@davemloft.net Message-Id: <20180307011230.24001-2-jesus.sanchez-palencia@intel.com> To: netdev@vger.kernel.org Cc: jhs@mojatatu.com, xiyou.wangcong@gmail.com, jiri@resnulli.us, vinicius.gomes@intel.com, richardcochran@gmail.com, intel-wired-lan@lists.osuosl.org, anna-maria@linutronix.de, henrik@austad.us, tglx@linutronix.de, john.stultz@linaro.org, levi.pearson@harman.com, edumazet@google.com, willemb@google.com, mlichvar@redhat.com, Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> Date: Tue, 6 Mar 2018 17:12:13 -0800 From: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> List-Id: <netdev.vger.kernel.org> Fix the SO_ZEROCOPY switch case in sock_setsockopt() so that the ret values it sets are not overwritten by the one set in the default case. 
Fixes: 28190752c7092 ("sock: permit SO_ZEROCOPY on PF_RDS socket") Signed-off-by: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> Acked-by: Willem de Bruijn <willemb@google.com> --- arch/alpha/include/uapi/asm/socket.h | 5 arch/frv/include/uapi/asm/socket.h | 5 arch/ia64/include/uapi/asm/socket.h | 5 arch/m32r/include/uapi/asm/socket.h | 5 arch/mips/include/uapi/asm/socket.h | 5 arch/mn10300/include/uapi/asm/socket.h | 5 arch/parisc/include/uapi/asm/socket.h | 5 arch/s390/include/uapi/asm/socket.h | 5 arch/sparc/include/uapi/asm/socket.h | 5 arch/xtensa/include/uapi/asm/socket.h | 5 drivers/net/ethernet/intel/igb/e1000_defines.h | 16 drivers/net/ethernet/intel/igb/igb.h | 1 drivers/net/ethernet/intel/igb/igb_main.c | 243 +++++++--- include/linux/netdevice.h | 2 include/linux/posix-timers.h | 1 include/linux/skbuff.h | 3 include/net/pkt_sched.h | 7 include/net/sock.h | 4 include/uapi/asm-generic/socket.h | 5 include/uapi/linux/pkt_sched.h | 18 net/core/skbuff.c | 1 net/core/sock.c | 41 + net/ipv4/raw.c | 7 net/ipv4/udp.c | 10 net/packet/af_packet.c | 19 net/sched/Kconfig | 11 net/sched/Makefile | 1 net/sched/sch_api.c | 11 net/sched/sch_tbs.c | 591 +++++++++++++++++++++++++ 29 files changed, 978 insertions(+), 64 deletions(-) Index: linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h =================================================================== @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:115 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 62 +#define SCM_CLOCKID 63 + #endif /* _UAPI_ASM_SOCKET_H */ Index: linux-4.16.8-rt3/arch/frv/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/frv/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/frv/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:108 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define 
SCM_DROP_IF_LATE 62 +#define SCM_CLOCKID 63 + #endif /* _ASM_SOCKET_H */ Index: linux-4.16.8-rt3/arch/ia64/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/ia64/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/ia64/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:117 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 62 +#define SCM_CLOCKID 63 + #endif /* _ASM_IA64_SOCKET_H */ Index: linux-4.16.8-rt3/arch/m32r/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/m32r/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/m32r/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:108 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 62 +#define SCM_CLOCKID 63 + #endif /* _ASM_M32R_SOCKET_H */ Index: linux-4.16.8-rt3/arch/mips/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/mips/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/mips/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:126 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 62 +#define SCM_CLOCKID 63 + #endif /* _UAPI_ASM_SOCKET_H */ Index: linux-4.16.8-rt3/arch/mn10300/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/mn10300/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/mn10300/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:108 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 62 +#define SCM_CLOCKID 63 + #endif /* _ASM_SOCKET_H */ Index: 
linux-4.16.8-rt3/arch/parisc/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/parisc/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/parisc/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:107 @ #define SO_ZEROCOPY 0x4035 +#define SO_TXTIME 0x4036 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 0x4037 +#define SCM_CLOCKID 0x4038 + #endif /* _UAPI_ASM_SOCKET_H */ Index: linux-4.16.8-rt3/arch/s390/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/s390/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/s390/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:114 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 62 +#define SCM_CLOCKID 63 + #endif /* _ASM_SOCKET_H */ Index: linux-4.16.8-rt3/arch/sparc/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/sparc/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/sparc/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:104 @ #define SO_ZEROCOPY 0x003e +#define SO_TXTIME 0x003f +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 0x0040 +#define SCM_CLOCKID 0x0041 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 Index: linux-4.16.8-rt3/arch/xtensa/include/uapi/asm/socket.h =================================================================== --- linux-4.16.8-rt3.orig/arch/xtensa/include/uapi/asm/socket.h +++ linux-4.16.8-rt3/arch/xtensa/include/uapi/asm/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:119 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 62 +#define 
SCM_CLOCKID 63 + #endif /* _XTENSA_SOCKET_H */ Index: linux-4.16.8-rt3/drivers/net/ethernet/intel/igb/e1000_defines.h =================================================================== --- linux-4.16.8-rt3.orig/drivers/net/ethernet/intel/igb/e1000_defines.h +++ linux-4.16.8-rt3/drivers/net/ethernet/intel/igb/e1000_defines.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1069 @ #define E1000_TQAVCTRL_XMIT_MODE BIT(0) #define E1000_TQAVCTRL_DATAFETCHARB BIT(4) #define E1000_TQAVCTRL_DATATRANARB BIT(8) +#define E1000_TQAVCTRL_DATATRANTIM BIT(9) +#define E1000_TQAVCTRL_SP_WAIT_SR BIT(10) +/* Fetch Time Delta - bits 31:16 + * + * This field holds the value to be reduced from the launch time for + * fetch time decision. The FetchTimeDelta value is defined in 32 ns + * granularity. + * + * This field is 16 bits wide, and so the maximum value is: + * + * 65535 * 32 = 2097120 ~= 2.1 msec + * + * XXX: We are configuring the max value here since we couldn't come up + * with a reason for not doing so. + */ +#define E1000_TQAVCTRL_FETCHTIME_DELTA (0xFFFF << 16) /* TX Qav Credit Control fields */ #define E1000_TQAVCC_IDLESLOPE_MASK 0xFFFF Index: linux-4.16.8-rt3/drivers/net/ethernet/intel/igb/igb.h =================================================================== --- linux-4.16.8-rt3.orig/drivers/net/ethernet/intel/igb/igb.h +++ linux-4.16.8-rt3/drivers/net/ethernet/intel/igb/igb.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:284 @ struct igb_ring { u16 count; /* number of desc. 
in the ring */ u8 queue_index; /* logical index of the ring*/ u8 reg_idx; /* physical index of the ring */ + bool launchtime_enable; /* true if LaunchTime is enabled */ bool cbs_enable; /* indicates if CBS is enabled */ s32 idleslope; /* idleSlope in kbps */ s32 sendslope; /* sendSlope in kbps */ Index: linux-4.16.8-rt3/drivers/net/ethernet/intel/igb/igb_main.c =================================================================== --- linux-4.16.8-rt3.orig/drivers/net/ethernet/intel/igb/igb_main.c +++ linux-4.16.8-rt3/drivers/net/ethernet/intel/igb/igb_main.c @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1675 @ static void set_queue_mode(struct e1000_ wr32(E1000_I210_TQAVCC(queue), val); } +static bool is_any_cbs_enabled(struct igb_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_tx_queues; i++) { + if (adapter->tx_ring[i]->cbs_enable) + return true; + } + + return false; +} + +static bool is_any_txtime_enabled(struct igb_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_tx_queues; i++) { + if (adapter->tx_ring[i]->launchtime_enable) + return true; + } + + return false; +} + /** - * igb_configure_cbs - Configure Credit-Based Shaper (CBS) + * igb_config_tx_modes - Configure "Qav Tx mode" features on igb * @adapter: pointer to adapter struct * @queue: queue number - * @enable: true = enable CBS, false = disable CBS - * @idleslope: idleSlope in kbps - * @sendslope: sendSlope in kbps - * @hicredit: hiCredit in bytes - * @locredit: loCredit in bytes - * - * Configure CBS for a given hardware queue. When disabling, idleslope, - * sendslope, hicredit, locredit arguments are ignored. Returns 0 if - * success. Negative otherwise. - **/ -static void igb_configure_cbs(struct igb_adapter *adapter, int queue, - bool enable, int idleslope, int sendslope, - int hicredit, int locredit) + * + * Configure CBS and Launchtime for a given hardware queue. 
+ * Parameters are retrieved from the correct Tx ring, so + * igb_save_cbs_params() and igb_save_txtime_params() should be used + * for setting those correctly prior to this function being called. + **/ +static void igb_config_tx_modes(struct igb_adapter *adapter, int queue) { + struct igb_ring *ring = adapter->tx_ring[queue]; struct net_device *netdev = adapter->netdev; struct e1000_hw *hw = &adapter->hw; - u32 tqavcc; + u32 tqavcc, tqavctrl; u16 value; WARN_ON(hw->mac.type != e1000_i210); WARN_ON(queue < 0 || queue > 1); - if (enable) { + /* If any of the Qav features is enabled, configure queues as SR and + * with HIGH PRIO. If none is, then configure them with LOW PRIO and + * as SP. + */ + if (ring->cbs_enable || ring->launchtime_enable) { set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH); set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION); + } else { + set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW); + set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY); + } + + /* If CBS is enabled, set DataTranARB and config its parameters. */ + if (ring->cbs_enable) { + /* Always set data transfer arbitration to credit-based + * shaper algorithm on TQAVCTRL if CBS is enabled for any of + * the queues. + */ + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl |= E1000_TQAVCTRL_DATATRANARB; + wr32(E1000_I210_TQAVCTRL, tqavctrl); /* According to i210 datasheet section 7.2.7.7, we should set * the 'idleSlope' field from TQAVCC register following the @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1799 @ static void igb_configure_cbs(struct igb * calculated value, so the resulting bandwidth might * be slightly higher for some configurations. 
*/ - value = DIV_ROUND_UP_ULL(idleslope * 61034ULL, 1000000); + value = DIV_ROUND_UP_ULL(ring->idleslope * 61034ULL, 1000000); tqavcc = rd32(E1000_I210_TQAVCC(queue)); tqavcc &= ~E1000_TQAVCC_IDLESLOPE_MASK; tqavcc |= value; wr32(E1000_I210_TQAVCC(queue), tqavcc); - wr32(E1000_I210_TQAVHC(queue), 0x80000000 + hicredit * 0x7735); + wr32(E1000_I210_TQAVHC(queue), + 0x80000000 + ring->hicredit * 0x7735); } else { - set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_LOW); - set_queue_mode(hw, queue, QUEUE_MODE_STRICT_PRIORITY); /* Set idleSlope to zero. */ tqavcc = rd32(E1000_I210_TQAVCC(queue)); @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1817 @ static void igb_configure_cbs(struct igb /* Set hiCredit to zero. */ wr32(E1000_I210_TQAVHC(queue), 0); + + /* If CBS is not enabled for any queues anymore, then return to + * the default state of Data Transmission Arbitration on + * TQAVCTRL. + */ + if (!is_any_cbs_enabled(adapter)) { + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl &= ~E1000_TQAVCTRL_DATATRANARB; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } + } + + /* If LaunchTime is enabled, set DataTranTIM. */ + if (ring->launchtime_enable) { + /* Always set DataTranTIM on TQAVCTRL if LaunchTime is enabled + * for any of the SR queues, and configure fetchtime delta. + * XXX NOTE: + * - LaunchTime will be enabled for all SR queues. + * - A fixed offset can be added relative to the launch + * time of all packets if configured at reg LAUNCH_OS0. + * We are keeping it as 0 for now (default value). + */ + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl |= E1000_TQAVCTRL_DATATRANTIM | + E1000_TQAVCTRL_FETCHTIME_DELTA; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } else { + /* If Launchtime is not enabled for any SR queues anymore, + * then clear DataTranTIM on TQAVCTRL and clear fetchtime delta, + * effectively disabling Launchtime. 
+ */ + if (!is_any_txtime_enabled(adapter)) { + tqavctrl = rd32(E1000_I210_TQAVCTRL); + tqavctrl &= ~E1000_TQAVCTRL_DATATRANTIM; + tqavctrl &= ~E1000_TQAVCTRL_FETCHTIME_DELTA; + wr32(E1000_I210_TQAVCTRL, tqavctrl); + } } /* XXX: In i210 controller the sendSlope and loCredit parameters from @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1861 @ static void igb_configure_cbs(struct igb * configuration' in respect to these parameters. */ - netdev_dbg(netdev, "CBS %s: queue %d idleslope %d sendslope %d hiCredit %d locredit %d\n", - (enable) ? "enabled" : "disabled", queue, - idleslope, sendslope, hicredit, locredit); + netdev_dbg(netdev, "Qav Tx mode: cbs %s, launchtime %s, queue %d \ + idleslope %d sendslope %d hiCredit %d \ + locredit %d\n", + (ring->cbs_enable) ? "enabled" : "disabled", + (ring->launchtime_enable) ? "enabled" : "disabled", queue, + ring->idleslope, ring->sendslope, ring->hicredit, + ring->locredit); +} + +static int igb_save_txtime_params(struct igb_adapter *adapter, int queue, + bool enable) +{ + struct igb_ring *ring; + + if (queue < 0 || queue > adapter->num_tx_queues) + return -EINVAL; + + ring = adapter->tx_ring[queue]; + ring->launchtime_enable = enable; + + return 0; } static int igb_save_cbs_params(struct igb_adapter *adapter, int queue, @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1904 @ static int igb_save_cbs_params(struct ig return 0; } -static bool is_any_cbs_enabled(struct igb_adapter *adapter) -{ - struct igb_ring *ring; - int i; - - for (i = 0; i < adapter->num_tx_queues; i++) { - ring = adapter->tx_ring[i]; - - if (ring->cbs_enable) - return true; - } - - return false; -} - +/** + * igb_setup_tx_mode - Switch to/from Qav Tx mode when applicable + * @adapter: pointer to adapter struct + * + * Configure TQAVCTRL register switching the controller's Tx mode + * if FQTSS mode is enabled or disabled. 
Additionally, will issue + * a call to igb_config_tx_modes() per queue so any previously saved + * Tx parameters are applied. + **/ static void igb_setup_tx_mode(struct igb_adapter *adapter) { struct net_device *netdev = adapter->netdev; @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1927 @ static void igb_setup_tx_mode(struct igb int i, max_queue; /* Configure TQAVCTRL register: set transmit mode to 'Qav', - * set data fetch arbitration to 'round robin' and set data - * transfer arbitration to 'credit shaper algorithm. + * set data fetch arbitration to 'round robin', set SP_WAIT_SR + * so SP queues wait for SR ones. */ val = rd32(E1000_I210_TQAVCTRL); - val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_DATATRANARB; + val |= E1000_TQAVCTRL_XMIT_MODE | E1000_TQAVCTRL_SP_WAIT_SR; val &= ~E1000_TQAVCTRL_DATAFETCHARB; wr32(E1000_I210_TQAVCTRL, val); @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1972 @ static void igb_setup_tx_mode(struct igb adapter->num_tx_queues : I210_SR_QUEUES_NUM; for (i = 0; i < max_queue; i++) { - struct igb_ring *ring = adapter->tx_ring[i]; - - igb_configure_cbs(adapter, i, ring->cbs_enable, - ring->idleslope, ring->sendslope, - ring->hicredit, ring->locredit); + igb_config_tx_modes(adapter, i); } } else { wr32(E1000_RXPBS, I210_RXPBSIZE_DEFAULT); @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2545 @ igb_features_check(struct sk_buff *skb, return features; } +static void igb_offload_apply(struct igb_adapter *adapter, s32 queue) +{ + if (!is_fqtss_enabled(adapter)) { + enable_fqtss(adapter, true); + return; + } + + igb_config_tx_modes(adapter, queue); + + if (!is_any_cbs_enabled(adapter) && !is_any_txtime_enabled(adapter)) + enable_fqtss(adapter, false); +} + static int igb_offload_cbs(struct igb_adapter *adapter, struct tc_cbs_qopt_offload *qopt) { @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2578 @ static int igb_offload_cbs(struct igb_ad if (err) return err; - if (is_fqtss_enabled(adapter)) { - 
igb_configure_cbs(adapter, qopt->queue, qopt->enable, - qopt->idleslope, qopt->sendslope, - qopt->hicredit, qopt->locredit); + igb_offload_apply(adapter, qopt->queue); - if (!is_any_cbs_enabled(adapter)) - enable_fqtss(adapter, false); + return 0; +} - } else { - enable_fqtss(adapter, true); - } +static int igb_offload_txtime(struct igb_adapter *adapter, + struct tc_tbs_qopt_offload *qopt) +{ + struct e1000_hw *hw = &adapter->hw; + int err; + + /* Launchtime offloading is only supported by i210 controller. */ + if (hw->mac.type != e1000_i210) + return -EOPNOTSUPP; + + /* Launchtime offloading is only supported by queues 0 and 1. */ + if (qopt->queue < 0 || qopt->queue > 1) + return -EINVAL; + + err = igb_save_txtime_params(adapter, qopt->queue, qopt->enable); + + if (err) + return err; + + igb_offload_apply(adapter, qopt->queue); return 0; } @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2615 @ static int igb_setup_tc(struct net_devic switch (type) { case TC_SETUP_QDISC_CBS: return igb_offload_cbs(adapter, type_data); + case TC_SETUP_QDISC_TBS: + return igb_offload_txtime(adapter, type_data); default: return -EOPNOTSUPP; @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:5428 @ set_itr_now: } } -static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, u32 vlan_macip_lens, - u32 type_tucmd, u32 mss_l4len_idx) +static void igb_tx_ctxtdesc(struct igb_ring *tx_ring, + struct igb_tx_buffer *first, + u32 vlan_macip_lens, u32 type_tucmd, + u32 mss_l4len_idx) { struct e1000_adv_tx_context_desc *context_desc; u16 i = tx_ring->next_to_use; + struct timespec64 ts; context_desc = IGB_TX_CTXTDESC(tx_ring, i); @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:5450 @ static void igb_tx_ctxtdesc(struct igb_r mss_l4len_idx |= tx_ring->reg_idx << 4; context_desc->vlan_macip_lens = cpu_to_le32(vlan_macip_lens); - context_desc->seqnum_seed = 0; context_desc->type_tucmd_mlhl = cpu_to_le32(type_tucmd); context_desc->mss_l4len_idx = cpu_to_le32(mss_l4len_idx); + + /* We 
assume there is always a valid tx time available. Invalid times + * should have been handled by the upper layers. + */ + if (tx_ring->launchtime_enable) { + ts = ns_to_timespec64(first->skb->tstamp); + context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32); + } else { + context_desc->seqnum_seed = 0; + } } static int igb_tso(struct igb_ring *tx_ring, @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:5544 @ static int igb_tso(struct igb_ring *tx_r vlan_macip_lens |= (ip.hdr - skb->data) << E1000_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK; - igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, mss_l4len_idx); + igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, + type_tucmd, mss_l4len_idx); return 1; } @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:5600 @ no_csum: vlan_macip_lens |= skb_network_offset(skb) << E1000_ADVTXD_MACLEN_SHIFT; vlan_macip_lens |= first->tx_flags & IGB_TX_FLAGS_VLAN_MASK; - igb_tx_ctxtdesc(tx_ring, vlan_macip_lens, type_tucmd, 0); + igb_tx_ctxtdesc(tx_ring, first, vlan_macip_lens, type_tucmd, 0); } #define IGB_SET_FLAG(_input, _flag, _result) \ Index: linux-4.16.8-rt3/include/linux/netdevice.h =================================================================== --- linux-4.16.8-rt3.orig/include/linux/netdevice.h +++ linux-4.16.8-rt3/include/linux/netdevice.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:784 @ enum tc_setup_type { TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, + TC_SETUP_QDISC_TBS, }; /* These structures hold the attributes of bpf state that are being passed @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:3377 @ static __always_inline int ____dev_forwa skb_scrub_packet(skb, true); skb->priority = 0; + skb->tstamp = 0; return 0; } Index: linux-4.16.8-rt3/include/linux/posix-timers.h =================================================================== --- linux-4.16.8-rt3.orig/include/linux/posix-timers.h +++ 
linux-4.16.8-rt3/include/linux/posix-timers.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:31 @ struct cpu_timer_list { * * A clockid is invalid if bits 2, 1, and 0 are all set. */ +#define CLOCKID_INVALID GENMASK(2, 0) #define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) #define CPUCLOCK_PERTHREAD(clock) \ (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) Index: linux-4.16.8-rt3/include/linux/skbuff.h =================================================================== --- linux-4.16.8-rt3.orig/include/linux/skbuff.h +++ linux-4.16.8-rt3/include/linux/skbuff.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:788 @ struct sk_buff { __u8 tc_redirected:1; __u8 tc_from_ingress:1; #endif + __u8 tc_drop_if_late:1; + + clockid_t txtime_clockid; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ Index: linux-4.16.8-rt3/include/net/pkt_sched.h =================================================================== --- linux-4.16.8-rt3.orig/include/net/pkt_sched.h +++ linux-4.16.8-rt3/include/net/pkt_sched.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:75 @ struct qdisc_watchdog { struct Qdisc *qdisc; }; +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires); @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:158 @ struct tc_cbs_qopt_offload { s32 sendslope; }; +struct tc_tbs_qopt_offload { + u8 enable; + s32 queue; +}; + #endif Index: linux-4.16.8-rt3/include/net/sock.h =================================================================== --- linux-4.16.8-rt3.orig/include/net/sock.h +++ linux-4.16.8-rt3/include/net/sock.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:780 @ enum sock_flags { SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* 
wait rcu grace period in sk_destruct() */ + SOCK_TXTIME, }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1572 @ void sock_kzfree_s(struct sock *sk, void void sk_send_sigurg(struct sock *sk); struct sockcm_cookie { + u64 transmit_time; u32 mark; + clockid_t clockid; u16 tsflags; + u8 drop_if_late; }; int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, Index: linux-4.16.8-rt3/include/uapi/asm-generic/socket.h =================================================================== --- linux-4.16.8-rt3.orig/include/uapi/asm-generic/socket.h +++ linux-4.16.8-rt3/include/uapi/asm-generic/socket.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:110 @ #define SO_ZEROCOPY 60 +#define SO_TXTIME 61 +#define SCM_TXTIME SO_TXTIME +#define SCM_DROP_IF_LATE 62 +#define SCM_CLOCKID 63 + #endif /* __ASM_GENERIC_SOCKET_H */ Index: linux-4.16.8-rt3/include/uapi/linux/pkt_sched.h =================================================================== --- linux-4.16.8-rt3.orig/include/uapi/linux/pkt_sched.h +++ linux-4.16.8-rt3/include/uapi/linux/pkt_sched.h @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:937 @ enum { #define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + +/* TBS */ +struct tc_tbs_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_TBS_SORTING_ON BIT(0) +#define TC_TBS_OFFLOAD_ON BIT(1) +}; + +enum { + TCA_TBS_UNSPEC, + TCA_TBS_PARMS, + __TCA_TBS_MAX, +}; + +#define TCA_TBS_MAX (__TCA_TBS_MAX - 1) + #endif Index: linux-4.16.8-rt3/net/core/skbuff.c =================================================================== --- linux-4.16.8-rt3.orig/net/core/skbuff.c +++ linux-4.16.8-rt3/net/core/skbuff.c @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:4867 @ EXPORT_SYMBOL(skb_try_coalesce); */ void skb_scrub_packet(struct sk_buff *skb, bool xnet) { - skb->tstamp = 0; skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 
0; Index: linux-4.16.8-rt3/net/core/sock.c =================================================================== --- linux-4.16.8-rt3.orig/net/core/sock.c +++ linux-4.16.8-rt3/net/core/sock.c @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:94 @ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <asm/unaligned.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/errqueue.h> @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1065 @ set_rcvbuf: sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); break; + case SO_TXTIME: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + ret = -EPERM; + else if (val < 0 || val > 1) + ret = -EINVAL; + else + sock_valbool_flag(sk, SOCK_TXTIME, valbool); + break; + default: ret = -ENOPROTOOPT; break; @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1406 @ int sock_getsockopt(struct socket *sock, v.val = sock_flag(sk, SOCK_ZEROCOPY); break; + case SO_TXTIME: + v.val = sock_flag(sk, SOCK_TXTIME); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). 
@ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2124 @ int __sock_cmsg_send(struct sock *sk, st struct sockcm_cookie *sockc) { u32 tsflags; + u8 drop; switch (cmsg->cmsg_type) { case SO_MARK: @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2145 @ int __sock_cmsg_send(struct sock *sk, st sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; sockc->tsflags |= tsflags; break; + case SCM_TXTIME: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) + return -EINVAL; + sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); + break; + case SCM_DROP_IF_LATE: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u8))) + return -EINVAL; + + drop = get_unaligned((u8 *)CMSG_DATA(cmsg)); + if (drop < 0 || drop > 1) + return -EINVAL; + + sockc->drop_if_late = drop; + break; + case SCM_CLOCKID: + if (!sock_flag(sk, SOCK_TXTIME)) + return -EINVAL; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(clockid_t))) + return -EINVAL; + sockc->clockid = get_unaligned((clockid_t *)CMSG_DATA(cmsg)); + break; /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
*/ case SCM_RIGHTS: case SCM_CREDENTIALS: Index: linux-4.16.8-rt3/net/ipv4/raw.c =================================================================== --- linux-4.16.8-rt3.orig/net/ipv4/raw.c +++ linux-4.16.8-rt3/net/ipv4/raw.c @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:82 @ #include <linux/netfilter_ipv4.h> #include <linux/compat.h> #include <linux/uio.h> +#include <linux/posix-timers.h> struct raw_frag_vec { struct msghdr *msg; @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:385 @ static int raw_send_hdrinc(struct sock * skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc->transmit_time; + skb->txtime_clockid = sockc->clockid; + skb->tc_drop_if_late = sockc->drop_if_late; skb_dst_set(skb, &rt->dst); *rtp = NULL; @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:569 @ static int raw_sendmsg(struct sock *sk, } ipc.sockc.tsflags = sk->sk_tsflags; + ipc.sockc.transmit_time = 0; + ipc.sockc.drop_if_late = 0; + ipc.sockc.clockid = CLOCKID_INVALID; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; Index: linux-4.16.8-rt3/net/ipv4/udp.c =================================================================== --- linux-4.16.8-rt3.orig/net/ipv4/udp.c +++ linux-4.16.8-rt3/net/ipv4/udp.c @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:118 @ #include "udp_impl.h" #include <net/sock_reuseport.h> #include <net/addrconf.h> +#include <linux/posix-timers.h> struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:930 @ int udp_sendmsg(struct sock *sk, struct } ipc.sockc.tsflags = sk->sk_tsflags; + ipc.sockc.transmit_time = 0; + ipc.sockc.drop_if_late = 0; + ipc.sockc.clockid = CLOCKID_INVALID; ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1049 @ back_from_confirm: sizeof(struct udphdr), &ipc, &rt, msg->msg_flags); err = PTR_ERR(skb); - if (!IS_ERR_OR_NULL(skb)) + if 
(!IS_ERR_OR_NULL(skb)) { + skb->tstamp = ipc.sockc.transmit_time; + skb->txtime_clockid = ipc.sockc.clockid; + skb->tc_drop_if_late = ipc.sockc.drop_if_late; err = udp_send_skb(skb, fl4); + } goto out; } Index: linux-4.16.8-rt3/net/packet/af_packet.c =================================================================== --- linux-4.16.8-rt3.orig/net/packet/af_packet.c +++ linux-4.16.8-rt3/net/packet/af_packet.c @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:97 @ #endif #include <linux/bpf.h> #include <net/compat.h> +#include <linux/posix-timers.h> #include "internal.h" @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:1987 @ retry: goto out_unlock; } + sockc.transmit_time = 0; + sockc.drop_if_late = 0; + sockc.clockid = CLOCKID_INVALID; sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2001 @ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; + skb->tstamp = sockc.transmit_time; + skb->tc_drop_if_late = sockc.drop_if_late; + skb->txtime_clockid = sockc.clockid; sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2501 @ static int tpacket_fill_skb(struct packe skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; + skb->tstamp = sockc->transmit_time; + skb->tc_drop_if_late = sockc->drop_if_late; + skb->txtime_clockid = sockc->clockid; sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2680 @ static int tpacket_snd(struct packet_soc if (unlikely(!(dev->flags & IFF_UP))) goto out_put; + sockc.transmit_time = 0; + sockc.drop_if_late = 0; + sockc.clockid = CLOCKID_INVALID; sockc.tsflags = po->sk.sk_tsflags; if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); @ 
linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2879 @ static int packet_snd(struct socket *soc if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.transmit_time = 0; + sockc.drop_if_late = 0; + sockc.clockid = CLOCKID_INVALID; sockc.tsflags = sk->sk_tsflags; sockc.mark = sk->sk_mark; if (msg->msg_controllen) { @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:2956 @ static int packet_snd(struct socket *soc skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sockc.mark; + skb->tstamp = sockc.transmit_time; + skb->tc_drop_if_late = sockc.drop_if_late; + skb->txtime_clockid = sockc.clockid; if (has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); Index: linux-4.16.8-rt3/net/sched/Kconfig =================================================================== --- linux-4.16.8-rt3.orig/net/sched/Kconfig +++ linux-4.16.8-rt3/net/sched/Kconfig @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:186 @ config NET_SCH_CBS To compile this code as a module, choose M here: the module will be called sch_cbs. +config NET_SCH_TBS + tristate "Time Based Scheduler (TBS)" + ---help--- + Say Y here if you want to use the Time Based Scheduler (TBS) packet + scheduling algorithm. + + See the top of <file:net/sched/sch_tbs.c> for more details. + + To compile this code as a module, choose M here: the + module will be called sch_tbs. 
+ config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- Index: linux-4.16.8-rt3/net/sched/Makefile =================================================================== --- linux-4.16.8-rt3.orig/net/sched/Makefile +++ linux-4.16.8-rt3/net/sched/Makefile @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:57 @ obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o +obj-$(CONFIG_NET_SCH_TBS) += sch_tbs.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o Index: linux-4.16.8-rt3/net/sched/sch_api.c =================================================================== --- linux-4.16.8-rt3.orig/net/sched/sch_api.c +++ linux-4.16.8-rt3/net/sched/sch_api.c @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:599 @ static enum hrtimer_restart qdisc_watchd return HRTIMER_NORESTART; } -void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, + clockid_t clockid) { - hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } +EXPORT_SYMBOL(qdisc_watchdog_init_clockid); + +void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) +{ + qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); +} EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) Index: linux-4.16.8-rt3/net/sched/sch_tbs.c =================================================================== --- /dev/null +++ linux-4.16.8-rt3/net/sched/sch_tbs.c @ linux-4.16.8-rt3/arch/alpha/include/uapi/asm/socket.h:4 @ +/* + * net/sched/sch_tbs.c Time Based Shaper + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU 
General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Jesus Sanchez-Palencia <jesus.sanchez-palencia@intel.com> + * Vinicius Costa Gomes <vinicius.gomes@intel.com> + * + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/rbtree.h> +#include <linux/skbuff.h> +#include <linux/posix-timers.h> +#include <net/netlink.h> +#include <net/sch_generic.h> +#include <net/pkt_sched.h> +#include <net/sock.h> + +/* Test the sorting/offload bits of an options pointer; the argument is + * parenthesized so that any pointer expression can be passed safely. + */ +#define SORTING_IS_ON(x) ((x)->flags & TC_TBS_SORTING_ON) +#define OFFLOAD_IS_ON(x) ((x)->flags & TC_TBS_OFFLOAD_ON) + +struct tbs_sched_data { + bool offload; + bool sorting; + int clockid; + int queue; + s32 delta; /* in ns */ + ktime_t last; /* The txtime of the last skb sent to the netdevice. */ + struct rb_root head; + struct qdisc_watchdog watchdog; + struct Qdisc *qdisc; + int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free); + struct sk_buff *(*dequeue)(struct Qdisc *sch); + struct sk_buff *(*peek)(struct Qdisc *sch); +}; + +static const struct nla_policy tbs_policy[TCA_TBS_MAX + 1] = { + [TCA_TBS_PARMS] = { .len = sizeof(struct tc_tbs_qopt) }, +}; + +typedef ktime_t (*get_time_func_t)(void); + +static const get_time_func_t clockid_to_get_time[MAX_CLOCKS] = { + [CLOCK_MONOTONIC] = ktime_get, + [CLOCK_REALTIME] = ktime_get_real, + [CLOCK_BOOTTIME] = ktime_get_boottime, + [CLOCK_TAI] = ktime_get_clocktai, +}; + +/* Return the current time of the clock identified by clockid, or 0 when + * clockid is out of range or has no entry in clockid_to_get_time. + * + * The range check is required: validate_input_params() demands an invalid + * clockid when HW offload is used without sorting, that value is saved in + * q->clockid and is_packet_valid() passes it here, so an unchecked lookup + * would read outside the table. + */ +static ktime_t get_time_by_clockid(clockid_t clockid) +{ + get_time_func_t func; + + if (clockid < 0 || clockid >= MAX_CLOCKS) + return 0; + + func = clockid_to_get_time[clockid]; + if (!func) + return 0; + + return func(); +} + +/* Validate the user supplied qdisc options. Returns 0 when they are + * acceptable, otherwise a negative errno with a message set in extack. + */ +static inline int validate_input_params(struct tc_tbs_qopt *qopt, + struct netlink_ext_ack *extack) +{ + /* Check if params comply to the following rules: + * * If SW best-effort, then clockid and delta must be valid. 
+ * + * * If HW offload is ON and sorting is ON, then clockid and delta + * must be valid. + * + * * If HW offload is ON and sorting is OFF, then clockid and + * delta must not have been set. The netdevice PHC will be used + * implicitly. + * + * * Dynamic clockids are not supported. + * * Delta must not be negative (zero is accepted). + */ + if (!OFFLOAD_IS_ON(qopt) || SORTING_IS_ON(qopt)) { + if ((qopt->clockid & CLOCKID_INVALID) == CLOCKID_INVALID || + qopt->clockid >= MAX_CLOCKS) { + NL_SET_ERR_MSG(extack, "Invalid clockid"); + return -EINVAL; + } else if (qopt->clockid < 0 || + !clockid_to_get_time[qopt->clockid]) { + NL_SET_ERR_MSG(extack, "Clockid is not supported"); + return -ENOTSUPP; + } + + if (qopt->delta < 0) { + NL_SET_ERR_MSG(extack, "Delta must be positive"); + return -EINVAL; + } + } else { + if (qopt->delta != 0) { + NL_SET_ERR_MSG(extack, "Cannot set delta for this mode"); + return -EINVAL; + } + if ((qopt->clockid & CLOCKID_INVALID) != CLOCKID_INVALID) { + NL_SET_ERR_MSG(extack, "Cannot set clockid for this mode"); + return -EINVAL; + } + } + + return 0; +} + +/* Decide whether nskb may be queued. Returns false when the skb has a socket + * without SOCK_TXTIME set, when its clockid differs from the qdisc's, or when + * its txtime lies before the current time or before q->last. + */ +static bool is_packet_valid(struct Qdisc *sch, struct sk_buff *nskb) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + ktime_t txtime = nskb->tstamp; + struct sock *sk = nskb->sk; + ktime_t now; + + if (sk && !sock_flag(sk, SOCK_TXTIME)) + return false; + + /* We don't perform crosstimestamping. + * Drop if packet's clockid differs from qdisc's. 
+ */ + if (nskb->txtime_clockid != q->clockid) + return false; + + now = get_time_by_clockid(q->clockid); + if (ktime_before(txtime, now) || ktime_before(txtime, q->last)) + return false; + + return true; +} +/* Qdisc peek op: forwards to the peek handler stored in q->peek. */ +static struct sk_buff *tbs_peek(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + + return q->peek(sch); +} +/* Return the skb held by the first node of the rbtree in q->head, or NULL + * when the tree is empty. + */ +static struct sk_buff *tbs_peek_timesortedlist(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct rb_node *p; + + p = rb_first(&q->head); + if (!p) + return NULL; + + return rb_to_skb(p); +} +/* Schedule the qdisc watchdog for (txtime - delta) of the skb returned by + * tbs_peek(). Does nothing when there is no skb queued. + */ +static void reset_watchdog(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = tbs_peek(sch); + ktime_t next; + + if (!skb) + return; + + next = ktime_sub_ns(skb->tstamp, q->delta); + qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next)); +} +/* Qdisc enqueue op: forwards to the enqueue handler stored in q->enqueue. */ +static int tbs_enqueue(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + + return q->enqueue(nskb, sch, to_free); +} +/* Drop nskb if is_packet_valid() rejects it, otherwise append it to the tail + * of the qdisc queue. No watchdog is armed in this mode. + */ +static int tbs_enqueue_fifo(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + if (!is_packet_valid(sch, nskb)) + return qdisc_drop(nskb, sch, to_free); + + return qdisc_enqueue_tail(nskb, sch); +} +/* Same as tbs_enqueue_fifo(), but additionally re-arms the watchdog when the + * enqueued packet is the only one in the queue. + */ +static int tbs_enqueue_scheduledfifo(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + int err; + + if (!is_packet_valid(sch, nskb)) + return qdisc_drop(nskb, sch, to_free); + + err = qdisc_enqueue_tail(nskb, sch); + + /* If there is only 1 packet, then we must reset the watchdog. 
*/ + if (err >= 0 && sch->q.qlen == 1) + reset_watchdog(sch); + + return err; +} + +static int tbs_enqueue_timesortedlist(struct sk_buff *nskb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct rb_node **p = &q->head.rb_node, *parent = NULL; + ktime_t txtime = nskb->tstamp; + + if (!is_packet_valid(sch, nskb)) + return qdisc_drop(nskb, sch, to_free); + + while (*p) { + struct sk_buff *skb; + + parent = *p; + skb = rb_to_skb(parent); + if (ktime_after(txtime, skb->tstamp)) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&nskb->rbnode, parent, p); + rb_insert_color(&nskb->rbnode, &q->head); + + qdisc_qstats_backlog_inc(sch, nskb); + sch->q.qlen++; + + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return NET_XMIT_SUCCESS; +} + +/* Remove skb from the time-sorted rbtree and update the queue accounting. + * + * When drop is true the skb is dropped and freed here and counted as + * overlimit; the caller must not touch it afterwards. When drop is false + * the skb is counted as sent and its txtime is recorded in q->last. + */ +static void timesortedlist_erase(struct Qdisc *sch, struct sk_buff *skb, + bool drop) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + + rb_erase(&skb->rbnode, &q->head); + + /* The rbnode field in the skb re-uses these fields, now that + * we are done with the rbnode, reset them. This has to be done + * before the drop path below, because that path frees the skb. + */ + skb->next = NULL; + skb->prev = NULL; + skb->dev = qdisc_dev(sch); + + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + if (drop) { + struct sk_buff *to_free = NULL; + + qdisc_drop(skb, sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + } else { + qdisc_bstats_update(sch, skb); + + q->last = skb->tstamp; + } +} + +static struct sk_buff *tbs_dequeue(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + + return q->dequeue(sch); +} + +static struct sk_buff *tbs_dequeue_fifo(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = qdisc_dequeue_head(sch); + + /* XXX: The drop_if_late bit is not checked here because that would + * require the PHC time to be read directly. 
+ */ + + if (skb) + q->last = skb->tstamp; + + return skb; +} +/* Dequeue handler for the scheduled FIFO: looks at the head of the queue, + * drops it if it is late and flagged drop_if_late, returns it once the + * current time has passed (txtime - delta), and otherwise returns NULL. + * The watchdog is re-armed for the next head in every case but an empty queue. + */ +static struct sk_buff *tbs_dequeue_scheduledfifo(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb = tbs_peek(sch); + ktime_t now, next; + + if (!skb) + return NULL; + + now = get_time_by_clockid(q->clockid); + + /* Drop if packet has expired while in queue and the drop_if_late + * flag is set. + */ + if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) { + struct sk_buff *to_free = NULL; + + qdisc_queue_drop_head(sch, &to_free); + kfree_skb_list(to_free); + qdisc_qstats_overlimit(sch); + + skb = NULL; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only once now is later than (txtime - delta). Only this lower + * bound is tested here: a packet already past its txtime is still sent + * unless drop_if_late was set, which is handled above. + */ + if (ktime_after(now, next)) + skb = qdisc_dequeue_head(sch); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. */ + reset_watchdog(sch); + + return skb; +} +/* Dequeue handler for the time-sorted list: same decisions as the scheduled + * FIFO handler, but the skb comes from the rbtree and is removed from it + * with timesortedlist_erase(). + */ +static struct sk_buff *tbs_dequeue_timesortedlist(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + ktime_t now, next; + + skb = tbs_peek(sch); + if (!skb) + return NULL; + + now = get_time_by_clockid(q->clockid); + + /* Drop if packet has expired while in queue and the drop_if_late + * flag is set. + */ + if (skb->tc_drop_if_late && ktime_before(skb->tstamp, now)) { + timesortedlist_erase(sch, skb, true); + skb = NULL; + goto out; + } + + next = ktime_sub_ns(skb->tstamp, q->delta); + + /* Dequeue only once now is later than (txtime - delta). Only this lower + * bound is tested here: a packet already past its txtime is still sent + * unless drop_if_late was set, which is handled above. + */ + if (ktime_after(now, next)) + timesortedlist_erase(sch, skb, false); + else + skb = NULL; + +out: + /* Now we may need to re-arm the qdisc watchdog for the next packet. 
*/ + reset_watchdog(sch); + + return skb; +} +/* Ask the driver, through ndo_setup_tc, to turn TBS offload off for q->queue. + * Does nothing when q->offload is not set or the device has no ndo_setup_tc; + * a driver failure is only logged. + */ +static void tbs_disable_offload(struct net_device *dev, + struct tbs_sched_data *q) +{ + struct tc_tbs_qopt_offload tbs = { }; + const struct net_device_ops *ops; + int err; + + if (!q->offload) + return; + + ops = dev->netdev_ops; + if (!ops->ndo_setup_tc) + return; + + tbs.queue = q->queue; + tbs.enable = 0; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBS, &tbs); + if (err < 0) + pr_warn("Couldn't disable TBS offload for queue %d\n", + tbs.queue); +} +/* Ask the driver, through ndo_setup_tc, to turn TBS offload on for q->queue. + * Returns 0 on success or when q->offload is already set, -EOPNOTSUPP when the + * device has no ndo_setup_tc, or the negative error returned by the driver. + */ +static int tbs_enable_offload(struct net_device *dev, struct tbs_sched_data *q, + struct netlink_ext_ack *extack) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_tbs_qopt_offload tbs = { }; + int err; + + if (q->offload) + return 0; + + if (!ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "Specified device does not support TBS offload"); + return -EOPNOTSUPP; + } + + tbs.queue = q->queue; + tbs.enable = 1; + + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TBS, &tbs); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Specified device failed to setup TBS hardware offload"); + return err; + } + + return 0; +} +/* Select the enqueue/dequeue/peek handlers: the time-sorted list when sorting + * is on, otherwise the plain FIFO with offload or the scheduled FIFO without. + */ +static inline void setup_queueing_mode(struct tbs_sched_data *q) +{ + if (q->sorting) { + q->enqueue = tbs_enqueue_timesortedlist; + q->dequeue = tbs_dequeue_timesortedlist; + q->peek = tbs_peek_timesortedlist; + } else { + if (q->offload) { + q->enqueue = tbs_enqueue_fifo; + q->dequeue = tbs_dequeue_fifo; + q->peek = qdisc_peek_head; + } else { + q->enqueue = tbs_enqueue_scheduledfifo; + q->dequeue = tbs_dequeue_scheduledfifo; + q->peek = qdisc_peek_head; + } + } +} +/* Qdisc init op: parse and validate the mandatory TCA_TBS_PARMS attribute, + * enable offload when requested, save the parameters, select the queueing + * mode and set up the watchdog when the chosen mode needs it. + */ +static int tbs_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct nlattr *tb[TCA_TBS_MAX + 1]; + struct tc_tbs_qopt *qopt; + int err; + + if (!opt) { + NL_SET_ERR_MSG(extack, "Missing TBS qdisc options which are mandatory"); + return -EINVAL; + } + + err = 
nla_parse_nested(tb, TCA_TBS_MAX, opt, tbs_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TBS_PARMS]) { + NL_SET_ERR_MSG(extack, "Missing mandatory TBS parameters"); + return -EINVAL; + } + + qopt = nla_data(tb[TCA_TBS_PARMS]); + + pr_debug("delta %d clockid %d offload %s sorting %s\n", + qopt->delta, qopt->clockid, + OFFLOAD_IS_ON(qopt) ? "on" : "off", + SORTING_IS_ON(qopt) ? "on" : "off"); + + err = validate_input_params(qopt, extack); + if (err < 0) + return err; + /* Index of this qdisc's tx queue, as an offset from the device's queue 0. */ + q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0); + + if (OFFLOAD_IS_ON(qopt)) { + err = tbs_enable_offload(dev, q, extack); + if (err < 0) + return err; + } + + /* Everything went OK, save the parameters used. */ + q->delta = qopt->delta; + q->clockid = qopt->clockid; + q->offload = OFFLOAD_IS_ON(qopt); + q->sorting = SORTING_IS_ON(qopt); + + /* Select queueing mode based on offload and sorting parameters. */ + setup_queueing_mode(q); + + /* The watchdog will be needed for SW best-effort or if TxTime + * based sorting is on. + */ + if (!q->offload || q->sorting) + qdisc_watchdog_init_clockid(&q->watchdog, sch, q->clockid); + + return 0; +} +/* Remove every skb from the rbtree in q->head, hand each one to + * rtnl_kfree_skbs() and decrement the queue length for it. + */ +static void timesortedlist_clear(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct rb_node *p = rb_first(&q->head); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + /* Fetch the successor before the current node is erased. */ + p = rb_next(p); + + rb_erase(&skb->rbnode, &q->head); + rtnl_kfree_skbs(skb, skb); + sch->q.qlen--; + } +} +/* Qdisc reset op: cancel the watchdog if it was set up, flush both the rbtree + * and the qdisc queue, then zero the backlog, the queue length and q->last. + */ +static void tbs_reset(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + /* No matter which mode we are on, it's safe to clear both lists. 
*/ + timesortedlist_clear(sch); + __qdisc_reset_queue(&sch->q); + + sch->qstats.backlog = 0; + sch->q.qlen = 0; + + q->last = 0; +} +/* Qdisc destroy op: cancel the watchdog if it was set up and ask the driver + * to disable offload (a no-op when offload was never enabled). + */ +static void tbs_destroy(struct Qdisc *sch) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + /* Only cancel watchdog if it's been initialized. */ + if (q->watchdog.qdisc == sch) + qdisc_watchdog_cancel(&q->watchdog); + + tbs_disable_offload(dev, q); +} +/* Qdisc dump op: emit the saved delta, clockid and offload/sorting flags as a + * TCA_TBS_PARMS attribute nested inside TCA_OPTIONS. Returns the result of + * nla_nest_end() on success and -1 when the attributes do not fit. + */ +static int tbs_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct tbs_sched_data *q = qdisc_priv(sch); + struct tc_tbs_qopt opt = { }; + struct nlattr *nest; + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + opt.delta = q->delta; + opt.clockid = q->clockid; + if (q->offload) + opt.flags |= TC_TBS_OFFLOAD_ON; + + if (q->sorting) + opt.flags |= TC_TBS_SORTING_ON; + + if (nla_put(skb, TCA_TBS_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + return nla_nest_end(skb, nest); + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} +/* Operations table for the qdisc registered under the id "tbs". */ +static struct Qdisc_ops tbs_qdisc_ops __read_mostly = { + .id = "tbs", + .priv_size = sizeof(struct tbs_sched_data), + .enqueue = tbs_enqueue, + .dequeue = tbs_dequeue, + .peek = tbs_peek, + .init = tbs_init, + .reset = tbs_reset, + .destroy = tbs_destroy, + .dump = tbs_dump, + .owner = THIS_MODULE, +}; +/* Module entry point: register the qdisc operations. */ +static int __init tbs_module_init(void) +{ + return register_qdisc(&tbs_qdisc_ops); +} +/* Module exit point: unregister the qdisc operations. */ +static void __exit tbs_module_exit(void) +{ + unregister_qdisc(&tbs_qdisc_ops); +} +module_init(tbs_module_init) +module_exit(tbs_module_exit) +MODULE_LICENSE("GPL");