diff --git a/drivers/net/bnx2x/bnx2x.h b/drivers/net/bnx2x/bnx2x.h
index 16dc2c9df8b72c0665d73f54b6e8de9307e0a9c3..53fa8ea983e969a2b236f9d92b0573868b9bb3b4 100644
--- a/drivers/net/bnx2x/bnx2x.h
+++ b/drivers/net/bnx2x/bnx2x.h
@@ -120,6 +120,7 @@ do {								 \
 
 
 #ifdef BNX2X_STOP_ON_ERROR
+void bnx2x_int_disable(struct bnx2x *bp);
 #define bnx2x_panic() do { \
 		bp->panic = 1; \
 		BNX2X_ERR("driver assert\n"); \
@@ -240,21 +241,21 @@ do {								 \
  */
 /* iSCSI L2 */
 #define BNX2X_ISCSI_ETH_CL_ID_IDX	1
-#define BNX2X_ISCSI_ETH_CID		17
+#define BNX2X_ISCSI_ETH_CID		49
 
 /* FCoE L2 */
 #define BNX2X_FCOE_ETH_CL_ID_IDX	2
-#define BNX2X_FCOE_ETH_CID		18
+#define BNX2X_FCOE_ETH_CID		50
 
 /** Additional rings budgeting */
 #ifdef BCM_CNIC
-#define CNIC_CONTEXT_USE		1
-#define FCOE_CONTEXT_USE		1
+#define CNIC_PRESENT			1
+#define FCOE_PRESENT			1
 #else
-#define CNIC_CONTEXT_USE		0
-#define FCOE_CONTEXT_USE		0
+#define CNIC_PRESENT			0
+#define FCOE_PRESENT			0
 #endif /* BCM_CNIC */
-#define NONE_ETH_CONTEXT_USE	(FCOE_CONTEXT_USE)
+#define NON_ETH_CONTEXT_USE	(FCOE_PRESENT)
 
 #define AEU_IN_ATTN_BITS_PXPPCICLOCKCLIENT_PARITY_ERROR \
 	AEU_INPUTS_ATTN_BITS_PXPPCICLOCKCLIENT_PARITY_ERROR
@@ -262,8 +263,35 @@ do {								 \
 #define SM_RX_ID			0
 #define SM_TX_ID			1
 
-/* fast path */
+/* defines for multiple tx priority indices */
+#define FIRST_TX_ONLY_COS_INDEX		1
+#define FIRST_TX_COS_INDEX		0
+
+/* defines for decoding the fastpath index and the cos index out of the
+ * transmission queue index
+ */
+#define MAX_TXQS_PER_COS	FP_SB_MAX_E1x
+
+#define TXQ_TO_FP(txq_index)	((txq_index) % MAX_TXQS_PER_COS)
+#define TXQ_TO_COS(txq_index)	((txq_index) / MAX_TXQS_PER_COS)
+
+/* rules for calculating the cids of tx-only connections */
+#define CID_TO_FP(cid)		((cid) % MAX_TXQS_PER_COS)
+#define CID_COS_TO_TX_ONLY_CID(cid, cos)	((cid) + (cos) * MAX_TXQS_PER_COS)
+
+/* txq index within a class of service range */
+#define FP_COS_TO_TXQ(fp, cos)    ((fp)->index + (cos) * MAX_TXQS_PER_COS)
+
+/*
+ * 0..15 eth cos0
+ * 16..31 eth cos1 if applicable
+ * 32..47 eth cos2 if applicable
+ * fcoe queue follows eth queues (16, 32, 48 depending on cos)
+ */
+#define MAX_ETH_TXQ_IDX(bp)	(MAX_TXQS_PER_COS * (bp)->max_cos)
+#define FCOE_TXQ_IDX(bp)	(MAX_ETH_TXQ_IDX(bp))
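A quick sanity check of the decode macros above (a sketch only, assuming
FP_SB_MAX_E1x, and hence MAX_TXQS_PER_COS, is 16):

	int fp_index  = TXQ_TO_FP(21);			/* 21 % 16 == 5 */
	int cos_index = TXQ_TO_COS(21);			/* 21 / 16 == 1 */
	int tx_only_cid = CID_COS_TO_TX_ONLY_CID(5, 1);	/* 5 + 1*16 == 21 */

so txq 21 belongs to fastpath 5 at CoS 1, and the matching tx-only CID is 21.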
 
+/* fast path */
 struct sw_rx_bd {
 	struct sk_buff	*skb;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
@@ -388,6 +416,29 @@ struct bnx2x_agg_info {
 #define Q_STATS_OFFSET32(stat_name) \
 			(offsetof(struct bnx2x_eth_q_stats, stat_name) / 4)
 
+struct bnx2x_fp_txdata {
+
+	struct sw_tx_bd		*tx_buf_ring;
+
+	union eth_tx_bd_types	*tx_desc_ring;
+	dma_addr_t		tx_desc_mapping;
+
+	u32			cid;
+
+	union db_prod		tx_db;
+
+	u16			tx_pkt_prod;
+	u16			tx_pkt_cons;
+	u16			tx_bd_prod;
+	u16			tx_bd_cons;
+
+	unsigned long		tx_pkt;
+
+	__le16			*tx_cons_sb;
+
+	int			txq_index;
+};
+
 struct bnx2x_fastpath {
 	struct bnx2x		*bp; /* parent */
 
@@ -404,10 +455,8 @@ struct bnx2x_fastpath {
 
 	dma_addr_t		status_blk_mapping;
 
-	struct sw_tx_bd		*tx_buf_ring;
-
-	union eth_tx_bd_types	*tx_desc_ring;
-	dma_addr_t		tx_desc_mapping;
+	u8			max_cos; /* actual number of active tx coses */
+	struct bnx2x_fp_txdata	txdata[BNX2X_MULTI_TX_COS];
 
 	struct sw_rx_bd		*rx_buf_ring;	/* BDs mappings ring */
 	struct sw_rx_page	*rx_page_ring;	/* SGE pages mappings ring */
@@ -426,20 +475,13 @@ struct bnx2x_fastpath {
 
 	u32			cid;
 
+	__le16			fp_hc_idx;
+
 	u8			index;		/* number in fp array */
 	u8			cl_id;		/* eth client id */
 	u8			cl_qzone_id;
 	u8			fw_sb_id;	/* status block number in FW */
 	u8			igu_sb_id;	/* status block number in HW */
-	union db_prod		tx_db;
-
-	u16			tx_pkt_prod;
-	u16			tx_pkt_cons;
-	u16			tx_bd_prod;
-	u16			tx_bd_cons;
-	__le16			*tx_cons_sb;
-
-	__le16			fp_hc_idx;
 
 	u16			rx_bd_prod;
 	u16			rx_bd_cons;
@@ -449,8 +491,7 @@ struct bnx2x_fastpath {
 	/* The last maximal completed SGE */
 	u16			last_max_sge;
 	__le16			*rx_cons_sb;
-	unsigned long		tx_pkt,
-				rx_pkt,
+	unsigned long		rx_pkt,
 				rx_calls;
 
 	/* TPA related */
@@ -489,8 +530,12 @@ struct bnx2x_fastpath {
 #define FCOE_IDX			BNX2X_NUM_ETH_QUEUES(bp)
 #define bnx2x_fcoe_fp(bp)		(&bp->fp[FCOE_IDX])
 #define bnx2x_fcoe(bp, var)		(bnx2x_fcoe_fp(bp)->var)
+#define bnx2x_fcoe_tx(bp, var)		(bnx2x_fcoe_fp(bp)-> \
+						txdata[FIRST_TX_COS_INDEX].var)
 
 
+#define IS_ETH_FP(fp)			(fp->index < \
+					 BNX2X_NUM_ETH_QUEUES(fp->bp))
 #ifdef BCM_CNIC
 #define IS_FCOE_FP(fp)			(fp->index == FCOE_IDX)
 #define IS_FCOE_IDX(idx)		((idx) == FCOE_IDX)
@@ -649,18 +694,23 @@ struct bnx2x_fastpath {
 
 #define HC_INDEX_TOE_TX_CQ_CONS		4 /* Formerly Cstorm TOE CQ index   */
 					  /* (HC_INDEX_C_TOE_TX_CQ_CONS)    */
-#define HC_INDEX_ETH_TX_CQ_CONS		5 /* Formerly Cstorm ETH CQ index   */
+#define HC_INDEX_ETH_TX_CQ_CONS_COS0	5 /* Formerly Cstorm ETH CQ index   */
+					  /* (HC_INDEX_C_ETH_TX_CQ_CONS)    */
+#define HC_INDEX_ETH_TX_CQ_CONS_COS1	6 /* Formerly Cstorm ETH CQ index   */
+					  /* (HC_INDEX_C_ETH_TX_CQ_CONS)    */
+#define HC_INDEX_ETH_TX_CQ_CONS_COS2	7 /* Formerly Cstorm ETH CQ index   */
 					  /* (HC_INDEX_C_ETH_TX_CQ_CONS)    */
 
-#define U_SB_ETH_RX_CQ_INDEX		HC_INDEX_ETH_RX_CQ_CONS
-#define U_SB_ETH_RX_BD_INDEX		HC_INDEX_ETH_RX_BD_CONS
-#define C_SB_ETH_TX_CQ_INDEX		HC_INDEX_ETH_TX_CQ_CONS
+#define HC_INDEX_ETH_FIRST_TX_CQ_CONS	HC_INDEX_ETH_TX_CQ_CONS_COS0
 
 #define BNX2X_RX_SB_INDEX \
 	(&fp->sb_index_values[HC_INDEX_ETH_RX_CQ_CONS])
 
-#define BNX2X_TX_SB_INDEX \
-	(&fp->sb_index_values[C_SB_ETH_TX_CQ_INDEX])
+#define BNX2X_TX_SB_INDEX_BASE BNX2X_TX_SB_INDEX_COS0
+
+#define BNX2X_TX_SB_INDEX_COS0 \
+	(&fp->sb_index_values[HC_INDEX_ETH_TX_CQ_CONS_COS0])
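The CoS-N Tx consumer simply lives N slots after the COS0 entry of the same
status block, so a per-CoS consumer pointer can be derived as in this sketch
(fp and cos are assumed to be in scope, as in the macros above):

	__le16 *tx_cons_sb =
		&fp->sb_index_values[HC_INDEX_ETH_FIRST_TX_CQ_CONS + cos];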
 
 /* end of fast path */
 
@@ -845,25 +895,6 @@ extern struct workqueue_struct *bnx2x_wq;
 /* fast-path interrupt contexts E2 */
 #define FP_SB_MAX_E2		HC_SB_MAX_SB_E2
 
-/*
- * cid_cnt paramter below refers to the value returned by
- * 'bnx2x_get_l2_cid_count()' routine
- */
-
-/*
- * The number of FP context allocated by the driver == max number of regular
- * L2 queues + 1 for the FCoE L2 queue
- */
-#define L2_FP_COUNT(cid_cnt)	((cid_cnt) - FCOE_CONTEXT_USE)
-
-/*
- * The number of FP-SB allocated by the driver == max number of regular L2
- * queues + 1 for the CNIC which also consumes an FP-SB
- */
-#define FP_SB_COUNT(cid_cnt)	((cid_cnt) - CNIC_CONTEXT_USE)
-#define NUM_IGU_SB_REQUIRED(cid_cnt) \
-				(FP_SB_COUNT(cid_cnt) - NONE_ETH_CONTEXT_USE)
-
 union cdu_context {
 	struct eth_context eth;
 	char pad[1024];
@@ -871,7 +902,7 @@ union cdu_context {
 
 /* CDU host DB constants */
 #define CDU_ILT_PAGE_SZ_HW	3
-#define CDU_ILT_PAGE_SZ		(4096 << CDU_ILT_PAGE_SZ_HW) /* 32K */
+#define CDU_ILT_PAGE_SZ		(8192 << CDU_ILT_PAGE_SZ_HW) /* 64K */
 #define ILT_PAGE_CIDS		(CDU_ILT_PAGE_SZ / sizeof(union cdu_context))
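Working the arithmetic: 8192 << 3 is 65536 bytes (64K), and with the 1024-byte
cdu_context union above that gives 65536 / 1024 = 64 CIDs per ILT page, double
the 32 CIDs the old 32K page could hold.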
 
 #ifdef BCM_CNIC
@@ -1048,6 +1079,7 @@ struct bnx2x_fw_stats_data {
 
 /* Public slow path states */
 enum {
+	BNX2X_SP_RTNL_SETUP_TC,
 	BNX2X_SP_RTNL_TX_TIMEOUT,
 };
 
@@ -1226,6 +1258,10 @@ struct bnx2x {
 #define BNX2X_STATE_ERROR		0xf000
 
 	int			multi_mode;
+#define BNX2X_MAX_PRIORITY		8
+#define BNX2X_MAX_ENTRIES_PER_PRI	16
+#define BNX2X_MAX_COS			3
+#define BNX2X_MAX_TX_COS		2
 	int			num_queues;
 	int			disable_tpa;
 
@@ -1275,11 +1311,21 @@ struct bnx2x {
 	struct bnx2x_ilt	*ilt;
 #define BP_ILT(bp)		((bp)->ilt)
 #define ILT_MAX_LINES		256
+/*
+ * Maximum supported number of RSS queues: number of IGU SBs minus one that goes
+ * to CNIC.
+ */
+#define BNX2X_MAX_RSS_COUNT(bp)	((bp)->igu_sb_cnt - CNIC_PRESENT)
 
-	int			l2_cid_count;
-#define L2_ILT_LINES(bp)	(DIV_ROUND_UP((bp)->l2_cid_count, \
-				 ILT_PAGE_CIDS))
-#define BNX2X_DB_SIZE(bp)	((bp)->l2_cid_count * (1 << BNX2X_DB_SHIFT))
+/*
+ * Maximum CID count that might be required by the bnx2x:
+ * Max RSS * Max_Tx_Multi_Cos + CNIC L2 Clients (FCoE and iSCSI related)
+ */
+#define BNX2X_L2_CID_COUNT(bp)	(MAX_TXQS_PER_COS * BNX2X_MULTI_TX_COS +\
+					NON_ETH_CONTEXT_USE + CNIC_PRESENT)
+#define L2_ILT_LINES(bp)	(DIV_ROUND_UP(BNX2X_L2_CID_COUNT(bp),\
+					ILT_PAGE_CIDS))
+#define BNX2X_DB_SIZE(bp)	(BNX2X_L2_CID_COUNT(bp) * (1 << BNX2X_DB_SHIFT))
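A worked example, assuming MAX_TXQS_PER_COS == 16, BNX2X_MULTI_TX_COS == 3 and
ILT_PAGE_CIDS == 64 as derived above: 16 * 3 + 1 + 1 = 50 L2 CIDs, which fit
in DIV_ROUND_UP(50, 64) = 1 ILT line.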
 
 	int			qm_cid_count;
 
@@ -1421,16 +1467,24 @@ struct bnx2x {
 	u32					dcbx_remote_flags;
 #endif
 	u32					pending_max;
+
+	/* multiple tx classes of service */
+	u8					max_cos;
+
+	/* priority to cos mapping */
+	u8					prio_to_cos[8];
 };
 
 /* Tx queues may be less or equal to Rx queues */
 extern int num_queues;
 #define BNX2X_NUM_QUEUES(bp)	(bp->num_queues)
-#define BNX2X_NUM_ETH_QUEUES(bp) (BNX2X_NUM_QUEUES(bp) - NONE_ETH_CONTEXT_USE)
+#define BNX2X_NUM_ETH_QUEUES(bp) (BNX2X_NUM_QUEUES(bp) - NON_ETH_CONTEXT_USE)
+#define BNX2X_NUM_RX_QUEUES(bp)	BNX2X_NUM_QUEUES(bp)
 
 #define is_multi(bp)		(BNX2X_NUM_QUEUES(bp) > 1)
 
-#define BNX2X_MAX_QUEUES(bp)	(bp->igu_sb_cnt - CNIC_CONTEXT_USE)
+#define BNX2X_MAX_QUEUES(bp)	BNX2X_MAX_RSS_COUNT(bp)
+/* #define is_eth_multi(bp)	(BNX2X_NUM_ETH_QUEUES(bp) > 1) */
 
 #define RSS_IPV4_CAP_MASK						\
 	TSTORM_ETH_FUNCTION_COMMON_CONFIG_RSS_IPV4_CAPABILITY
@@ -1465,35 +1519,40 @@ struct bnx2x_func_init_params {
 };
 
 #define for_each_eth_queue(bp, var) \
-	for (var = 0; var < BNX2X_NUM_ETH_QUEUES(bp); var++)
+	for ((var) = 0; (var) < BNX2X_NUM_ETH_QUEUES(bp); (var)++)
 
 #define for_each_nondefault_eth_queue(bp, var) \
-	for (var = 1; var < BNX2X_NUM_ETH_QUEUES(bp); var++)
+	for ((var) = 1; (var) < BNX2X_NUM_ETH_QUEUES(bp); (var)++)
 
 #define for_each_queue(bp, var) \
-	for (var = 0; var < BNX2X_NUM_QUEUES(bp); var++) \
+	for ((var) = 0; (var) < BNX2X_NUM_QUEUES(bp); (var)++) \
 		if (skip_queue(bp, var))	\
 			continue;		\
 		else
 
+/* Skip forwarding FP */
 #define for_each_rx_queue(bp, var) \
-	for (var = 0; var < BNX2X_NUM_QUEUES(bp); var++) \
+	for ((var) = 0; (var) < BNX2X_NUM_QUEUES(bp); (var)++) \
 		if (skip_rx_queue(bp, var))	\
 			continue;		\
 		else
 
+/* Skip OOO FP */
 #define for_each_tx_queue(bp, var) \
-	for (var = 0; var < BNX2X_NUM_QUEUES(bp); var++) \
+	for ((var) = 0; (var) < BNX2X_NUM_QUEUES(bp); (var)++) \
 		if (skip_tx_queue(bp, var))	\
 			continue;		\
 		else
 
 #define for_each_nondefault_queue(bp, var) \
-	for (var = 1; var < BNX2X_NUM_QUEUES(bp); var++) \
+	for ((var) = 1; (var) < BNX2X_NUM_QUEUES(bp); (var)++) \
 		if (skip_queue(bp, var))	\
 			continue;		\
 		else
 
+#define for_each_cos_in_tx_queue(fp, var) \
+	for ((var) = 0; (var) < (fp)->max_cos; (var)++)
+
 /* skip rx queue
  * if FCOE l2 support is disabled and this is the fcoe L2 queue
  */
diff --git a/drivers/net/bnx2x/bnx2x_cmn.c b/drivers/net/bnx2x/bnx2x_cmn.c
index 8763625b09d0e05403391d225398758a14c42033..e5fac6244f4a3acefa580a81932dc2b187134e16 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/bnx2x/bnx2x_cmn.c
@@ -47,6 +47,25 @@ static inline void bnx2x_bz_fp(struct bnx2x *bp, int index)
 
 	/* Restore the NAPI object as it has been already initialized */
 	fp->napi = orig_napi;
+
+	fp->bp = bp;
+	fp->index = index;
+	if (IS_ETH_FP(fp))
+		fp->max_cos = bp->max_cos;
+	else
+		/* Special queues support only one CoS */
+		fp->max_cos = 1;
+
+	/*
+	 * set the tpa flag for each queue. The tpa flag determines the queue
+	 * minimal size so it must be set prior to queue memory allocation
+	 */
+	fp->disable_tpa = ((bp->flags & TPA_ENABLE_FLAG) == 0);
+
+#ifdef BCM_CNIC
+	/* We don't want TPA on FCoE, FWD and OOO L2 rings */
+	bnx2x_fcoe(bp, disable_tpa) = 1;
+#endif
 }
 
 /**
@@ -77,10 +96,10 @@ int load_count[2][3] = { {0} }; /* per-path: 0-common, 1-port0, 2-port1 */
 /* free skb in the packet ring at pos idx
  * return idx of last bd freed
  */
-static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fastpath *fp,
+static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata,
 			     u16 idx)
 {
-	struct sw_tx_bd *tx_buf = &fp->tx_buf_ring[idx];
+	struct sw_tx_bd *tx_buf = &txdata->tx_buf_ring[idx];
 	struct eth_tx_start_bd *tx_start_bd;
 	struct eth_tx_bd *tx_data_bd;
 	struct sk_buff *skb = tx_buf->skb;
@@ -91,11 +110,11 @@ static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	prefetch(&skb->end);
 
 	DP(BNX2X_MSG_FP, "fp[%d]: pkt_idx %d  buff @(%p)->skb %p\n",
-	   fp->index, idx, tx_buf, skb);
+	   txdata->txq_index, idx, tx_buf, skb);
 
 	/* unmap first bd */
 	DP(BNX2X_MSG_OFF, "free bd_idx %d\n", bd_idx);
-	tx_start_bd = &fp->tx_desc_ring[bd_idx].start_bd;
+	tx_start_bd = &txdata->tx_desc_ring[bd_idx].start_bd;
 	dma_unmap_single(&bp->pdev->dev, BD_UNMAP_ADDR(tx_start_bd),
 			 BD_UNMAP_LEN(tx_start_bd), DMA_TO_DEVICE);
 
@@ -126,7 +145,7 @@ static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	while (nbd > 0) {
 
 		DP(BNX2X_MSG_OFF, "free frag bd_idx %d\n", bd_idx);
-		tx_data_bd = &fp->tx_desc_ring[bd_idx].reg_bd;
+		tx_data_bd = &txdata->tx_desc_ring[bd_idx].reg_bd;
 		dma_unmap_page(&bp->pdev->dev, BD_UNMAP_ADDR(tx_data_bd),
 			       BD_UNMAP_LEN(tx_data_bd), DMA_TO_DEVICE);
 		if (--nbd)
@@ -142,20 +161,19 @@ static u16 bnx2x_free_tx_pkt(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	return new_cons;
 }
 
-int bnx2x_tx_int(struct bnx2x_fastpath *fp)
+int bnx2x_tx_int(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata)
 {
-	struct bnx2x *bp = fp->bp;
 	struct netdev_queue *txq;
-	u16 hw_cons, sw_cons, bd_cons = fp->tx_bd_cons;
+	u16 hw_cons, sw_cons, bd_cons = txdata->tx_bd_cons;
 
 #ifdef BNX2X_STOP_ON_ERROR
 	if (unlikely(bp->panic))
 		return -1;
 #endif
 
-	txq = netdev_get_tx_queue(bp->dev, fp->index);
-	hw_cons = le16_to_cpu(*fp->tx_cons_sb);
-	sw_cons = fp->tx_pkt_cons;
+	txq = netdev_get_tx_queue(bp->dev, txdata->txq_index);
+	hw_cons = le16_to_cpu(*txdata->tx_cons_sb);
+	sw_cons = txdata->tx_pkt_cons;
 
 	while (sw_cons != hw_cons) {
 		u16 pkt_cons;
@@ -164,14 +182,14 @@ int bnx2x_tx_int(struct bnx2x_fastpath *fp)
 
 		DP(NETIF_MSG_TX_DONE, "queue[%d]: hw_cons %u  sw_cons %u "
 				      " pkt_cons %u\n",
-		   fp->index, hw_cons, sw_cons, pkt_cons);
+		   txdata->txq_index, hw_cons, sw_cons, pkt_cons);
 
-		bd_cons = bnx2x_free_tx_pkt(bp, fp, pkt_cons);
+		bd_cons = bnx2x_free_tx_pkt(bp, txdata, pkt_cons);
 		sw_cons++;
 	}
 
-	fp->tx_pkt_cons = sw_cons;
-	fp->tx_bd_cons = bd_cons;
+	txdata->tx_pkt_cons = sw_cons;
+	txdata->tx_bd_cons = bd_cons;
 
 	/* Need to make the tx_bd_cons update visible to start_xmit()
 	 * before checking for netif_tx_queue_stopped().  Without the
@@ -199,7 +217,7 @@ int bnx2x_tx_int(struct bnx2x_fastpath *fp)
 
 		if ((netif_tx_queue_stopped(txq)) &&
 		    (bp->state == BNX2X_STATE_OPEN) &&
-		    (bnx2x_tx_avail(fp) >= MAX_SKB_FRAGS + 3))
+		    (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 3))
 			netif_tx_wake_queue(txq);
 
 		__netif_tx_unlock(txq);
@@ -777,6 +795,7 @@ static irqreturn_t bnx2x_msix_fp_int(int irq, void *fp_cookie)
 {
 	struct bnx2x_fastpath *fp = fp_cookie;
 	struct bnx2x *bp = fp->bp;
+	u8 cos;
 
 	DP(BNX2X_MSG_FP, "got an MSI-X interrupt on IDX:SB "
 			 "[fp %d fw_sd %d igusb %d]\n",
@@ -790,7 +809,10 @@ static irqreturn_t bnx2x_msix_fp_int(int irq, void *fp_cookie)
 
 	/* Handle Rx and Tx according to MSI-X vector */
 	prefetch(fp->rx_cons_sb);
-	prefetch(fp->tx_cons_sb);
+
+	for_each_cos_in_tx_queue(fp, cos)
+		prefetch(fp->txdata[cos].tx_cons_sb);
+
 	prefetch(&fp->sb_running_index[SM_RX_ID]);
 	napi_schedule(&bnx2x_fp(bp, fp->index, napi));
 
@@ -1060,17 +1082,22 @@ void bnx2x_init_rx_rings(struct bnx2x *bp)
 static void bnx2x_free_tx_skbs(struct bnx2x *bp)
 {
 	int i;
+	u8 cos;
 
 	for_each_tx_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
+		for_each_cos_in_tx_queue(fp, cos) {
+			struct bnx2x_fp_txdata *txdata = &fp->txdata[cos];
 
-		u16 bd_cons = fp->tx_bd_cons;
-		u16 sw_prod = fp->tx_pkt_prod;
-		u16 sw_cons = fp->tx_pkt_cons;
+			u16 bd_cons = txdata->tx_bd_cons;
+			u16 sw_prod = txdata->tx_pkt_prod;
+			u16 sw_cons = txdata->tx_pkt_cons;
 
-		while (sw_cons != sw_prod) {
-			bd_cons = bnx2x_free_tx_pkt(bp, fp, TX_BD(sw_cons));
-			sw_cons++;
+			while (sw_cons != sw_prod) {
+				bd_cons = bnx2x_free_tx_pkt(bp, txdata,
+							    TX_BD(sw_cons));
+				sw_cons++;
+			}
 		}
 	}
 }
@@ -1174,7 +1201,7 @@ void bnx2x_free_irq(struct bnx2x *bp)
 {
 	if (bp->flags & USING_MSIX_FLAG)
 		bnx2x_free_msix_irqs(bp, BNX2X_NUM_ETH_QUEUES(bp) +
-				     CNIC_CONTEXT_USE + 1);
+				     CNIC_PRESENT + 1);
 	else if (bp->flags & USING_MSI_FLAG)
 		free_irq(bp->pdev->irq, bp->dev);
 	else
@@ -1196,6 +1223,7 @@ int bnx2x_enable_msix(struct bnx2x *bp)
 	   bp->msix_table[msix_vec].entry, bp->msix_table[msix_vec].entry);
 	msix_vec++;
 #endif
+	/* We need separate vectors for ETH queues only (not FCoE) */
 	for_each_eth_queue(bp, i) {
 		bp->msix_table[msix_vec].entry = msix_vec;
 		DP(NETIF_MSG_IFUP, "msix_table[%d].entry = %d "
@@ -1203,7 +1231,7 @@ int bnx2x_enable_msix(struct bnx2x *bp)
 		msix_vec++;
 	}
 
-	req_cnt = BNX2X_NUM_ETH_QUEUES(bp) + CNIC_CONTEXT_USE + 1;
+	req_cnt = BNX2X_NUM_ETH_QUEUES(bp) + CNIC_PRESENT + 1;
 
 	rc = pci_enable_msix(bp->pdev, &bp->msix_table[0], req_cnt);
 
@@ -1278,7 +1306,7 @@ static int bnx2x_req_msix_irqs(struct bnx2x *bp)
 	}
 
 	i = BNX2X_NUM_ETH_QUEUES(bp);
-	offset = 1 + CNIC_CONTEXT_USE;
+	offset = 1 + CNIC_PRESENT;
 	netdev_info(bp->dev, "using MSI-X  IRQs: sp %d  fp[%d] %d"
 	       " ... fp[%d] %d\n",
 	       bp->msix_table[0].vector,
@@ -1393,13 +1421,12 @@ u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb)
 
 		/* If ethertype is FCoE or FIP - use FCoE ring */
 		if ((ether_type == ETH_P_FCOE) || (ether_type == ETH_P_FIP))
-			return bnx2x_fcoe(bp, index);
+			return bnx2x_fcoe_tx(bp, txq_index);
 	}
 #endif
 	/* Select a none-FCoE queue:  if FCoE is enabled, exclude FCoE L2 ring
 	 */
-	return __skb_tx_hash(dev, skb,
-			dev->real_num_tx_queues - FCOE_CONTEXT_USE);
+	return __skb_tx_hash(dev, skb, BNX2X_NUM_ETH_QUEUES(bp));
 }
 
 void bnx2x_set_num_queues(struct bnx2x *bp)
@@ -1418,20 +1445,38 @@ void bnx2x_set_num_queues(struct bnx2x *bp)
 	}
 
 	/* Add special queues */
-	bp->num_queues += NONE_ETH_CONTEXT_USE;
+	bp->num_queues += NON_ETH_CONTEXT_USE;
 }
 
 static inline int bnx2x_set_real_num_queues(struct bnx2x *bp)
 {
-	int rc, num = bp->num_queues;
+	int rc, tx, rx;
 
-#ifdef BCM_CNIC
-	if (NO_FCOE(bp))
-		num -= FCOE_CONTEXT_USE;
+	tx = MAX_TXQS_PER_COS * bp->max_cos;
+	rx = BNX2X_NUM_ETH_QUEUES(bp);
 
+/* account for fcoe queue */
+#ifdef BCM_CNIC
+	if (!NO_FCOE(bp)) {
+		rx += FCOE_PRESENT;
+		tx += FCOE_PRESENT;
+	}
 #endif
-	netif_set_real_num_tx_queues(bp->dev, num);
-	rc = netif_set_real_num_rx_queues(bp->dev, num);
+
+	rc = netif_set_real_num_tx_queues(bp->dev, tx);
+	if (rc) {
+		BNX2X_ERR("Failed to set real number of Tx queues: %d\n", rc);
+		return rc;
+	}
+	rc = netif_set_real_num_rx_queues(bp->dev, rx);
+	if (rc) {
+		BNX2X_ERR("Failed to set real number of Rx queues: %d\n", rc);
+		return rc;
+	}
+
+	DP(NETIF_MSG_DRV, "Setting real num queues to (tx, rx) (%d, %d)\n",
+			  tx, rx);
+
 	return rc;
 }
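Note that tx is sized by the fixed MAX_TXQS_PER_COS stride rather than by the
live ETH queue count, so the TXQ_TO_FP()/TXQ_TO_COS() decode keeps working.
For example (assuming 8 ETH queues, max_cos == 2 and FCoE present), tx ends up
as 16 * 2 + 1 = 33 while rx is 8 + 1 = 9.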
 
@@ -1661,28 +1706,18 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 	/* must be called before memory allocation and HW init */
 	bnx2x_ilt_set_info(bp);
 
-	/* zero fastpath structures preserving invariants like napi which are
-	 * allocated only once
+	/*
+	 * Zero fastpath structures while preserving invariants that are set
+	 * up only once: the napi object, fp index, max_cos and the bp
+	 * pointer. Also set fp->disable_tpa.
+	 */
 	for_each_queue(bp, i)
 		bnx2x_bz_fp(bp, i);
 
 	/* Set the receive queues buffer size */
 	bnx2x_set_rx_buf_size(bp);
 
-	/*
-	 * set the tpa flag for each queue. The tpa flag determines the queue
-	 * minimal size so it must be set prior to queue memory allocation
-	 */
-	for_each_queue(bp, i)
-		bnx2x_fp(bp, i, disable_tpa) =
-					((bp->flags & TPA_ENABLE_FLAG) == 0);
-
-#ifdef BCM_CNIC
-	/* We don't want TPA on FCoE L2 ring */
-	bnx2x_fcoe(bp, disable_tpa) = 1;
-#endif
-
 	if (bnx2x_alloc_mem(bp))
 		return -ENOMEM;
 
@@ -1696,6 +1731,12 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 		LOAD_ERROR_EXIT(bp, load_error0);
 	}
 
+	/* configure multi cos mappings in kernel.
+	 * this configuration may be overridden by a multi class queue discipline
+	 * or by a dcbx negotiation result.
+	 */
+	bnx2x_setup_tc(bp->dev, bp->max_cos);
+
 	bnx2x_napi_enable(bp);
 
 	/* Send LOAD_REQUEST command to MCP
@@ -1747,6 +1788,7 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
 		queue_delayed_work(bnx2x_wq, &bp->period_task, 0);
 	} else
 		bp->port.pmf = 0;
+
 	DP(NETIF_MSG_LINK, "pmf %d\n", bp->port.pmf);
 
 	/* Init Function state controlling object */
@@ -2089,6 +2131,7 @@ int bnx2x_set_power_state(struct bnx2x *bp, pci_power_t state)
 int bnx2x_poll(struct napi_struct *napi, int budget)
 {
 	int work_done = 0;
+	u8 cos;
 	struct bnx2x_fastpath *fp = container_of(napi, struct bnx2x_fastpath,
 						 napi);
 	struct bnx2x *bp = fp->bp;
@@ -2101,8 +2144,10 @@ int bnx2x_poll(struct napi_struct *napi, int budget)
 		}
 #endif
 
-		if (bnx2x_has_tx_work(fp))
-			bnx2x_tx_int(fp);
+		for_each_cos_in_tx_queue(fp, cos)
+			if (bnx2x_tx_queue_has_work(&fp->txdata[cos]))
+				bnx2x_tx_int(bp, &fp->txdata[cos]);
 
 		if (bnx2x_has_rx_work(fp)) {
 			work_done += bnx2x_rx_int(fp, budget - work_done);
@@ -2164,7 +2209,7 @@ int bnx2x_poll(struct napi_struct *napi, int budget)
  * in Other Operating Systems(TM)
  */
 static noinline u16 bnx2x_tx_split(struct bnx2x *bp,
-				   struct bnx2x_fastpath *fp,
+				   struct bnx2x_fp_txdata *txdata,
 				   struct sw_tx_bd *tx_buf,
 				   struct eth_tx_start_bd **tx_bd, u16 hlen,
 				   u16 bd_prod, int nbd)
@@ -2185,7 +2230,7 @@ static noinline u16 bnx2x_tx_split(struct bnx2x *bp,
 	/* now get a new data BD
 	 * (after the pbd) and fill it */
 	bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
-	d_tx_bd = &fp->tx_desc_ring[bd_prod].reg_bd;
+	d_tx_bd = &txdata->tx_desc_ring[bd_prod].reg_bd;
 
 	mapping = HILO_U64(le32_to_cpu(h_tx_bd->addr_hi),
 			   le32_to_cpu(h_tx_bd->addr_lo)) + hlen;
@@ -2481,8 +2526,10 @@ static inline u8 bnx2x_set_pbd_csum(struct bnx2x *bp, struct sk_buff *skb,
 netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct bnx2x *bp = netdev_priv(dev);
 	struct bnx2x_fastpath *fp;
 	struct netdev_queue *txq;
+	struct bnx2x_fp_txdata *txdata;
 	struct sw_tx_bd *tx_buf;
 	struct eth_tx_start_bd *tx_start_bd, *first_bd;
 	struct eth_tx_bd *tx_data_bd, *total_pkt_bd = NULL;
@@ -2490,7 +2537,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct eth_tx_parse_bd_e2 *pbd_e2 = NULL;
 	u32 pbd_e2_parsing_data = 0;
 	u16 pkt_prod, bd_prod;
-	int nbd, fp_index;
+	int nbd, txq_index, fp_index, txdata_index;
 	dma_addr_t mapping;
 	u32 xmit_type = bnx2x_xmit_type(bp, skb);
 	int i;
@@ -2504,12 +2551,43 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		return NETDEV_TX_BUSY;
 #endif
 
-	fp_index = skb_get_queue_mapping(skb);
-	txq = netdev_get_tx_queue(dev, fp_index);
+	txq_index = skb_get_queue_mapping(skb);
+	txq = netdev_get_tx_queue(dev, txq_index);
+
+	BUG_ON(txq_index >= MAX_ETH_TXQ_IDX(bp) + FCOE_PRESENT);
+
+	/* decode the fastpath index and the cos index from the txq */
+	fp_index = TXQ_TO_FP(txq_index);
+	txdata_index = TXQ_TO_COS(txq_index);
+
+#ifdef BCM_CNIC
+	/*
+	 * Override the above for the FCoE queue:
+	 *   - FCoE fp entry is right after the ETH entries.
+	 *   - FCoE L2 queue uses bp->txdata[0] only.
+	 */
+	if (unlikely(!NO_FCOE(bp) && (txq_index ==
+				      bnx2x_fcoe_tx(bp, txq_index)))) {
+		fp_index = FCOE_IDX;
+		txdata_index = 0;
+	}
+#endif
+
+	/* enable this debug print to view the transmission queue being used
+	DP(BNX2X_MSG_FP, "indices: txq %d, fp %d, txdata %d",
+	   txq_index, fp_index, txdata_index); */
 
+	/* locate the fastpath and the txdata */
 	fp = &bp->fp[fp_index];
+	txdata = &fp->txdata[txdata_index];
+
+	/* enable this debug print to view the transmission details
+	DP(BNX2X_MSG_FP, "transmitting packet cid %d fp index %d txdata_index %d"
+			" tx_data ptr %p fp pointer %p",
+	   txdata->cid, fp_index, txdata_index, txdata, fp); */
 
-	if (unlikely(bnx2x_tx_avail(fp) < (skb_shinfo(skb)->nr_frags + 3))) {
+	if (unlikely(bnx2x_tx_avail(bp, txdata) <
+		     (skb_shinfo(skb)->nr_frags + 3))) {
 		fp->eth_q_stats.driver_xoff++;
 		netif_tx_stop_queue(txq);
 		BNX2X_ERR("BUG! Tx ring full when queue awake!\n");
@@ -2518,7 +2596,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	DP(NETIF_MSG_TX_QUEUED, "queue[%d]: SKB: summed %x  protocol %x  "
 				"protocol(%x,%x) gso type %x  xmit_type %x\n",
-	   fp_index, skb->ip_summed, skb->protocol, ipv6_hdr(skb)->nexthdr,
+	   txq_index, skb->ip_summed, skb->protocol, ipv6_hdr(skb)->nexthdr,
 	   ip_hdr(skb)->protocol, skb_shinfo(skb)->gso_type, xmit_type);
 
 	eth = (struct ethhdr *)skb->data;
@@ -2567,15 +2645,15 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* get current pkt produced now - advance it just before sending packet
 	 * since mapping of pages may fail and cause packet to be dropped
 	 */
-	pkt_prod = fp->tx_pkt_prod;
-	bd_prod = TX_BD(fp->tx_bd_prod);
+	pkt_prod = txdata->tx_pkt_prod;
+	bd_prod = TX_BD(txdata->tx_bd_prod);
 
 	/* get a tx_buf and first BD
 	 * tx_start_bd may be changed during SPLIT,
 	 * but first_bd will always stay first
 	 */
-	tx_buf = &fp->tx_buf_ring[TX_BD(pkt_prod)];
-	tx_start_bd = &fp->tx_desc_ring[bd_prod].start_bd;
+	tx_buf = &txdata->tx_buf_ring[TX_BD(pkt_prod)];
+	tx_start_bd = &txdata->tx_desc_ring[bd_prod].start_bd;
 	first_bd = tx_start_bd;
 
 	tx_start_bd->bd_flags.as_bitfield = ETH_TX_BD_FLAGS_START_BD;
@@ -2586,13 +2664,13 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	SET_FLAG(tx_start_bd->general_data, ETH_TX_START_BD_HDR_NBDS, 1);
 
 	/* remember the first BD of the packet */
-	tx_buf->first_bd = fp->tx_bd_prod;
+	tx_buf->first_bd = txdata->tx_bd_prod;
 	tx_buf->skb = skb;
 	tx_buf->flags = 0;
 
 	DP(NETIF_MSG_TX_QUEUED,
 	   "sending pkt %u @%p  next_idx %u  bd %u @%p\n",
-	   pkt_prod, tx_buf, fp->tx_pkt_prod, bd_prod, tx_start_bd);
+	   pkt_prod, tx_buf, txdata->tx_pkt_prod, bd_prod, tx_start_bd);
 
 	if (vlan_tx_tag_present(skb)) {
 		tx_start_bd->vlan_or_ethertype =
@@ -2609,7 +2687,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		bnx2x_set_sbd_csum(bp, skb, tx_start_bd, xmit_type);
 
 	if (!CHIP_IS_E1x(bp)) {
-		pbd_e2 = &fp->tx_desc_ring[bd_prod].parse_bd_e2;
+		pbd_e2 = &txdata->tx_desc_ring[bd_prod].parse_bd_e2;
 		memset(pbd_e2, 0, sizeof(struct eth_tx_parse_bd_e2));
 		/* Set PBD in checksum offload case */
 		if (xmit_type & XMIT_CSUM)
@@ -2631,7 +2709,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 					      eth->h_dest);
 		}
 	} else {
-		pbd_e1x = &fp->tx_desc_ring[bd_prod].parse_bd_e1x;
+		pbd_e1x = &txdata->tx_desc_ring[bd_prod].parse_bd_e1x;
 		memset(pbd_e1x, 0, sizeof(struct eth_tx_parse_bd_e1x));
 		/* Set PBD in checksum offload case */
 		if (xmit_type & XMIT_CSUM)
@@ -2663,8 +2741,9 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		tx_start_bd->bd_flags.as_bitfield |= ETH_TX_BD_FLAGS_SW_LSO;
 
 		if (unlikely(skb_headlen(skb) > hlen))
-			bd_prod = bnx2x_tx_split(bp, fp, tx_buf, &tx_start_bd,
-						 hlen, bd_prod, ++nbd);
+			bd_prod = bnx2x_tx_split(bp, txdata, tx_buf,
+						 &tx_start_bd, hlen,
+						 bd_prod, ++nbd);
 		if (!CHIP_IS_E1x(bp))
 			bnx2x_set_pbd_gso_e2(skb, &pbd_e2_parsing_data,
 					     xmit_type);
@@ -2698,14 +2777,15 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			 * before call to bnx2x_free_tx_pkt
 			 */
 			first_bd->nbd = cpu_to_le16(nbd);
-			bnx2x_free_tx_pkt(bp, fp, TX_BD(fp->tx_pkt_prod));
+			bnx2x_free_tx_pkt(bp, txdata,
+					  TX_BD(txdata->tx_pkt_prod));
 			return NETDEV_TX_OK;
 		}
 
 		bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
-		tx_data_bd = &fp->tx_desc_ring[bd_prod].reg_bd;
+		tx_data_bd = &txdata->tx_desc_ring[bd_prod].reg_bd;
 		if (total_pkt_bd == NULL)
-			total_pkt_bd = &fp->tx_desc_ring[bd_prod].reg_bd;
+			total_pkt_bd = &txdata->tx_desc_ring[bd_prod].reg_bd;
 
 		tx_data_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
 		tx_data_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
@@ -2759,7 +2839,7 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		   pbd_e2->parsing_data);
 	DP(NETIF_MSG_TX_QUEUED, "doorbell: nbd %d  bd %u\n", nbd, bd_prod);
 
-	fp->tx_pkt_prod++;
+	txdata->tx_pkt_prod++;
 	/*
 	 * Make sure that the BD data is updated before updating the producer
 	 * since FW might read the BD right after the producer is updated.
@@ -2769,16 +2849,16 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 */
 	wmb();
 
-	fp->tx_db.data.prod += nbd;
+	txdata->tx_db.data.prod += nbd;
 	barrier();
 
-	DOORBELL(bp, fp->cid, fp->tx_db.raw);
+	DOORBELL(bp, txdata->cid, txdata->tx_db.raw);
 
 	mmiowb();
 
-	fp->tx_bd_prod += nbd;
+	txdata->tx_bd_prod += nbd;
 
-	if (unlikely(bnx2x_tx_avail(fp) < MAX_SKB_FRAGS + 3)) {
+	if (unlikely(bnx2x_tx_avail(bp, txdata) < MAX_SKB_FRAGS + 3)) {
 		netif_tx_stop_queue(txq);
 
 		/* paired memory barrier is in bnx2x_tx_int(), we have to keep
@@ -2787,14 +2867,81 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev)
 		smp_mb();
 
 		fp->eth_q_stats.driver_xoff++;
-		if (bnx2x_tx_avail(fp) >= MAX_SKB_FRAGS + 3)
+		if (bnx2x_tx_avail(bp, txdata) >= MAX_SKB_FRAGS + 3)
 			netif_tx_wake_queue(txq);
 	}
-	fp->tx_pkt++;
+	txdata->tx_pkt++;
 
 	return NETDEV_TX_OK;
 }
 
+/**
+ * bnx2x_setup_tc - routine to configure net_device for multi tc
+ *
+ * @dev: net device to configure
+ * @num_tc: number of traffic classes to enable
+ *
+ * callback connected to the ndo_setup_tc function pointer
+ */
+int bnx2x_setup_tc(struct net_device *dev, u8 num_tc)
+{
+	int cos, prio, count, offset;
+	struct bnx2x *bp = netdev_priv(dev);
+
+	/* setup tc must be called under rtnl lock */
+	ASSERT_RTNL();
+
+	/* no traffic classes requested - reset the tc mapping and return */
+	if (!num_tc) {
+		netdev_reset_tc(dev);
+		return 0;
+	}
+
+	/* requested to support too many traffic classes */
+	if (num_tc > bp->max_cos) {
+		DP(NETIF_MSG_TX_ERR, "support for too many traffic classes"
+				     " requested: %d. max supported is %d",
+				     num_tc, bp->max_cos);
+		return -EINVAL;
+	}
+
+	/* declare the number of supported traffic classes */
+	if (netdev_set_num_tc(dev, num_tc)) {
+		DP(NETIF_MSG_TX_ERR, "failed to declare %d traffic classes",
+				     num_tc);
+		return -EINVAL;
+	}
+
+	/* configure priority to traffic class mapping */
+	for (prio = 0; prio < BNX2X_MAX_PRIORITY; prio++) {
+		netdev_set_prio_tc_map(dev, prio, bp->prio_to_cos[prio]);
+		DP(BNX2X_MSG_SP, "mapping priority %d to tc %d",
+		   prio, bp->prio_to_cos[prio]);
+	}
+
+	/* Use this configuration to differentiate tc0 from other COSes
+	   This can be used for ets or pfc, and save the effort of setting
+	   up a multi class queue disc or negotiating DCBX with a switch
+	netdev_set_prio_tc_map(dev, 0, 0);
+	DP(BNX2X_MSG_SP, "mapping priority %d to tc %d", 0, 0);
+	for (prio = 1; prio < 16; prio++) {
+		netdev_set_prio_tc_map(dev, prio, 1);
+		DP(BNX2X_MSG_SP, "mapping priority %d to tc %d", prio, 1);
+	} */
+
+	/* configure traffic class to transmission queue mapping */
+	for (cos = 0; cos < bp->max_cos; cos++) {
+		count = BNX2X_NUM_ETH_QUEUES(bp);
+		offset = cos * MAX_TXQS_PER_COS;
+		netdev_set_tc_queue(dev, cos, count, offset);
+		DP(BNX2X_MSG_SP, "mapping tc %d to offset %d count %d",
+		   cos, offset, count);
+	}
+
+	return 0;
+}
+
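For reference, a minimal sketch of driving the new callback from a context
that does not already hold RTNL (the requirement comes from the ASSERT_RTNL()
above):

	rtnl_lock();
	if (bnx2x_setup_tc(bp->dev, 2))	/* enable two traffic classes */
		BNX2X_ERR("failed to enable 2 traffic classes\n");
	rtnl_unlock();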
 /* called with rtnl_lock */
 int bnx2x_change_mac_addr(struct net_device *dev, void *p)
 {
@@ -2823,6 +2970,7 @@ static void bnx2x_free_fp_mem_at(struct bnx2x *bp, int fp_index)
 {
 	union host_hc_status_block *sb = &bnx2x_fp(bp, fp_index, status_blk);
 	struct bnx2x_fastpath *fp = &bp->fp[fp_index];
+	u8 cos;
 
 	/* Common */
 #ifdef BCM_CNIC
@@ -2871,10 +3019,18 @@ static void bnx2x_free_fp_mem_at(struct bnx2x *bp, int fp_index)
 	/* Tx */
 	if (!skip_tx_queue(bp, fp_index)) {
 		/* fastpath tx rings: tx_buf tx_desc */
-		BNX2X_FREE(bnx2x_fp(bp, fp_index, tx_buf_ring));
-		BNX2X_PCI_FREE(bnx2x_fp(bp, fp_index, tx_desc_ring),
-			       bnx2x_fp(bp, fp_index, tx_desc_mapping),
-			       sizeof(union eth_tx_bd_types) * NUM_TX_BD);
+		for_each_cos_in_tx_queue(fp, cos) {
+			struct bnx2x_fp_txdata *txdata = &fp->txdata[cos];
+
+			DP(BNX2X_MSG_SP,
+			   "freeing tx memory of fp %d cos %d cid %d",
+			   fp_index, cos, txdata->cid);
+
+			BNX2X_FREE(txdata->tx_buf_ring);
+			BNX2X_PCI_FREE(txdata->tx_desc_ring,
+				txdata->tx_desc_mapping,
+				sizeof(union eth_tx_bd_types) * NUM_TX_BD);
+		}
 	}
 	/* end of fastpath */
 }
@@ -2907,19 +3063,17 @@ static int bnx2x_alloc_fp_mem_at(struct bnx2x *bp, int index)
 	union host_hc_status_block *sb;
 	struct bnx2x_fastpath *fp = &bp->fp[index];
 	int ring_size = 0;
+	u8 cos;
 
 	/* if rx_ring_size specified - use it */
 	int rx_ring_size = bp->rx_ring_size ? bp->rx_ring_size :
-			   MAX_RX_AVAIL/bp->num_queues;
+			   MAX_RX_AVAIL/BNX2X_NUM_RX_QUEUES(bp);
 
 	/* allocate at least number of buffers required by FW */
-	rx_ring_size = max_t(int, fp->disable_tpa ? MIN_RX_SIZE_NONTPA :
+	rx_ring_size = max_t(int, bp->disable_tpa ? MIN_RX_SIZE_NONTPA :
 						    MIN_RX_SIZE_TPA,
 				  rx_ring_size);
 
-	bnx2x_fp(bp, index, bp) = bp;
-	bnx2x_fp(bp, index, index) = index;
-
 	/* Common */
 	sb = &bnx2x_fp(bp, index, status_blk);
 #ifdef BCM_CNIC
@@ -2947,11 +3101,19 @@ static int bnx2x_alloc_fp_mem_at(struct bnx2x *bp, int index)
 	/* Tx */
 	if (!skip_tx_queue(bp, index)) {
 		/* fastpath tx rings: tx_buf tx_desc */
-		BNX2X_ALLOC(bnx2x_fp(bp, index, tx_buf_ring),
+		for_each_cos_in_tx_queue(fp, cos) {
+			struct bnx2x_fp_txdata *txdata = &fp->txdata[cos];
+
+			DP(BNX2X_MSG_SP, "allocating tx memory of "
+					 "fp %d cos %d",
+			   index, cos);
+
+			BNX2X_ALLOC(txdata->tx_buf_ring,
 				sizeof(struct sw_tx_bd) * NUM_TX_BD);
-		BNX2X_PCI_ALLOC(bnx2x_fp(bp, index, tx_desc_ring),
-				&bnx2x_fp(bp, index, tx_desc_mapping),
+			BNX2X_PCI_ALLOC(txdata->tx_desc_ring,
+				&txdata->tx_desc_mapping,
 				sizeof(union eth_tx_bd_types) * NUM_TX_BD);
+		}
 	}
 
 	/* Rx */
@@ -2994,7 +3156,7 @@ static int bnx2x_alloc_fp_mem_at(struct bnx2x *bp, int index)
 						index, ring_size);
 	/* FW will drop all packets if queue is not big enough,
 	 * In these cases we disable the queue
-	 * Min size diferent for TPA and non-TPA queues
+	 * Min size is different for OOO, TPA and non-TPA queues
 	 */
 	if (ring_size < (fp->disable_tpa ?
 				MIN_RX_SIZE_NONTPA : MIN_RX_SIZE_TPA)) {
@@ -3012,12 +3174,14 @@ int bnx2x_alloc_fp_mem(struct bnx2x *bp)
 	/**
 	 * 1. Allocate FP for leading - fatal if error
 	 * 2. {CNIC} Allocate FCoE FP - fatal if error
-	 * 3. Allocate RSS - fix number of queues if error
+	 * 3. {CNIC} Allocate OOO + FWD - disable OOO if error
+	 * 4. Allocate RSS - fix number of queues if error
 	 */
 
 	/* leading */
 	if (bnx2x_alloc_fp_mem_at(bp, 0))
 		return -ENOMEM;
+
 #ifdef BCM_CNIC
 	if (!NO_FCOE(bp))
 		/* FCoE */
@@ -3027,6 +3191,7 @@ int bnx2x_alloc_fp_mem(struct bnx2x *bp)
 			 */
 			return -ENOMEM;
 #endif
+
 	/* RSS */
 	for_each_nondefault_eth_queue(bp, i)
 		if (bnx2x_alloc_fp_mem_at(bp, i))
@@ -3044,7 +3209,7 @@ int bnx2x_alloc_fp_mem(struct bnx2x *bp)
 		 * FCOE_IDX < FWD_IDX < OOO_IDX
 		 */
 
-		/* move FCoE fp */
+		/* move FCoE fp even if NO_FCOE_FLAG is on */
 		bnx2x_move_fp(bp, FCOE_IDX, FCOE_IDX - delta);
 #endif
 		bp->num_queues -= delta;
@@ -3067,16 +3232,23 @@ int __devinit bnx2x_alloc_mem_bp(struct bnx2x *bp)
 	struct bnx2x_fastpath *fp;
 	struct msix_entry *tbl;
 	struct bnx2x_ilt *ilt;
+	int msix_table_size = 0;
+
+	/*
+	 * The biggest MSI-X table we might need is the maximum number of fast
+	 * path IGU SBs plus the default SB (for PF).
+	 */
+	msix_table_size = bp->igu_sb_cnt + 1;
 
-	/* fp array */
-	fp = kzalloc(L2_FP_COUNT(bp->l2_cid_count)*sizeof(*fp), GFP_KERNEL);
+	/* fp array: RSS plus CNIC related L2 queues */
+	fp = kzalloc((BNX2X_MAX_RSS_COUNT(bp) + NON_ETH_CONTEXT_USE) *
+		     sizeof(*fp), GFP_KERNEL);
 	if (!fp)
 		goto alloc_err;
 	bp->fp = fp;
 
 	/* msix table */
-	tbl = kzalloc((FP_SB_COUNT(bp->l2_cid_count) + 1) * sizeof(*tbl),
-				  GFP_KERNEL);
+	tbl = kzalloc(msix_table_size * sizeof(*tbl), GFP_KERNEL);
 	if (!tbl)
 		goto alloc_err;
 	bp->msix_table = tbl;
diff --git a/drivers/net/bnx2x/bnx2x_cmn.h b/drivers/net/bnx2x/bnx2x_cmn.h
index c016e20c5c2b1b32188c8f09e056d35204709e77..595d4cdada3e3f1e69271e76e502e26b776d2c94 100644
--- a/drivers/net/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/bnx2x/bnx2x_cmn.h
@@ -439,6 +439,9 @@ int bnx2x_nic_load(struct bnx2x *bp, int load_mode);
 /* hard_xmit callback */
 netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev);
 
+/* setup_tc callback */
+int bnx2x_setup_tc(struct net_device *dev, u8 num_tc);
+
 /* select_queue callback */
 u16 bnx2x_select_queue(struct net_device *dev, struct sk_buff *skb);
 
@@ -454,7 +457,7 @@ void bnx2x_update_rx_prod(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			u16 bd_prod, u16 rx_comp_prod, u16 rx_sge_prod);
 
 /* NAPI poll Tx part */
-int bnx2x_tx_int(struct bnx2x_fastpath *fp);
+int bnx2x_tx_int(struct bnx2x *bp, struct bnx2x_fp_txdata *txdata);
 
 /* suspend/resume callbacks */
 int bnx2x_suspend(struct pci_dev *pdev, pm_message_t state);
@@ -715,21 +718,22 @@ static inline u16 bnx2x_ack_int(struct bnx2x *bp)
 		return bnx2x_igu_ack_int(bp);
 }
 
-static inline int bnx2x_has_tx_work_unload(struct bnx2x_fastpath *fp)
+static inline int bnx2x_has_tx_work_unload(struct bnx2x_fp_txdata *txdata)
 {
 	/* Tell compiler that consumer and producer can change */
 	barrier();
-	return fp->tx_pkt_prod != fp->tx_pkt_cons;
+	return txdata->tx_pkt_prod != txdata->tx_pkt_cons;
 }
 
-static inline u16 bnx2x_tx_avail(struct bnx2x_fastpath *fp)
+static inline u16 bnx2x_tx_avail(struct bnx2x *bp,
+				 struct bnx2x_fp_txdata *txdata)
 {
 	s16 used;
 	u16 prod;
 	u16 cons;
 
-	prod = fp->tx_bd_prod;
-	cons = fp->tx_bd_cons;
+	prod = txdata->tx_bd_prod;
+	cons = txdata->tx_bd_cons;
 
 	/* NUM_TX_RINGS = number of "next-page" entries
 	   It will be used as a threshold */
@@ -737,21 +741,30 @@ static inline u16 bnx2x_tx_avail(struct bnx2x_fastpath *fp)
 
 #ifdef BNX2X_STOP_ON_ERROR
 	WARN_ON(used < 0);
-	WARN_ON(used > fp->bp->tx_ring_size);
-	WARN_ON((fp->bp->tx_ring_size - used) > MAX_TX_AVAIL);
+	WARN_ON(used > bp->tx_ring_size);
+	WARN_ON((bp->tx_ring_size - used) > MAX_TX_AVAIL);
 #endif
 
-	return (s16)(fp->bp->tx_ring_size) - used;
+	return (s16)(bp->tx_ring_size) - used;
 }
 
-static inline int bnx2x_has_tx_work(struct bnx2x_fastpath *fp)
+static inline int bnx2x_tx_queue_has_work(struct bnx2x_fp_txdata *txdata)
 {
 	u16 hw_cons;
 
 	/* Tell compiler that status block fields can change */
 	barrier();
-	hw_cons = le16_to_cpu(*fp->tx_cons_sb);
-	return hw_cons != fp->tx_pkt_cons;
+	hw_cons = le16_to_cpu(*txdata->tx_cons_sb);
+	return hw_cons != txdata->tx_pkt_cons;
+}
+
+static inline bool bnx2x_has_tx_work(struct bnx2x_fastpath *fp)
+{
+	u8 cos;
+	for_each_cos_in_tx_queue(fp, cos)
+		if (bnx2x_tx_queue_has_work(&fp->txdata[cos]))
+			return true;
+	return false;
 }
 
 static inline int bnx2x_has_rx_work(struct bnx2x_fastpath *fp)
@@ -963,7 +976,10 @@ static inline int bnx2x_func_start(struct bnx2x *bp)
 	/* Function parameters */
 	start_params->mf_mode = bp->mf_mode;
 	start_params->sd_vlan_tag = bp->mf_ov;
+	if (CHIP_IS_E1x(bp))
 		start_params->network_cos_mode = OVERRIDE_COS;
+	else
+		start_params->network_cos_mode = STATIC_COS;
 
 	return bnx2x_func_state_change(bp, &func_params);
 }
@@ -1023,39 +1039,41 @@ static inline void bnx2x_free_tpa_pool(struct bnx2x *bp,
 	}
 }
 
-static inline void bnx2x_init_tx_ring_one(struct bnx2x_fastpath *fp)
+static inline void bnx2x_init_tx_ring_one(struct bnx2x_fp_txdata *txdata)
 {
 	int i;
 
 	for (i = 1; i <= NUM_TX_RINGS; i++) {
 		struct eth_tx_next_bd *tx_next_bd =
-			&fp->tx_desc_ring[TX_DESC_CNT * i - 1].next_bd;
+			&txdata->tx_desc_ring[TX_DESC_CNT * i - 1].next_bd;
 
 		tx_next_bd->addr_hi =
-			cpu_to_le32(U64_HI(fp->tx_desc_mapping +
+			cpu_to_le32(U64_HI(txdata->tx_desc_mapping +
 				    BCM_PAGE_SIZE*(i % NUM_TX_RINGS)));
 		tx_next_bd->addr_lo =
-			cpu_to_le32(U64_LO(fp->tx_desc_mapping +
+			cpu_to_le32(U64_LO(txdata->tx_desc_mapping +
 				    BCM_PAGE_SIZE*(i % NUM_TX_RINGS)));
 	}
 
-	SET_FLAG(fp->tx_db.data.header.header, DOORBELL_HDR_DB_TYPE, 1);
-	fp->tx_db.data.zero_fill1 = 0;
-	fp->tx_db.data.prod = 0;
+	SET_FLAG(txdata->tx_db.data.header.header, DOORBELL_HDR_DB_TYPE, 1);
+	txdata->tx_db.data.zero_fill1 = 0;
+	txdata->tx_db.data.prod = 0;
 
-	fp->tx_pkt_prod = 0;
-	fp->tx_pkt_cons = 0;
-	fp->tx_bd_prod = 0;
-	fp->tx_bd_cons = 0;
-	fp->tx_pkt = 0;
+	txdata->tx_pkt_prod = 0;
+	txdata->tx_pkt_cons = 0;
+	txdata->tx_bd_prod = 0;
+	txdata->tx_bd_cons = 0;
+	txdata->tx_pkt = 0;
 }
 
 static inline void bnx2x_init_tx_rings(struct bnx2x *bp)
 {
 	int i;
+	u8 cos;
 
 	for_each_tx_queue(bp, i)
-		bnx2x_init_tx_ring_one(&bp->fp[i]);
+		for_each_cos_in_tx_queue(&bp->fp[i], cos)
+			bnx2x_init_tx_ring_one(&bp->fp[i].txdata[cos]);
 }
 
 static inline void bnx2x_set_next_page_rx_bd(struct bnx2x_fastpath *fp)
@@ -1257,12 +1275,23 @@ static inline u32 bnx2x_rx_ustorm_prods_offset(struct bnx2x_fastpath *fp)
 		return USTORM_RX_PRODS_E1X_OFFSET(BP_PORT(bp), fp->cl_id);
 }
 
+static inline void bnx2x_init_txdata(struct bnx2x *bp,
+	struct bnx2x_fp_txdata *txdata, u32 cid, int txq_index,
+	__le16 *tx_cons_sb)
+{
+	txdata->cid = cid;
+	txdata->txq_index = txq_index;
+	txdata->tx_cons_sb = tx_cons_sb;
+
+	DP(BNX2X_MSG_SP, "created tx data cid %d, txq %d",
+	   txdata->cid, txdata->txq_index);
+}
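An ETH fastpath would plausibly wire its per-CoS txdata with the same helper,
combining the CID/TXQ macros and SB indices from bnx2x.h (a sketch; the ETH
init path itself is not part of this hunk):

	for_each_cos_in_tx_queue(fp, cos)
		bnx2x_init_txdata(bp, &fp->txdata[cos],
				  CID_COS_TO_TX_ONLY_CID(fp->cid, cos),
				  FP_COS_TO_TXQ(fp, cos),
				  BNX2X_TX_SB_INDEX_BASE + cos);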
 
 #ifdef BCM_CNIC
 static inline u8 bnx2x_cnic_eth_cl_id(struct bnx2x *bp, u8 cl_idx)
 {
 	return bp->cnic_base_cl_id + cl_idx +
-		(bp->pf_num >> 1) * NONE_ETH_CONTEXT_USE;
+		(bp->pf_num >> 1) * NON_ETH_CONTEXT_USE;
 }
 
 static inline u8 bnx2x_cnic_fw_sb_id(struct bnx2x *bp)
@@ -1293,10 +1322,13 @@ static inline void bnx2x_init_fcoe_fp(struct bnx2x *bp)
 	bnx2x_fcoe(bp, cid) = BNX2X_FCOE_ETH_CID;
 	bnx2x_fcoe(bp, fw_sb_id) = DEF_SB_ID;
 	bnx2x_fcoe(bp, igu_sb_id) = bp->igu_dsb_id;
-	bnx2x_fcoe(bp, bp) = bp;
-	bnx2x_fcoe(bp, index) = FCOE_IDX;
 	bnx2x_fcoe(bp, rx_cons_sb) = BNX2X_FCOE_L2_RX_INDEX;
-	bnx2x_fcoe(bp, tx_cons_sb) = BNX2X_FCOE_L2_TX_INDEX;
+
+	bnx2x_init_txdata(bp, &bnx2x_fcoe(bp, txdata[0]),
+			  fp->cid, FCOE_TXQ_IDX(bp), BNX2X_FCOE_L2_TX_INDEX);
+
+	DP(BNX2X_MSG_SP, "created fcoe tx data (fp index %d)", fp->index);
+
 	/* qZone id equals to FW (per path) client id */
 	bnx2x_fcoe(bp, cl_qzone_id) = bnx2x_fp_qzone_id(fp);
 	/* init shortcut */
@@ -1306,9 +1338,13 @@ static inline void bnx2x_init_fcoe_fp(struct bnx2x *bp)
 	/* Configure Queue State object */
 	__set_bit(BNX2X_Q_TYPE_HAS_RX, &q_type);
 	__set_bit(BNX2X_Q_TYPE_HAS_TX, &q_type);
-	bnx2x_init_queue_obj(bp, &fp->q_obj, fp->cl_id, fp->cid, BP_FUNC(bp),
-		bnx2x_sp(bp, q_rdata), bnx2x_sp_mapping(bp, q_rdata),
-			      q_type);
+
+	/* No multi-CoS for FCoE L2 client */
+	BUG_ON(fp->max_cos != 1);
+
+	bnx2x_init_queue_obj(bp, &fp->q_obj, fp->cl_id, &fp->cid, 1,
+			     BP_FUNC(bp), bnx2x_sp(bp, q_rdata),
+			     bnx2x_sp_mapping(bp, q_rdata), q_type);
 
 	DP(NETIF_MSG_IFUP, "queue[%d]: bnx2x_init_sb(%p,%p) cl_id %d fw_sb %d "
 			   "igu_sb %d\n",
@@ -1318,15 +1354,16 @@ static inline void bnx2x_init_fcoe_fp(struct bnx2x *bp)
 #endif
 
 static inline int bnx2x_clean_tx_queue(struct bnx2x *bp,
-				       struct bnx2x_fastpath *fp)
+				       struct bnx2x_fp_txdata *txdata)
 {
 	int cnt = 1000;
 
-	while (bnx2x_has_tx_work_unload(fp)) {
+	while (bnx2x_has_tx_work_unload(txdata)) {
 		if (!cnt) {
 			BNX2X_ERR("timeout waiting for queue[%d]: "
-				 "fp->tx_pkt_prod(%d) != fp->tx_pkt_cons(%d)\n",
-				  fp->index, fp->tx_pkt_prod, fp->tx_pkt_cons);
+				 "txdata->tx_pkt_prod(%d) != txdata->tx_pkt_cons(%d)\n",
+				  txdata->txq_index, txdata->tx_pkt_prod,
+				  txdata->tx_pkt_cons);
 #ifdef BNX2X_STOP_ON_ERROR
 			bnx2x_panic();
 			return -EBUSY;
diff --git a/drivers/net/bnx2x/bnx2x_dcb.c b/drivers/net/bnx2x/bnx2x_dcb.c
index 45cf3ceef144e3352f8e041fefa32d97a5829563..3bfba44961d3b66059777b47bbfacc7281e0fad2 100644
--- a/drivers/net/bnx2x/bnx2x_dcb.c
+++ b/drivers/net/bnx2x/bnx2x_dcb.c
@@ -653,6 +653,26 @@ static inline void bnx2x_update_drv_flags(struct bnx2x *bp, u32 flags, u32 set)
 	}
 }
 
+static inline void bnx2x_dcbx_update_tc_mapping(struct bnx2x *bp)
+{
+	u8 prio, cos;
+	for (cos = 0; cos < bp->dcbx_port_params.ets.num_of_cos; cos++) {
+		for (prio = 0; prio < BNX2X_MAX_PRIORITY; prio++) {
+			if (bp->dcbx_port_params.ets.cos_params[cos].pri_bitmask
+			    & (1 << prio)) {
+				bp->prio_to_cos[prio] = cos;
+			}
+		}
+	}
+
+	/* setup tc must be called under rtnl lock, but we can't take it here
+	 * as we are handling an attention on a work queue which must be
+	 * flushed in some rtnl-locked contexts (e.g. if down)
+	 */
+	if (!test_and_set_bit(BNX2X_SP_RTNL_SETUP_TC, &bp->sp_rtnl_state))
+		schedule_delayed_work(&bp->sp_rtnl_task, 0);
+}
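A worked example of the mapping loop above: with two negotiated COSes where
cos 0 carries priorities 0-3 (pri_bitmask 0x0f) and cos 1 carries priorities
4-7 (pri_bitmask 0xf0), prio_to_cos ends up as {0, 0, 0, 0, 1, 1, 1, 1}.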
+
 void bnx2x_dcbx_set_params(struct bnx2x *bp, u32 state)
 {
 	switch (state) {
@@ -690,6 +710,11 @@ void bnx2x_dcbx_set_params(struct bnx2x *bp, u32 state)
 #endif
 			bnx2x_dcbx_stop_hw_tx(bp);
 
+			/* reconfigure the netdevice with the results of the new
+			 * dcbx negotiation.
+			 */
+			bnx2x_dcbx_update_tc_mapping(bp);
+
 			return;
 		}
 	case BNX2X_DCBX_STATE_TX_PAUSED:
diff --git a/drivers/net/bnx2x/bnx2x_ethtool.c b/drivers/net/bnx2x/bnx2x_ethtool.c
index 1a3ed418946df4fd095507c62a4e028b71959fb5..ac0223135b7cd0d61d9d2dba867c5f515bc82269 100644
--- a/drivers/net/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/bnx2x/bnx2x_ethtool.c
@@ -1616,6 +1616,7 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode)
 	unsigned char *packet;
 	struct bnx2x_fastpath *fp_rx = &bp->fp[0];
 	struct bnx2x_fastpath *fp_tx = &bp->fp[0];
+	struct bnx2x_fp_txdata *txdata = &fp_tx->txdata[0];
 	u16 tx_start_idx, tx_idx;
 	u16 rx_start_idx, rx_idx;
 	u16 pkt_prod, bd_prod, rx_comp_cons;
@@ -1670,17 +1671,17 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode)
 
 	/* send the loopback packet */
 	num_pkts = 0;
-	tx_start_idx = le16_to_cpu(*fp_tx->tx_cons_sb);
+	tx_start_idx = le16_to_cpu(*txdata->tx_cons_sb);
 	rx_start_idx = le16_to_cpu(*fp_rx->rx_cons_sb);
 
-	pkt_prod = fp_tx->tx_pkt_prod++;
-	tx_buf = &fp_tx->tx_buf_ring[TX_BD(pkt_prod)];
-	tx_buf->first_bd = fp_tx->tx_bd_prod;
+	pkt_prod = txdata->tx_pkt_prod++;
+	tx_buf = &txdata->tx_buf_ring[TX_BD(pkt_prod)];
+	tx_buf->first_bd = txdata->tx_bd_prod;
 	tx_buf->skb = skb;
 	tx_buf->flags = 0;
 
-	bd_prod = TX_BD(fp_tx->tx_bd_prod);
-	tx_start_bd = &fp_tx->tx_desc_ring[bd_prod].start_bd;
+	bd_prod = TX_BD(txdata->tx_bd_prod);
+	tx_start_bd = &txdata->tx_desc_ring[bd_prod].start_bd;
 	tx_start_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
 	tx_start_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
 	tx_start_bd->nbd = cpu_to_le16(2); /* start + pbd */
@@ -1697,27 +1698,27 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode)
 	/* turn on parsing and get a BD */
 	bd_prod = TX_BD(NEXT_TX_IDX(bd_prod));
 
-	pbd_e1x = &fp_tx->tx_desc_ring[bd_prod].parse_bd_e1x;
-	pbd_e2 = &fp_tx->tx_desc_ring[bd_prod].parse_bd_e2;
+	pbd_e1x = &txdata->tx_desc_ring[bd_prod].parse_bd_e1x;
+	pbd_e2 = &txdata->tx_desc_ring[bd_prod].parse_bd_e2;
 
 	memset(pbd_e2, 0, sizeof(struct eth_tx_parse_bd_e2));
 	memset(pbd_e1x, 0, sizeof(struct eth_tx_parse_bd_e1x));
 
 	wmb();
 
-	fp_tx->tx_db.data.prod += 2;
+	txdata->tx_db.data.prod += 2;
 	barrier();
-	DOORBELL(bp, fp_tx->index, fp_tx->tx_db.raw);
+	DOORBELL(bp, txdata->cid, txdata->tx_db.raw);
 
 	mmiowb();
 	barrier();
 
 	num_pkts++;
-	fp_tx->tx_bd_prod += 2; /* start + pbd */
+	txdata->tx_bd_prod += 2; /* start + pbd */
 
 	udelay(100);
 
-	tx_idx = le16_to_cpu(*fp_tx->tx_cons_sb);
+	tx_idx = le16_to_cpu(*txdata->tx_cons_sb);
 	if (tx_idx != tx_start_idx + num_pkts)
 		goto test_loopback_exit;
 
@@ -1731,7 +1732,7 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode)
 		 * bnx2x_tx_int()), as both are taking netif_tx_lock().
 		 */
 		local_bh_disable();
-		bnx2x_tx_int(fp_tx);
+		bnx2x_tx_int(bp, txdata);
 		local_bh_enable();
 	}
 
diff --git a/drivers/net/bnx2x/bnx2x_hsi.h b/drivers/net/bnx2x/bnx2x_hsi.h
index 0692d75756df07a24531326c79653a862b9c1a81..ce3b5662ca5acdde6f298d13466e60c5741b143e 100644
--- a/drivers/net/bnx2x/bnx2x_hsi.h
+++ b/drivers/net/bnx2x/bnx2x_hsi.h
@@ -2545,7 +2545,7 @@ struct host_func_stats {
 
 #define BCM_5710_FW_MAJOR_VERSION			7
 #define BCM_5710_FW_MINOR_VERSION			0
-#define BCM_5710_FW_REVISION_VERSION		20
+#define BCM_5710_FW_REVISION_VERSION		23
 #define BCM_5710_FW_ENGINEERING_VERSION		0
 #define BCM_5710_FW_COMPILE_FLAGS			1
 
diff --git a/drivers/net/bnx2x/bnx2x_init.h b/drivers/net/bnx2x/bnx2x_init.h
index df9f196dd6e854473a62a23ec99936d0ff1cb524..82795a8349f6342c9a65371c51c79eb625482aa2 100644
--- a/drivers/net/bnx2x/bnx2x_init.h
+++ b/drivers/net/bnx2x/bnx2x_init.h
@@ -128,11 +128,10 @@ enum {
 	MODE_MF_NIV                    = 0x00000800,
 	MODE_E3_A0                     = 0x00001000,
 	MODE_E3_B0                     = 0x00002000,
-	MODE_COS_BC                    = 0x00004000,
-	MODE_COS3                      = 0x00008000,
-	MODE_COS6                      = 0x00010000,
-	MODE_LITTLE_ENDIAN             = 0x00020000,
-	MODE_BIG_ENDIAN                = 0x00040000,
+	MODE_COS3                      = 0x00004000,
+	MODE_COS6                      = 0x00008000,
+	MODE_LITTLE_ENDIAN             = 0x00010000,
+	MODE_BIG_ENDIAN                = 0x00020000,
 };
 
 /* Init Blocks */
@@ -179,7 +178,7 @@ enum {
 #define BNX2X_TOE_Q		3
 #define BNX2X_TOE_ACK_Q		6
 #define BNX2X_ISCSI_Q		9
-#define BNX2X_ISCSI_ACK_Q	8
+#define BNX2X_ISCSI_ACK_Q	11
 #define BNX2X_FCOE_Q		10
 
 /* Vnics per mode */
@@ -257,14 +256,16 @@ static inline void bnx2x_map_q_cos(struct bnx2x *bp, u32 q_num, u32 new_cos)
 }
 
 /* Configures the QM according to the specified per-traffic-type COSes */
-static inline void bnx2x_dcb_config_qm(struct bnx2x *bp,
+static inline void bnx2x_dcb_config_qm(struct bnx2x *bp, enum cos_mode mode,
 				       struct priority_cos *traffic_cos)
 {
 	bnx2x_map_q_cos(bp, BNX2X_FCOE_Q,
 			traffic_cos[LLFC_TRAFFIC_TYPE_FCOE].cos);
 	bnx2x_map_q_cos(bp, BNX2X_ISCSI_Q,
 			traffic_cos[LLFC_TRAFFIC_TYPE_ISCSI].cos);
-	if (INIT_MODE_FLAGS(bp) & MODE_COS_BC) {
+	bnx2x_map_q_cos(bp, BNX2X_ISCSI_ACK_Q,
+		traffic_cos[LLFC_TRAFFIC_TYPE_ISCSI].cos);
+	if (mode != STATIC_COS) {
 		/* required only in backward compatible COS mode */
 		bnx2x_map_q_cos(bp, BNX2X_ETH_Q,
 				traffic_cos[LLFC_TRAFFIC_TYPE_NW].cos);
@@ -272,8 +273,6 @@ static inline void bnx2x_dcb_config_qm(struct bnx2x *bp,
 				traffic_cos[LLFC_TRAFFIC_TYPE_NW].cos);
 		bnx2x_map_q_cos(bp, BNX2X_TOE_ACK_Q,
 				traffic_cos[LLFC_TRAFFIC_TYPE_NW].cos);
-		bnx2x_map_q_cos(bp, BNX2X_ISCSI_ACK_Q,
-				traffic_cos[LLFC_TRAFFIC_TYPE_ISCSI].cos);
 	}
 }
 
diff --git a/drivers/net/bnx2x/bnx2x_main.c b/drivers/net/bnx2x/bnx2x_main.c
index 53f4ec3d1d9ebae82c710950b85b07a28a6f76b2..8a374a77cdc96928be0c08861123c9993cdbe87b 100644
--- a/drivers/net/bnx2x/bnx2x_main.c
+++ b/drivers/net/bnx2x/bnx2x_main.c
@@ -767,6 +767,7 @@ void bnx2x_panic_dump(struct bnx2x *bp)
 	int func = BP_FUNC(bp);
 #ifdef BNX2X_STOP_ON_ERROR
 	u16 start = 0, end = 0;
+	u8 cos;
 #endif
 
 	bp->stats_state = STATS_STATE_DISABLED;
@@ -822,8 +823,9 @@ void bnx2x_panic_dump(struct bnx2x *bp)
 			CHIP_IS_E1x(bp) ?
 			sb_data_e1x.index_data :
 			sb_data_e2.index_data;
-		int data_size;
+		u8 data_size, cos;
 		u32 *sb_data_p;
+		struct bnx2x_fp_txdata txdata;
 
 		/* Rx */
 		BNX2X_ERR("fp%d: rx_bd_prod(0x%x)  rx_bd_cons(0x%x)"
@@ -838,11 +840,17 @@ void bnx2x_panic_dump(struct bnx2x *bp)
 			  le16_to_cpu(fp->fp_hc_idx));
 
 		/* Tx */
-		BNX2X_ERR("fp%d: tx_pkt_prod(0x%x)  tx_pkt_cons(0x%x)"
-			  "  tx_bd_prod(0x%x)  tx_bd_cons(0x%x)"
-			  "  *tx_cons_sb(0x%x)\n",
-			  i, fp->tx_pkt_prod, fp->tx_pkt_cons, fp->tx_bd_prod,
-			  fp->tx_bd_cons, le16_to_cpu(*fp->tx_cons_sb));
+		for_each_cos_in_tx_queue(fp, cos) {
+			txdata = fp->txdata[cos];
+			BNX2X_ERR("fp%d: tx_pkt_prod(0x%x)  tx_pkt_cons(0x%x)"
+				  "  tx_bd_prod(0x%x)  tx_bd_cons(0x%x)"
+				  "  *tx_cons_sb(0x%x)\n",
+				  i, txdata.tx_pkt_prod,
+				  txdata.tx_pkt_cons, txdata.tx_bd_prod,
+				  txdata.tx_bd_cons,
+				  le16_to_cpu(*txdata.tx_cons_sb));
+		}
 
 		loop = CHIP_IS_E1x(bp) ?
 			HC_SB_MAX_INDICES_E1X : HC_SB_MAX_INDICES_E2;
@@ -961,23 +969,31 @@ void bnx2x_panic_dump(struct bnx2x *bp)
 	/* Tx */
 	for_each_tx_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
+		for_each_cos_in_tx_queue(fp, cos) {
+			struct bnx2x_fp_txdata *txdata = &fp->txdata[cos];
+
+			start = TX_BD(le16_to_cpu(*txdata->tx_cons_sb) - 10);
+			end = TX_BD(le16_to_cpu(*txdata->tx_cons_sb) + 245);
+			for (j = start; j != end; j = TX_BD(j + 1)) {
+				struct sw_tx_bd *sw_bd =
+					&txdata->tx_buf_ring[j];
+
+				BNX2X_ERR("fp%d: txdata %d, "
+					  "packet[%x]=[%p,%x]\n",
+					  i, cos, j, sw_bd->skb,
+					  sw_bd->first_bd);
+			}
 
-		start = TX_BD(le16_to_cpu(*fp->tx_cons_sb) - 10);
-		end = TX_BD(le16_to_cpu(*fp->tx_cons_sb) + 245);
-		for (j = start; j != end; j = TX_BD(j + 1)) {
-			struct sw_tx_bd *sw_bd = &fp->tx_buf_ring[j];
-
-			BNX2X_ERR("fp%d: packet[%x]=[%p,%x]\n",
-				  i, j, sw_bd->skb, sw_bd->first_bd);
-		}
-
-		start = TX_BD(fp->tx_bd_cons - 10);
-		end = TX_BD(fp->tx_bd_cons + 254);
-		for (j = start; j != end; j = TX_BD(j + 1)) {
-			u32 *tx_bd = (u32 *)&fp->tx_desc_ring[j];
+			start = TX_BD(txdata->tx_bd_cons - 10);
+			end = TX_BD(txdata->tx_bd_cons + 254);
+			for (j = start; j != end; j = TX_BD(j + 1)) {
+				u32 *tx_bd = (u32 *)&txdata->tx_desc_ring[j];
 
-			BNX2X_ERR("fp%d: tx_bd[%x]=[%x:%x:%x:%x]\n",
-				  i, j, tx_bd[0], tx_bd[1], tx_bd[2], tx_bd[3]);
+				BNX2X_ERR("fp%d: txdata %d, tx_bd[%x]="
+					  "[%x:%x:%x:%x]\n",
+					  i, cos, j, tx_bd[0], tx_bd[1],
+					  tx_bd[2], tx_bd[3]);
+			}
 		}
 	}
 #endif
@@ -1533,7 +1549,7 @@ static void bnx2x_igu_int_disable(struct bnx2x *bp)
 		BNX2X_ERR("BUG! proper val not read from IGU!\n");
 }
 
-static void bnx2x_int_disable(struct bnx2x *bp)
+void bnx2x_int_disable(struct bnx2x *bp)
 {
 	if (bp->common.int_block == INT_BLOCK_HC)
 		bnx2x_hc_int_disable(bp);
@@ -1663,6 +1679,11 @@ void bnx2x_sp_event(struct bnx2x_fastpath *fp, union eth_rx_cqe *rr_cqe)
 		drv_cmd = BNX2X_Q_CMD_SETUP;
 		break;
 
+	case (RAMROD_CMD_ID_ETH_TX_QUEUE_SETUP):
+		DP(NETIF_MSG_IFUP, "got MULTI[%d] tx-only setup ramrod\n", cid);
+		drv_cmd = BNX2X_Q_CMD_SETUP_TX_ONLY;
+		break;
+
 	case (RAMROD_CMD_ID_ETH_HALT):
 		DP(NETIF_MSG_IFDOWN, "got MULTI[%d] halt ramrod\n", cid);
 		drv_cmd = BNX2X_Q_CMD_HALT;
@@ -1722,6 +1743,7 @@ irqreturn_t bnx2x_interrupt(int irq, void *dev_instance)
 	u16 status = bnx2x_ack_int(bp);
 	u16 mask;
 	int i;
+	u8 cos;
 
 	/* Return here if interrupt is shared and it's not for us */
 	if (unlikely(status == 0)) {
@@ -1738,11 +1760,12 @@ irqreturn_t bnx2x_interrupt(int irq, void *dev_instance)
 	for_each_eth_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 
-		mask = 0x2 << (fp->index + CNIC_CONTEXT_USE);
+		mask = 0x2 << (fp->index + CNIC_PRESENT);
 		if (status & mask) {
 			/* Handle Rx or Tx according to SB id */
 			prefetch(fp->rx_cons_sb);
-			prefetch(fp->tx_cons_sb);
+			for_each_cos_in_tx_queue(fp, cos)
+				prefetch(fp->txdata[cos].tx_cons_sb);
 			prefetch(&fp->sb_running_index[SM_RX_ID]);
 			napi_schedule(&bnx2x_fp(bp, fp->index, napi));
 			status &= ~mask;
@@ -2632,15 +2655,43 @@ void bnx2x_func_init(struct bnx2x *bp, struct bnx2x_func_init_params *p)
 	}
 }
 
-static inline unsigned long bnx2x_get_q_flags(struct bnx2x *bp,
-					      struct bnx2x_fastpath *fp,
-					      bool leading)
+/**
+ * bnx2x_get_common_flags - Return common flags
+ *
+ * @bp		device handle
+ * @fp		queue handle
+ * @zero_stats	TRUE if statistics zeroing is needed
+ *
+ * Return the flags that are common for the Tx-only and normal connections.
+ */
+static inline unsigned long bnx2x_get_common_flags(struct bnx2x *bp,
+						   struct bnx2x_fastpath *fp,
+						   bool zero_stats)
 {
 	unsigned long flags = 0;
 
 	/* PF driver will always initialize the Queue to an ACTIVE state */
 	__set_bit(BNX2X_Q_FLG_ACTIVE, &flags);
 
+	/* tx only connections collect statistics (on the same index as the
+	 *  parent connection). The statistics are zeroed when the parent
+	 *  connection is initialized.
+	 */
+	if (stat_counter_valid(bp, fp)) {
+		__set_bit(BNX2X_Q_FLG_STATS, &flags);
+		if (zero_stats)
+			__set_bit(BNX2X_Q_FLG_ZERO_STATS, &flags);
+	}
+
+	return flags;
+}
+
+static inline unsigned long bnx2x_get_q_flags(struct bnx2x *bp,
+					      struct bnx2x_fastpath *fp,
+					      bool leading)
+{
+	unsigned long flags = 0;
+
 	/* calculate other queue flags */
 	if (IS_MF_SD(bp))
 		__set_bit(BNX2X_Q_FLG_OV, &flags);
@@ -2651,11 +2702,6 @@ static inline unsigned long bnx2x_get_q_flags(struct bnx2x *bp,
 	if (!fp->disable_tpa)
 		__set_bit(BNX2X_Q_FLG_TPA, &flags);
 
-	if (stat_counter_valid(bp, fp)) {
-		__set_bit(BNX2X_Q_FLG_STATS, &flags);
-		__set_bit(BNX2X_Q_FLG_ZERO_STATS, &flags);
-	}
-
 	if (leading) {
 		__set_bit(BNX2X_Q_FLG_LEADING_RSS, &flags);
 		__set_bit(BNX2X_Q_FLG_MCAST, &flags);
@@ -2664,11 +2710,13 @@ static inline unsigned long bnx2x_get_q_flags(struct bnx2x *bp,
 	/* Always set HW VLAN stripping */
 	__set_bit(BNX2X_Q_FLG_VLAN, &flags);
 
-	return flags;
+	return flags | bnx2x_get_common_flags(bp, fp, true);
 
 static void bnx2x_pf_q_prep_general(struct bnx2x *bp,
-	struct bnx2x_fastpath *fp, struct bnx2x_general_setup_params *gen_init)
+	struct bnx2x_fastpath *fp, struct bnx2x_general_setup_params *gen_init,
+	u8 cos)
 {
 	gen_init->stat_id = bnx2x_stats_id(fp);
 	gen_init->spcl_id = fp->cl_id;
@@ -2678,6 +2726,8 @@ static void bnx2x_pf_q_prep_general(struct bnx2x *bp,
 		gen_init->mtu = BNX2X_FCOE_MINI_JUMBO_MTU;
 	else
 		gen_init->mtu = bp->dev->mtu;
+
+	gen_init->cos = cos;
 }
 
 static void bnx2x_pf_rx_q_prep(struct bnx2x *bp,
@@ -2745,14 +2795,15 @@ static void bnx2x_pf_rx_q_prep(struct bnx2x *bp,
 	if (IS_FCOE_FP(fp))
 		rxq_init->sb_cq_index = HC_SP_INDEX_ETH_FCOE_RX_CQ_CONS;
 	else
-		rxq_init->sb_cq_index = U_SB_ETH_RX_CQ_INDEX;
+		rxq_init->sb_cq_index = HC_INDEX_ETH_RX_CQ_CONS;
 }
 
 static void bnx2x_pf_tx_q_prep(struct bnx2x *bp,
-	struct bnx2x_fastpath *fp, struct bnx2x_txq_setup_params *txq_init)
+	struct bnx2x_fastpath *fp, struct bnx2x_txq_setup_params *txq_init,
+	u8 cos)
 {
-	txq_init->dscr_map = fp->tx_desc_mapping;
-	txq_init->sb_cq_index = C_SB_ETH_TX_CQ_INDEX;
+	txq_init->dscr_map = fp->txdata[cos].tx_desc_mapping;
+	txq_init->sb_cq_index = HC_INDEX_ETH_FIRST_TX_CQ_CONS + cos;
 	txq_init->traffic_type = LLFC_TRAFFIC_TYPE_NW;
 	txq_init->fw_sb_id = fp->fw_sb_id;
 
@@ -2948,6 +2999,7 @@ static inline void bnx2x_sp_prod_update(struct bnx2x *bp)
 static inline bool bnx2x_is_contextless_ramrod(int cmd, int cmd_type)
 {
 	if ((cmd_type == NONE_CONNECTION_TYPE) ||
+	    (cmd == RAMROD_CMD_ID_ETH_FORWARD_SETUP) ||
 	    (cmd == RAMROD_CMD_ID_ETH_CLASSIFICATION_RULES) ||
 	    (cmd == RAMROD_CMD_ID_ETH_FILTER_RULES) ||
 	    (cmd == RAMROD_CMD_ID_ETH_MULTICAST_RULES) ||
@@ -4270,12 +4322,13 @@ static inline void bnx2x_handle_rx_mode_eqe(struct bnx2x *bp)
 static inline struct bnx2x_queue_sp_obj *bnx2x_cid_to_q_obj(
 	struct bnx2x *bp, u32 cid)
 {
+	DP(BNX2X_MSG_SP, "retrieving fp from cid %d", cid);
 #ifdef BCM_CNIC
 	if (cid == BNX2X_FCOE_ETH_CID)
 		return &bnx2x_fcoe(bp, q_obj);
 	else
 #endif
-		return &bnx2x_fp(bp, cid, q_obj);
+		return &bnx2x_fp(bp, CID_TO_FP(cid), q_obj);
 }
 
 static void bnx2x_eq_int(struct bnx2x *bp)
@@ -4522,6 +4575,7 @@ void bnx2x_drv_pulse(struct bnx2x *bp)
 
 static void bnx2x_timer(unsigned long data)
 {
+	u8 cos;
 	struct bnx2x *bp = (struct bnx2x *) data;
 
 	if (!netif_running(bp->dev))
@@ -4530,7 +4584,8 @@ static void bnx2x_timer(unsigned long data)
 	if (poll) {
 		struct bnx2x_fastpath *fp = &bp->fp[0];
 
-		bnx2x_tx_int(fp);
+		for_each_cos_in_tx_queue(fp, cos)
+			bnx2x_tx_int(bp, &fp->txdata[cos]);
 		bnx2x_rx_int(fp, 1000);
 	}
 
@@ -4735,10 +4790,17 @@ static void bnx2x_init_sb(struct bnx2x *bp, dma_addr_t mapping, int vfid,
 static void bnx2x_update_coalesce_sb(struct bnx2x *bp, u8 fw_sb_id,
 				     u16 tx_usec, u16 rx_usec)
 {
-	bnx2x_update_coalesce_sb_index(bp, fw_sb_id, U_SB_ETH_RX_CQ_INDEX,
+	bnx2x_update_coalesce_sb_index(bp, fw_sb_id, HC_INDEX_ETH_RX_CQ_CONS,
 				    false, rx_usec);
-	bnx2x_update_coalesce_sb_index(bp, fw_sb_id, C_SB_ETH_TX_CQ_INDEX,
-				    false, tx_usec);
+	bnx2x_update_coalesce_sb_index(bp, fw_sb_id,
+				       HC_INDEX_ETH_TX_CQ_CONS_COS0, false,
+				       tx_usec);
+	bnx2x_update_coalesce_sb_index(bp, fw_sb_id,
+				       HC_INDEX_ETH_TX_CQ_CONS_COS1, false,
+				       tx_usec);
+	bnx2x_update_coalesce_sb_index(bp, fw_sb_id,
+				       HC_INDEX_ETH_TX_CQ_CONS_COS2, false,
+				       tx_usec);
 }
 
 static void bnx2x_init_def_sb(struct bnx2x *bp)
@@ -5035,12 +5097,12 @@ static void bnx2x_init_internal(struct bnx2x *bp, u32 load_code)
 
 static inline u8 bnx2x_fp_igu_sb_id(struct bnx2x_fastpath *fp)
 {
-	return fp->bp->igu_base_sb + fp->index + CNIC_CONTEXT_USE;
+	return fp->bp->igu_base_sb + fp->index + CNIC_PRESENT;
 }
 
 static inline u8 bnx2x_fp_fw_sb_id(struct bnx2x_fastpath *fp)
 {
-	return fp->bp->base_fw_ndsb + fp->index + CNIC_CONTEXT_USE;
+	return fp->bp->base_fw_ndsb + fp->index + CNIC_PRESENT;
 }
 
 static inline u8 bnx2x_fp_cl_id(struct bnx2x_fastpath *fp)
@@ -5051,10 +5113,12 @@ static inline u8 bnx2x_fp_cl_id(struct bnx2x_fastpath *fp)
 		return bnx2x_fp_igu_sb_id(fp);
 }
 
-static void bnx2x_init_fp(struct bnx2x *bp, int fp_idx)
+static void bnx2x_init_eth_fp(struct bnx2x *bp, int fp_idx)
 {
 	struct bnx2x_fastpath *fp = &bp->fp[fp_idx];
+	u8 cos;
 	unsigned long q_type = 0;
+	u32 cids[BNX2X_MULTI_TX_COS] = { 0 };
 
 	fp->cid = fp_idx;
 	fp->cl_id = bnx2x_fp_cl_id(fp);
@@ -5067,14 +5131,25 @@ static void bnx2x_init_fp(struct bnx2x *bp, int fp_idx)
 	fp->ustorm_rx_prods_offset = bnx2x_rx_ustorm_prods_offset(fp);
 	/* Setup SB indicies */
 	fp->rx_cons_sb = BNX2X_RX_SB_INDEX;
-	fp->tx_cons_sb = BNX2X_TX_SB_INDEX;
 
 	/* Configure Queue State object */
 	__set_bit(BNX2X_Q_TYPE_HAS_RX, &q_type);
 	__set_bit(BNX2X_Q_TYPE_HAS_TX, &q_type);
-	bnx2x_init_queue_obj(bp, &fp->q_obj, fp->cl_id, fp->cid, BP_FUNC(bp),
-		bnx2x_sp(bp, q_rdata), bnx2x_sp_mapping(bp, q_rdata),
-			      q_type);
+
+	BUG_ON(fp->max_cos > BNX2X_MULTI_TX_COS);
+
+	/* init tx data */
+	for_each_cos_in_tx_queue(fp, cos) {
+		bnx2x_init_txdata(bp, &fp->txdata[cos],
+				  CID_COS_TO_TX_ONLY_CID(fp->cid, cos),
+				  FP_COS_TO_TXQ(fp, cos),
+				  BNX2X_TX_SB_INDEX_BASE + cos);
+		cids[cos] = fp->txdata[cos].cid;
+	}
+
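+	/* the queue object owns one cid per CoS: cids[0] is the primary
+	 * (Rx + Tx) connection and the rest are tx-only connections
+	 */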
+	bnx2x_init_queue_obj(bp, &fp->q_obj, fp->cl_id, cids, fp->max_cos,
+			     BP_FUNC(bp), bnx2x_sp(bp, q_rdata),
+			     bnx2x_sp_mapping(bp, q_rdata), q_type);
 
 	/**
 	 * Configure classification DBs: Always enable Tx switching
@@ -5096,7 +5171,7 @@ void bnx2x_nic_init(struct bnx2x *bp, u32 load_code)
 	int i;
 
 	for_each_eth_queue(bp, i)
-		bnx2x_init_fp(bp, i);
+		bnx2x_init_eth_fp(bp, i);
 #ifdef BCM_CNIC
 	if (!NO_FCOE(bp))
 		bnx2x_init_fcoe_fp(bp);
@@ -6718,7 +6793,7 @@ int bnx2x_alloc_mem(struct bnx2x *bp)
 	if (bnx2x_alloc_fw_stats_mem(bp))
 		goto alloc_mem_err;
 
-	bp->context.size = sizeof(union cdu_context) * bp->l2_cid_count;
+	bp->context.size = sizeof(union cdu_context) * BNX2X_L2_CID_COUNT(bp);
 
 	BNX2X_PCI_ALLOC(bp->context.vcxt, &bp->context.cxt_mapping,
 			bp->context.size);
@@ -6837,7 +6912,7 @@ static void __devinit bnx2x_set_int_mode(struct bnx2x *bp)
 		bnx2x_enable_msi(bp);
 		/* falling through... */
 	case INT_MODE_INTx:
-		bp->num_queues = 1 + NONE_ETH_CONTEXT_USE;
+		bp->num_queues = 1 + NON_ETH_CONTEXT_USE;
 		DP(NETIF_MSG_IFUP, "set number of queues to 1\n");
 		break;
 	default:
@@ -6859,8 +6934,8 @@ static void __devinit bnx2x_set_int_mode(struct bnx2x *bp)
 					  "enable MSI-X (%d), "
 					  "set number of queues to %d\n",
 				   bp->num_queues,
-				   1 + NONE_ETH_CONTEXT_USE);
-			bp->num_queues = 1 + NONE_ETH_CONTEXT_USE;
+				   1 + NON_ETH_CONTEXT_USE);
+			bp->num_queues = 1 + NON_ETH_CONTEXT_USE;
 
 			/* Try to enable MSI */
 			if (!(bp->flags & DISABLE_MSI_FLAG))
@@ -6988,6 +7063,8 @@ void bnx2x_ilt_set_info(struct bnx2x *bp)
 static inline void bnx2x_pf_q_prep_init(struct bnx2x *bp,
 	struct bnx2x_fastpath *fp, struct bnx2x_queue_init_params *init_params)
 {
+	u8 cos;
+
 	/* FCoE Queue uses Default SB, thus has no HC capabilities */
 	if (!IS_FCOE_FP(fp)) {
 		__set_bit(BNX2X_Q_FLG_HC, &init_params->rx.flags);
@@ -7013,13 +7090,56 @@ static inline void bnx2x_pf_q_prep_init(struct bnx2x *bp,
 		 * CQ index among the SB indices: FCoE clients uses the default
 		 * SB, therefore it's different.
 		 */
-		init_params->rx.sb_cq_index = U_SB_ETH_RX_CQ_INDEX;
-		init_params->tx.sb_cq_index = C_SB_ETH_TX_CQ_INDEX;
+		init_params->rx.sb_cq_index = HC_INDEX_ETH_RX_CQ_CONS;
+		init_params->tx.sb_cq_index = HC_INDEX_ETH_FIRST_TX_CQ_CONS;
 	}
 
-	init_params->cxt = &bp->context.vcxt[fp->cid].eth;
+	/* set maximum number of COSs supported by this queue */
+	init_params->max_cos = fp->max_cos;
+
+	DP(BNX2X_MSG_SP, "fp: %d setting queue params max cos to: %d",
+	    fp->index, init_params->max_cos);
+
+	/* set the context pointers queue object */
+	for (cos = FIRST_TX_COS_INDEX; cos < init_params->max_cos; cos++)
+		init_params->cxts[cos] =
+			&bp->context.vcxt[fp->txdata[cos].cid].eth;
 }
 
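+/**
+ * bnx2x_setup_tx_only - send a tx-only queue SETUP ramrod
+ *
+ * @bp:			device handle
+ * @fp:			queue handle
+ * @q_params:		queue state parameters; q_obj and ramrod_flags must
+ *			already be set by the caller
+ * @tx_only_params:	command parameters, filled in here
+ * @tx_index:		index of the tx-only cid within the queue object
+ * @leading:		unused in this flow
+ */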
+int bnx2x_setup_tx_only(struct bnx2x *bp, struct bnx2x_fastpath *fp,
+			struct bnx2x_queue_state_params *q_params,
+			struct bnx2x_queue_setup_tx_only_params *tx_only_params,
+			int tx_index, bool leading)
+{
+	memset(tx_only_params, 0, sizeof(*tx_only_params));
+
+	/* Set the command */
+	q_params->cmd = BNX2X_Q_CMD_SETUP_TX_ONLY;
+
+	/* Set tx-only QUEUE flags: don't zero statistics */
+	tx_only_params->flags = bnx2x_get_common_flags(bp, fp, false);
+
+	/* choose the index of the cid on which to send the slow path command */
+	tx_only_params->cid_index = tx_index;
+
+	/* Set general TX_ONLY_SETUP parameters */
+	bnx2x_pf_q_prep_general(bp, fp, &tx_only_params->gen_params, tx_index);
+
+	/* Set Tx TX_ONLY_SETUP parameters */
+	bnx2x_pf_tx_q_prep(bp, fp, &tx_only_params->txq_params, tx_index);
+
+	DP(BNX2X_MSG_SP, "preparing to send tx-only ramrod for connection:"
+			 "cos %d, primary cid %d, cid %d, "
+			 "client id %d, sp-client id %d, flags %lx",
+	   tx_index, q_params->q_obj->cids[FIRST_TX_COS_INDEX],
+	   q_params->q_obj->cids[tx_index], q_params->q_obj->cl_id,
+	   tx_only_params->gen_params.spcl_id, tx_only_params->flags);
+
+	/* send the ramrod */
+	return bnx2x_queue_state_change(bp, q_params);
+}
+
 /**
  * bnx2x_setup_queue - setup queue
  *
@@ -7037,7 +7157,12 @@ int bnx2x_setup_queue(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	struct bnx2x_queue_state_params q_params = {0};
 	struct bnx2x_queue_setup_params *setup_params =
 						&q_params.params.setup;
+	struct bnx2x_queue_setup_tx_only_params *tx_only_params =
+						&q_params.params.tx_only;
 	int rc;
+	u8 tx_index;
+
+	DP(BNX2X_MSG_SP, "setting up queue %d", fp->index);
 
 	/* reset IGU state skip FCoE L2 queue */
 	if (!IS_FCOE_FP(fp))
@@ -7057,10 +7182,13 @@ int bnx2x_setup_queue(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	/* Change the state to INIT */
 	rc = bnx2x_queue_state_change(bp, &q_params);
 	if (rc) {
-		BNX2X_ERR("Queue INIT failed\n");
+		BNX2X_ERR("Queue(%d) INIT failed\n", fp->index);
 		return rc;
 	}
 
+	DP(BNX2X_MSG_SP, "init complete");
+
+
 	/* Now move the Queue to the SETUP state... */
 	memset(setup_params, 0, sizeof(*setup_params));
 
@@ -7068,20 +7196,39 @@ int bnx2x_setup_queue(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 	setup_params->flags = bnx2x_get_q_flags(bp, fp, leading);
 
 	/* Set general SETUP parameters */
-	bnx2x_pf_q_prep_general(bp, fp, &setup_params->gen_params);
+	bnx2x_pf_q_prep_general(bp, fp, &setup_params->gen_params,
+				FIRST_TX_COS_INDEX);
 
-	bnx2x_pf_rx_q_prep(bp, fp, &setup_params->pause,
+	bnx2x_pf_rx_q_prep(bp, fp, &setup_params->pause_params,
 			    &setup_params->rxq_params);
 
-	bnx2x_pf_tx_q_prep(bp, fp, &setup_params->txq_params);
+	bnx2x_pf_tx_q_prep(bp, fp, &setup_params->txq_params,
+			   FIRST_TX_COS_INDEX);
 
 	/* Set the command */
 	q_params.cmd = BNX2X_Q_CMD_SETUP;
 
 	/* Change the state to SETUP */
 	rc = bnx2x_queue_state_change(bp, &q_params);
-	if (rc)
-		BNX2X_ERR("Queue SETUP failed\n");
+	if (rc) {
+		BNX2X_ERR("Queue(%d) SETUP failed\n", fp->index);
+		return rc;
+	}
+
+	/* loop through the relevant tx-only indices */
+	for (tx_index = FIRST_TX_ONLY_COS_INDEX;
+	     tx_index < fp->max_cos;
+	     tx_index++) {
+
+		/* prepare and send tx-only ramrod */
+		rc = bnx2x_setup_tx_only(bp, fp, &q_params,
+					  tx_only_params, tx_index, leading);
+		if (rc) {
+			BNX2X_ERR("Queue(%d.%d) TX_ONLY_SETUP failed\n",
+				  fp->index, tx_index);
+			return rc;
+		}
+	}
 
 	return rc;
 }
@@ -7089,27 +7236,67 @@ int bnx2x_setup_queue(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 static int bnx2x_stop_queue(struct bnx2x *bp, int index)
 {
 	struct bnx2x_fastpath *fp = &bp->fp[index];
+	struct bnx2x_fp_txdata *txdata;
 	struct bnx2x_queue_state_params q_params = {0};
-	int rc;
+	int rc, tx_index;
+
+	DP(BNX2X_MSG_SP, "stopping queue %d cid %d", index, fp->cid);
 
 	q_params.q_obj = &fp->q_obj;
 	/* We want to wait for completion in this context */
 	__set_bit(RAMROD_COMP_WAIT, &q_params.ramrod_flags);
 
-	/* halt the connection */
+	/* close tx-only connections */
+	for (tx_index = FIRST_TX_ONLY_COS_INDEX;
+	     tx_index < fp->max_cos;
+	     tx_index++) {
+
+		/* get the txdata of the tx-only connection */
+		txdata = &fp->txdata[tx_index];
+
+		DP(BNX2X_MSG_SP, "stopping tx-only queue %d\n",
+		   txdata->txq_index);
+
+		/* send halt terminate on tx-only connection */
+		q_params.cmd = BNX2X_Q_CMD_TERMINATE;
+		memset(&q_params.params.terminate, 0,
+		       sizeof(q_params.params.terminate));
+		q_params.params.terminate.cid_index = tx_index;
+
+		rc = bnx2x_queue_state_change(bp, &q_params);
+		if (rc)
+			return rc;
+
+		/* send cfc del on tx-only connection */
+		q_params.cmd = BNX2X_Q_CMD_CFC_DEL;
+		memset(&q_params.params.cfc_del, 0,
+		       sizeof(q_params.params.cfc_del));
+		q_params.params.cfc_del.cid_index = tx_index;
+		rc = bnx2x_queue_state_change(bp, &q_params);
+		if (rc)
+			return rc;
+	}
+	/* Stop the primary connection: */
+	/* ...halt the connection */
 	q_params.cmd = BNX2X_Q_CMD_HALT;
 	rc = bnx2x_queue_state_change(bp, &q_params);
 	if (rc)
 		return rc;
 
-	/* terminate the connection */
+	/* ...terminate the connection */
 	q_params.cmd = BNX2X_Q_CMD_TERMINATE;
+	memset(&q_params.params.terminate, 0,
+	       sizeof(q_params.params.terminate));
+	q_params.params.terminate.cid_index = FIRST_TX_COS_INDEX;
 	rc = bnx2x_queue_state_change(bp, &q_params);
 	if (rc)
 		return rc;
-
-	/* delete cfc entry */
+	/* ...delete cfc entry */
 	q_params.cmd = BNX2X_Q_CMD_CFC_DEL;
+	memset(&q_params.params.cfc_del, 0,
+	       sizeof(q_params.params.cfc_del));
+	q_params.params.cfc_del.cid_index = FIRST_TX_COS_INDEX;
 	return bnx2x_queue_state_change(bp, &q_params);
 }
 
@@ -7130,8 +7317,8 @@ static void bnx2x_reset_func(struct bnx2x *bp)
 	for_each_eth_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 		REG_WR8(bp, BAR_CSTRORM_INTMEM +
-			CSTORM_STATUS_BLOCK_DATA_STATE_OFFSET(fp->fw_sb_id),
-			SB_DISABLED);
+			   CSTORM_STATUS_BLOCK_DATA_STATE_OFFSET(fp->fw_sb_id),
+			   SB_DISABLED);
 	}
 
 #ifdef BCM_CNIC
@@ -7142,8 +7329,8 @@ static void bnx2x_reset_func(struct bnx2x *bp)
 #endif
 	/* SP SB */
 	REG_WR8(bp, BAR_CSTRORM_INTMEM +
-		CSTORM_SP_STATUS_BLOCK_DATA_STATE_OFFSET(func),
-		SB_DISABLED);
+		   CSTORM_SP_STATUS_BLOCK_DATA_STATE_OFFSET(func),
+		   SB_DISABLED);
 
 	for (i = 0; i < XSTORM_SPQ_DATA_SIZE / 4; i++)
 		REG_WR(bp, BAR_XSTRORM_INTMEM + XSTORM_SPQ_DATA_OFFSET(func),
@@ -7352,7 +7539,8 @@ void bnx2x_send_unload_done(struct bnx2x *bp)
 void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode)
 {
 	int port = BP_PORT(bp);
-	int i, rc;
+	int i, rc = 0;
+	u8 cos;
 	struct bnx2x_mcast_ramrod_params rparam = {0};
 	u32 reset_code;
 
@@ -7360,7 +7548,8 @@ void bnx2x_chip_cleanup(struct bnx2x *bp, int unload_mode)
 	for_each_tx_queue(bp, i) {
 		struct bnx2x_fastpath *fp = &bp->fp[i];
 
-		rc = bnx2x_clean_tx_queue(bp, fp);
+		for_each_cos_in_tx_queue(fp, cos)
+			rc = bnx2x_clean_tx_queue(bp, &fp->txdata[cos]);
 #ifdef BNX2X_STOP_ON_ERROR
 		if (rc)
 			return;
@@ -7888,7 +8077,7 @@ static inline void bnx2x_recovery_failed(struct bnx2x *bp)
 
 /*
  * Assumption: runs under rtnl lock. This together with the fact
- * that it's called only from bnx2x_reset_task() ensure that it
+ * that it's called only from bnx2x_sp_rtnl() ensures that it
  * will never be called when netif_running(bp->dev) is false.
  */
 static void bnx2x_parity_recover(struct bnx2x *bp)
@@ -8045,6 +8234,9 @@ static void bnx2x_sp_rtnl_task(struct work_struct *work)
 	if (!netif_running(bp->dev))
 		goto sp_rtnl_exit;
 
+	if (test_and_clear_bit(BNX2X_SP_RTNL_SETUP_TC, &bp->sp_rtnl_state))
+		bnx2x_setup_tc(bp->dev, bp->dcbx_port_params.ets.num_of_cos);
+
 	/* if stop on error is defined no recovery flows should be executed */
 #ifdef BNX2X_STOP_ON_ERROR
 	BNX2X_ERR("recovery flow called but STOP_ON_ERROR defined "
@@ -8387,14 +8579,11 @@ static void __devinit bnx2x_get_igu_cam_info(struct bnx2x *bp)
 	int vn = BP_E1HVN(bp);
 	int igu_sb_id;
 	u32 val;
-	u8 fid;
+	u8 fid, igu_sb_cnt = 0;
 
 	bp->igu_base_sb = 0xff;
-	bp->igu_sb_cnt = 0;
 	if (CHIP_INT_MODE_IS_BC(bp)) {
-		bp->igu_sb_cnt = min_t(u8, FP_SB_MAX_E1x,
-				       NUM_IGU_SB_REQUIRED(bp->l2_cid_count));
-
+		igu_sb_cnt = bp->igu_sb_cnt;
 		bp->igu_base_sb = (CHIP_MODE_IS_4_PORT(bp) ? pfid : vn) *
 			FP_SB_MAX_E1x;
 
@@ -8420,19 +8609,21 @@ static void __devinit bnx2x_get_igu_cam_info(struct bnx2x *bp)
 			else {
 				if (bp->igu_base_sb == 0xff)
 					bp->igu_base_sb = igu_sb_id;
-				bp->igu_sb_cnt++;
+				igu_sb_cnt++;
 			}
 		}
 	}
 
-	/* It's expected that number of CAM entries for this
-	 * functions is equal to the MSI-X table size (which was a
-	 * used during bp->l2_cid_count value calculation.
-	 * We want a harsh warning if these values are different!
+#ifdef CONFIG_PCI_MSI
+	/*
+	 * It's expected that the number of CAM entries for this function is
+	 * equal to the number evaluated based on the MSI-X table size. We
+	 * want a harsh warning if these values are different!
 	 */
-	WARN_ON(bp->igu_sb_cnt != NUM_IGU_SB_REQUIRED(bp->l2_cid_count));
+	WARN_ON(bp->igu_sb_cnt != igu_sb_cnt);
+#endif
 
-	if (bp->igu_sb_cnt == 0)
+	if (igu_sb_cnt == 0)
 		BNX2X_ERR("CAM configuration error\n");
 }
 
@@ -8961,13 +9152,14 @@ static int __devinit bnx2x_get_hwinfo(struct bnx2x *bp)
 
 	bnx2x_get_common_hwinfo(bp);
 
+	/*
+	 * initialize IGU parameters
+	 */
 	if (CHIP_IS_E1x(bp)) {
 		bp->common.int_block = INT_BLOCK_HC;
 
 		bp->igu_dsb_id = DEF_SB_IGU_ID;
 		bp->igu_base_sb = 0;
-		bp->igu_sb_cnt = min_t(u8, FP_SB_MAX_E1x,
-				       NUM_IGU_SB_REQUIRED(bp->l2_cid_count));
 	} else {
 		bp->common.int_block = INT_BLOCK_IGU;
 		val = REG_RD(bp, IGU_REG_BLOCK_CONFIGURATION);
@@ -9260,10 +9452,8 @@ static void __devinit bnx2x_set_modes_bitmap(struct bnx2x *bp)
 		SET_FLAGS(flags, MODE_E3);
 		if (CHIP_REV(bp) == CHIP_REV_Ax)
 			SET_FLAGS(flags, MODE_E3_A0);
-		else {/*if (CHIP_REV(bp) == CHIP_REV_Bx)*/
-			SET_FLAGS(flags, MODE_E3_B0);
-			SET_FLAGS(flags, MODE_COS_BC);
-		}
+		else /*if (CHIP_REV(bp) == CHIP_REV_Bx)*/
+			SET_FLAGS(flags, MODE_E3_B0 | MODE_COS3);
 	}
 
 	if (IS_MF(bp)) {
@@ -9371,6 +9561,14 @@ static int __devinit bnx2x_init_bp(struct bnx2x *bp)
 		bp->cnic_base_cl_id = FP_SB_MAX_E2;
 #endif
 
+	/* multiple tx priority */
+	if (CHIP_IS_E1x(bp))
+		bp->max_cos = BNX2X_MULTI_TX_COS_E1X;
+	if (CHIP_IS_E2(bp) || CHIP_IS_E3A0(bp))
+		bp->max_cos = BNX2X_MULTI_TX_COS_E2_E3A0;
+	if (CHIP_IS_E3B0(bp))
+		bp->max_cos = BNX2X_MULTI_TX_COS_E3B0;
+
 	return rc;
 }
 
@@ -9696,6 +9894,8 @@ static const struct net_device_ops bnx2x_netdev_ops = {
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= poll_bnx2x,
 #endif
+	.ndo_setup_tc		= bnx2x_setup_tc,
 };
 
 static inline int bnx2x_set_coherency_mask(struct bnx2x *bp)
@@ -9797,16 +9997,6 @@ static int __devinit bnx2x_init_dev(struct pci_dev *pdev,
 		goto err_out_release;
 	}
 
-	bp->doorbells = ioremap_nocache(pci_resource_start(pdev, 2),
-					min_t(u64, BNX2X_DB_SIZE(bp),
-					      pci_resource_len(pdev, 2)));
-	if (!bp->doorbells) {
-		dev_err(&bp->pdev->dev,
-			"Cannot map doorbell space, aborting\n");
-		rc = -ENOMEM;
-		goto err_out_unmap;
-	}
-
 	bnx2x_set_power_state(bp, PCI_D0);
 
 	/* clean indirect addresses */
@@ -9859,16 +10049,6 @@ static int __devinit bnx2x_init_dev(struct pci_dev *pdev,
 
 	return 0;
 
-err_out_unmap:
-	if (bp->regview) {
-		iounmap(bp->regview);
-		bp->regview = NULL;
-	}
-	if (bp->doorbells) {
-		iounmap(bp->doorbells);
-		bp->doorbells = NULL;
-	}
-
 err_out_release:
 	if (atomic_read(&pdev->enable_cnt) == 1)
 		pci_release_regions(pdev);
@@ -10143,9 +10323,9 @@ void bnx2x__init_func_obj(struct bnx2x *bp)
 }
 
 /* must be called after sriov-enable */
-static inline int bnx2x_set_qm_cid_count(struct bnx2x *bp, int l2_cid_count)
+static inline int bnx2x_set_qm_cid_count(struct bnx2x *bp)
 {
-	int cid_count = L2_FP_COUNT(l2_cid_count);
+	int cid_count = BNX2X_L2_CID_COUNT(bp);
 
 #ifdef BCM_CNIC
 	cid_count += CNIC_CID_MAX;
@@ -10154,22 +10334,33 @@ static inline int bnx2x_set_qm_cid_count(struct bnx2x *bp, int l2_cid_count)
 }
 
 /**
- * bnx2x_pci_msix_table_size - get the size of the MSI-X table.
+ * bnx2x_get_num_non_def_sbs - return the number of non-default SBs
  *
  * @dev:	pci device
  *
  */
-static inline int bnx2x_pci_msix_table_size(struct pci_dev *pdev)
+static inline int bnx2x_get_num_non_def_sbs(struct pci_dev *pdev)
 {
 	int pos;
 	u16 control;
 
 	pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
+
+	/*
+	 * If MSI-X is not supported - return number of SBs needed to support
+	 * one fast path queue: one FP queue + SB for CNIC
+	 */
 	if (!pos)
-		return 0;
+		return 1 + CNIC_PRESENT;
 
+	/*
+	 * The value in the PCI configuration space is the index of the last
+	 * entry, namely one less than the actual size of the table, which is
+	 * exactly what we want to return from this function: number of all SBs
+	 * without the default SB.
+	 */
 	pci_read_config_word(pdev, pos  + PCI_MSI_FLAGS, &control);
-	return (control & PCI_MSIX_FLAGS_QSIZE) + 1;
+	return control & PCI_MSIX_FLAGS_QSIZE;
 }
 
 static int __devinit bnx2x_init_one(struct pci_dev *pdev,
@@ -10178,34 +10369,38 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
 	struct net_device *dev = NULL;
 	struct bnx2x *bp;
 	int pcie_width, pcie_speed;
-	int rc, cid_count;
+	int rc, max_non_def_sbs;
+	int rx_count, tx_count, rss_count;
+	/*
+	 * An estimated maximum supported CoS number according to the chip
+	 * version.
+	 * We will try to roughly estimate the maximum number of CoSes this
+	 * chip may support in order to minimize the memory allocated for Tx
+	 * netdev_queue's. The accurate value is later calculated into
+	 * bp->max_cos, based on both the chip version and the chip revision,
+	 * in bnx2x_init_bp().
+	 */
+	u8 max_cos_est = 0;
 
 	switch (ent->driver_data) {
 	case BCM57710:
 	case BCM57711:
 	case BCM57711E:
+		max_cos_est = BNX2X_MULTI_TX_COS_E1X;
+		break;
+
 	case BCM57712:
 	case BCM57712_MF:
+		max_cos_est = BNX2X_MULTI_TX_COS_E2_E3A0;
+		break;
+
 	case BCM57800:
 	case BCM57800_MF:
 	case BCM57810:
 	case BCM57810_MF:
 	case BCM57840:
 	case BCM57840_MF:
-		/* The size requested for the MSI-X table corresponds to the
-		 * actual amount of avaliable IGU/HC status blocks. It includes
-		 * the default SB vector but we want cid_count to contain the
-		 * amount of only non-default SBs, that's what '-1' stands for.
-		 */
-		cid_count = bnx2x_pci_msix_table_size(pdev) - 1;
-
-		/* do not allow initial cid_count grow above 16
-		 * since Special CIDs starts from this number
-		 * use old FP_SB_MAX_E1x define for this matter
-		 */
-		cid_count = min_t(int, FP_SB_MAX_E1x, cid_count);
-
-		WARN_ON(!cid_count);
+		max_cos_est = BNX2X_MULTI_TX_COS_E3B0;
 		break;
 
 	default:
@@ -10214,26 +10409,44 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
 		return -ENODEV;
 	}
 
-	cid_count += FCOE_CONTEXT_USE;
+	max_non_def_sbs = bnx2x_get_num_non_def_sbs(pdev);
+
+	/* !!! FIXME !!!
+	 * Do not allow the maximum SB count to grow above 16
+	 * since Special CIDs start from 16*BNX2X_MULTI_TX_COS=48.
+	 * We will use the FP_SB_MAX_E1x macro for this matter.
+	 */
+	max_non_def_sbs = min_t(int, FP_SB_MAX_E1x, max_non_def_sbs);
+
+	WARN_ON(!max_non_def_sbs);
+
+	/* Maximum number of RSS queues: one IGU SB goes to CNIC */
+	rss_count = max_non_def_sbs - CNIC_PRESENT;
+
+	/* Maximum number of netdev Rx queues: RSS + FCoE L2 */
+	rx_count = rss_count + FCOE_PRESENT;
+
+	/*
+	 * Maximum number of netdev Tx queues:
+	 *      Maximum TSS queues * Maximum supported number of CoS + FCoE L2
+	 */
+	tx_count = MAX_TXQS_PER_COS * max_cos_est + FCOE_PRESENT;
 
 	/* dev zeroed in init_etherdev */
-	dev = alloc_etherdev_mq(sizeof(*bp), cid_count);
+	dev = alloc_etherdev_mqs(sizeof(*bp), tx_count, rx_count);
 	if (!dev) {
 		dev_err(&pdev->dev, "Cannot allocate net device\n");
 		return -ENOMEM;
 	}
 
-	/* We don't need a Tx queue for a CNIC and an OOO Rx-only ring,
-	 * so update a cid_count after a netdev allocation.
-	 */
-	cid_count += CNIC_CONTEXT_USE;
-
 	bp = netdev_priv(dev);
-	bp->msg_enable = debug;
 
-	pci_set_drvdata(pdev, dev);
+	DP(NETIF_MSG_DRV, "Allocated netdev with %d tx and %d rx queues\n",
+			  tx_count, rx_count);
 
-	bp->l2_cid_count = cid_count;
+	bp->igu_sb_cnt = max_non_def_sbs;
+	bp->msg_enable = debug;
+	pci_set_drvdata(pdev, dev);
 
 	rc = bnx2x_init_dev(pdev, dev, ent->driver_data);
 	if (rc < 0) {
@@ -10241,14 +10454,28 @@ static int __devinit bnx2x_init_one(struct pci_dev *pdev,
 		return rc;
 	}
 
-	BNX2X_DEV_INFO("cid_count=%d\n", cid_count);
+	DP(NETIF_MSG_DRV, "max_non_def_sbs %d", max_non_def_sbs);
 
 	rc = bnx2x_init_bp(bp);
 	if (rc)
 		goto init_one_exit;
 
+	/*
+	 * Map doorbells here as we need the real value of bp->max_cos which
+	 * is initialized in bnx2x_init_bp().
+	 */
+	bp->doorbells = ioremap_nocache(pci_resource_start(pdev, 2),
+					min_t(u64, BNX2X_DB_SIZE(bp),
+					      pci_resource_len(pdev, 2)));
+	if (!bp->doorbells) {
+		dev_err(&bp->pdev->dev,
+			"Cannot map doorbell space, aborting\n");
+		rc = -ENOMEM;
+		goto init_one_exit;
+	}
+
 	/* calc qm_cid_count */
-	bp->qm_cid_count = bnx2x_set_qm_cid_count(bp, cid_count);
+	bp->qm_cid_count = bnx2x_set_qm_cid_count(bp);
 
 #ifdef BCM_CNIC
 	/* disable FCOE L2 queue for E1x*/
diff --git a/drivers/net/bnx2x/bnx2x_sp.c b/drivers/net/bnx2x/bnx2x_sp.c
index f6322a15ed4da1dadd9eafa9b577dd3a82cbfd86..358c339975362641e75b8557c86ebb0b65a814e1 100644
--- a/drivers/net/bnx2x/bnx2x_sp.c
+++ b/drivers/net/bnx2x/bnx2x_sp.c
@@ -4195,15 +4195,29 @@ static int bnx2x_queue_comp_cmd(struct bnx2x *bp,
 
 	if (!test_and_clear_bit(cmd, &cur_pending)) {
 		BNX2X_ERR("Bad MC reply %d for queue %d in state %d "
-			  "pending 0x%lx, next_state %d\n", cmd, o->cid,
+			  "pending 0x%lx, next_state %d\n", cmd,
+			  o->cids[BNX2X_PRIMARY_CID_INDEX],
 			  o->state, cur_pending, o->next_state);
 		return -EINVAL;
 	}
 
+	if (o->next_tx_only >= o->max_cos)
+		/* >= because tx_only must always be smaller than max_cos
+		 * since the primary connection supports CoS 0
+		 */
+		BNX2X_ERR("illegal value for next tx_only: %d. max cos was %d\n",
+			  o->next_tx_only, o->max_cos);
+
 	DP(BNX2X_MSG_SP, "Completing command %d for queue %d, "
-			 "setting state to %d\n", cmd, o->cid, o->next_state);
+			 "setting state to %d\n", cmd,
+			 o->cids[BNX2X_PRIMARY_CID_INDEX], o->next_state);
+
+	if (o->next_tx_only)  /* print num tx-only if any exist */
+		DP(BNX2X_MSG_SP, "primary cid %d: num tx-only cons %d",
+			   o->cids[BNX2X_PRIMARY_CID_INDEX], o->next_tx_only);
 
 	o->state = o->next_state;
+	o->num_tx_only = o->next_tx_only;
 	o->next_state = BNX2X_Q_STATE_MAX;
 
 	/* It's important that o->state and o->next_state are
@@ -4230,135 +4244,193 @@ static void bnx2x_q_fill_setup_data_e2(struct bnx2x *bp,
 				CLIENT_INIT_RX_DATA_TPA_EN_IPV6;
 }
 
-static void bnx2x_q_fill_setup_data_cmn(struct bnx2x *bp,
-				struct bnx2x_queue_state_params *cmd_params,
-				struct client_init_ramrod_data *data)
-{
-	struct bnx2x_queue_sp_obj *o = cmd_params->q_obj;
-	struct bnx2x_queue_setup_params *params = &cmd_params->params.setup;
-
-
-	/* general */
-	data->general.client_id = o->cl_id;
-
-	if (test_bit(BNX2X_Q_FLG_STATS, &params->flags)) {
-		data->general.statistics_counter_id =
-					params->gen_params.stat_id;
-		data->general.statistics_en_flg = 1;
-		data->general.statistics_zero_flg =
-			test_bit(BNX2X_Q_FLG_ZERO_STATS, &params->flags);
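+/* fill the general part of the init ramrod data from the setup parameters */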
+static void bnx2x_q_fill_init_general_data(struct bnx2x *bp,
+				struct bnx2x_queue_sp_obj *o,
+				struct bnx2x_general_setup_params *params,
+				struct client_init_general_data *gen_data,
+				unsigned long *flags)
+{
+	gen_data->client_id = o->cl_id;
+
+	if (test_bit(BNX2X_Q_FLG_STATS, flags)) {
+		gen_data->statistics_counter_id =
+					params->stat_id;
+		gen_data->statistics_en_flg = 1;
+		gen_data->statistics_zero_flg =
+			test_bit(BNX2X_Q_FLG_ZERO_STATS, flags);
 	} else
-		data->general.statistics_counter_id =
+		gen_data->statistics_counter_id =
 					DISABLE_STATISTIC_COUNTER_ID_VALUE;
 
-	data->general.is_fcoe_flg = test_bit(BNX2X_Q_FLG_FCOE, &params->flags);
-	data->general.activate_flg = test_bit(BNX2X_Q_FLG_ACTIVE,
-					      &params->flags);
-	data->general.sp_client_id = params->gen_params.spcl_id;
-	data->general.mtu = cpu_to_le16(params->gen_params.mtu);
-	data->general.func_id = o->func_id;
+	gen_data->is_fcoe_flg = test_bit(BNX2X_Q_FLG_FCOE, flags);
+	gen_data->activate_flg = test_bit(BNX2X_Q_FLG_ACTIVE, flags);
+	gen_data->sp_client_id = params->spcl_id;
+	gen_data->mtu = cpu_to_le16(params->mtu);
+	gen_data->func_id = o->func_id;
 
 
-	data->general.cos = params->txq_params.cos;
+	gen_data->cos = params->cos;
 
-	data->general.traffic_type =
-		test_bit(BNX2X_Q_FLG_FCOE, &params->flags) ?
+	gen_data->traffic_type =
+		test_bit(BNX2X_Q_FLG_FCOE, flags) ?
 		LLFC_TRAFFIC_TYPE_FCOE : LLFC_TRAFFIC_TYPE_NW;
 
-	/* Rx data */
-	data->rx.tpa_en = test_bit(BNX2X_Q_FLG_TPA, &params->flags) *
+	DP(BNX2X_MSG_SP, "flags: active %d, cos %d, stats en %d",
+	   gen_data->activate_flg, gen_data->cos, gen_data->statistics_en_flg);
+}
+
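+/* fill the Tx part of the init ramrod data */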
+static void bnx2x_q_fill_init_tx_data(struct bnx2x_queue_sp_obj *o,
+				struct bnx2x_txq_setup_params *params,
+				struct client_init_tx_data *tx_data,
+				unsigned long *flags)
+{
+	tx_data->enforce_security_flg =
+		test_bit(BNX2X_Q_FLG_TX_SEC, flags);
+	tx_data->default_vlan =
+		cpu_to_le16(params->default_vlan);
+	tx_data->default_vlan_flg =
+		test_bit(BNX2X_Q_FLG_DEF_VLAN, flags);
+	tx_data->tx_switching_flg =
+		test_bit(BNX2X_Q_FLG_TX_SWITCH, flags);
+	tx_data->anti_spoofing_flg =
+		test_bit(BNX2X_Q_FLG_ANTI_SPOOF, flags);
+	tx_data->tx_status_block_id = params->fw_sb_id;
+	tx_data->tx_sb_index_number = params->sb_cq_index;
+	tx_data->tss_leading_client_id = params->tss_leading_cl_id;
+
+	tx_data->tx_bd_page_base.lo =
+		cpu_to_le32(U64_LO(params->dscr_map));
+	tx_data->tx_bd_page_base.hi =
+		cpu_to_le32(U64_HI(params->dscr_map));
+
+	/* Don't configure any Tx switching mode during queue SETUP */
+	tx_data->state = 0;
+}
+
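+/* fill the Rx flow control thresholds of the init ramrod data */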
+static void bnx2x_q_fill_init_pause_data(struct bnx2x_queue_sp_obj *o,
+				struct rxq_pause_params *params,
+				struct client_init_rx_data *rx_data)
+{
+	/* flow control data */
+	rx_data->cqe_pause_thr_low = cpu_to_le16(params->rcq_th_lo);
+	rx_data->cqe_pause_thr_high = cpu_to_le16(params->rcq_th_hi);
+	rx_data->bd_pause_thr_low = cpu_to_le16(params->bd_th_lo);
+	rx_data->bd_pause_thr_high = cpu_to_le16(params->bd_th_hi);
+	rx_data->sge_pause_thr_low = cpu_to_le16(params->sge_th_lo);
+	rx_data->sge_pause_thr_high = cpu_to_le16(params->sge_th_hi);
+	rx_data->rx_cos_mask = cpu_to_le16(params->pri_map);
+}
+
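+/* fill the Rx part of the init ramrod data */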
+static void bnx2x_q_fill_init_rx_data(struct bnx2x_queue_sp_obj *o,
+				struct bnx2x_rxq_setup_params *params,
+				struct client_init_rx_data *rx_data,
+				unsigned long *flags)
+{
+	/* Rx data */
+	rx_data->tpa_en = test_bit(BNX2X_Q_FLG_TPA, flags) *
 				CLIENT_INIT_RX_DATA_TPA_EN_IPV4;
-	data->rx.vmqueue_mode_en_flg = 0;
+	rx_data->vmqueue_mode_en_flg = 0;
 
-	data->rx.cache_line_alignment_log_size =
-		params->rxq_params.cache_line_log;
-	data->rx.enable_dynamic_hc =
-		test_bit(BNX2X_Q_FLG_DHC, &params->flags);
-	data->rx.max_sges_for_packet = params->rxq_params.max_sges_pkt;
-	data->rx.client_qzone_id = params->rxq_params.cl_qzone_id;
-	data->rx.max_agg_size = cpu_to_le16(params->rxq_params.tpa_agg_sz);
+	rx_data->cache_line_alignment_log_size =
+		params->cache_line_log;
+	rx_data->enable_dynamic_hc =
+		test_bit(BNX2X_Q_FLG_DHC, flags);
+	rx_data->max_sges_for_packet = params->max_sges_pkt;
+	rx_data->client_qzone_id = params->cl_qzone_id;
+	rx_data->max_agg_size = cpu_to_le16(params->tpa_agg_sz);
 
 	/* Always start in DROP_ALL mode */
-	data->rx.state = cpu_to_le16(CLIENT_INIT_RX_DATA_UCAST_DROP_ALL |
+	rx_data->state = cpu_to_le16(CLIENT_INIT_RX_DATA_UCAST_DROP_ALL |
 				     CLIENT_INIT_RX_DATA_MCAST_DROP_ALL);
 
 	/* We don't set drop flags */
-	data->rx.drop_ip_cs_err_flg = 0;
-	data->rx.drop_tcp_cs_err_flg = 0;
-	data->rx.drop_ttl0_flg = 0;
-	data->rx.drop_udp_cs_err_flg = 0;
-	data->rx.inner_vlan_removal_enable_flg =
-		test_bit(BNX2X_Q_FLG_VLAN, &params->flags);
-	data->rx.outer_vlan_removal_enable_flg =
-		test_bit(BNX2X_Q_FLG_OV, &params->flags);
-	data->rx.status_block_id = params->rxq_params.fw_sb_id;
-	data->rx.rx_sb_index_number = params->rxq_params.sb_cq_index;
-	data->rx.max_tpa_queues = params->rxq_params.max_tpa_queues;
-	data->rx.max_bytes_on_bd = cpu_to_le16(params->rxq_params.buf_sz);
-	data->rx.sge_buff_size = cpu_to_le16(params->rxq_params.sge_buf_sz);
-	data->rx.bd_page_base.lo =
-		cpu_to_le32(U64_LO(params->rxq_params.dscr_map));
-	data->rx.bd_page_base.hi =
-		cpu_to_le32(U64_HI(params->rxq_params.dscr_map));
-	data->rx.sge_page_base.lo =
-		cpu_to_le32(U64_LO(params->rxq_params.sge_map));
-	data->rx.sge_page_base.hi =
-		cpu_to_le32(U64_HI(params->rxq_params.sge_map));
-	data->rx.cqe_page_base.lo =
-		cpu_to_le32(U64_LO(params->rxq_params.rcq_map));
-	data->rx.cqe_page_base.hi =
-		cpu_to_le32(U64_HI(params->rxq_params.rcq_map));
-	data->rx.is_leading_rss = test_bit(BNX2X_Q_FLG_LEADING_RSS,
-					   &params->flags);
-
-	if (test_bit(BNX2X_Q_FLG_MCAST, &params->flags)) {
-		data->rx.approx_mcast_engine_id = o->func_id;
-		data->rx.is_approx_mcast = 1;
+	rx_data->drop_ip_cs_err_flg = 0;
+	rx_data->drop_tcp_cs_err_flg = 0;
+	rx_data->drop_ttl0_flg = 0;
+	rx_data->drop_udp_cs_err_flg = 0;
+	rx_data->inner_vlan_removal_enable_flg =
+		test_bit(BNX2X_Q_FLG_VLAN, flags);
+	rx_data->outer_vlan_removal_enable_flg =
+		test_bit(BNX2X_Q_FLG_OV, flags);
+	rx_data->status_block_id = params->fw_sb_id;
+	rx_data->rx_sb_index_number = params->sb_cq_index;
+	rx_data->max_tpa_queues = params->max_tpa_queues;
+	rx_data->max_bytes_on_bd = cpu_to_le16(params->buf_sz);
+	rx_data->sge_buff_size = cpu_to_le16(params->sge_buf_sz);
+	rx_data->bd_page_base.lo =
+		cpu_to_le32(U64_LO(params->dscr_map));
+	rx_data->bd_page_base.hi =
+		cpu_to_le32(U64_HI(params->dscr_map));
+	rx_data->sge_page_base.lo =
+		cpu_to_le32(U64_LO(params->sge_map));
+	rx_data->sge_page_base.hi =
+		cpu_to_le32(U64_HI(params->sge_map));
+	rx_data->cqe_page_base.lo =
+		cpu_to_le32(U64_LO(params->rcq_map));
+	rx_data->cqe_page_base.hi =
+		cpu_to_le32(U64_HI(params->rcq_map));
+	rx_data->is_leading_rss = test_bit(BNX2X_Q_FLG_LEADING_RSS, flags);
+
+	if (test_bit(BNX2X_Q_FLG_MCAST, flags)) {
+		rx_data->approx_mcast_engine_id = o->func_id;
+		rx_data->is_approx_mcast = 1;
 	}
 
-	data->rx.rss_engine_id = params->rxq_params.rss_engine_id;
-
-	/* flow control data */
-	data->rx.cqe_pause_thr_low = cpu_to_le16(params->pause.rcq_th_lo);
-	data->rx.cqe_pause_thr_high = cpu_to_le16(params->pause.rcq_th_hi);
-	data->rx.bd_pause_thr_low = cpu_to_le16(params->pause.bd_th_lo);
-	data->rx.bd_pause_thr_high = cpu_to_le16(params->pause.bd_th_hi);
-	data->rx.sge_pause_thr_low = cpu_to_le16(params->pause.sge_th_lo);
-	data->rx.sge_pause_thr_high = cpu_to_le16(params->pause.sge_th_hi);
-	data->rx.rx_cos_mask = cpu_to_le16(params->pause.pri_map);
+	rx_data->rss_engine_id = params->rss_engine_id;
 
 	/* silent vlan removal */
-	data->rx.silent_vlan_removal_flg =
-		test_bit(BNX2X_Q_FLG_SILENT_VLAN_REM, &params->flags);
-	data->rx.silent_vlan_value =
-		cpu_to_le16(params->rxq_params.silent_removal_value);
-	data->rx.silent_vlan_mask =
-		cpu_to_le16(params->rxq_params.silent_removal_mask);
-
-	/* Tx data */
-	data->tx.enforce_security_flg =
-		test_bit(BNX2X_Q_FLG_TX_SEC, &params->flags);
-	data->tx.default_vlan =
-		cpu_to_le16(params->txq_params.default_vlan);
-	data->tx.default_vlan_flg =
-		test_bit(BNX2X_Q_FLG_DEF_VLAN, &params->flags);
-	data->tx.tx_switching_flg =
-		test_bit(BNX2X_Q_FLG_TX_SWITCH, &params->flags);
-	data->tx.anti_spoofing_flg =
-		test_bit(BNX2X_Q_FLG_ANTI_SPOOF, &params->flags);
-	data->tx.tx_status_block_id = params->txq_params.fw_sb_id;
-	data->tx.tx_sb_index_number = params->txq_params.sb_cq_index;
-	data->tx.tss_leading_client_id = params->txq_params.tss_leading_cl_id;
-
-	data->tx.tx_bd_page_base.lo =
-		cpu_to_le32(U64_LO(params->txq_params.dscr_map));
-	data->tx.tx_bd_page_base.hi =
-		cpu_to_le32(U64_HI(params->txq_params.dscr_map));
+	rx_data->silent_vlan_removal_flg =
+		test_bit(BNX2X_Q_FLG_SILENT_VLAN_REM, flags);
+	rx_data->silent_vlan_value =
+		cpu_to_le16(params->silent_removal_value);
+	rx_data->silent_vlan_mask =
+		cpu_to_le16(params->silent_removal_mask);
 
-	/* Don't configure any Tx switching mode during queue SETUP */
-	data->tx.state = 0;
 }
 
+/* initialize the general, tx and rx parts of the client init ramrod data */
+static void bnx2x_q_fill_setup_data_cmn(struct bnx2x *bp,
+				struct bnx2x_queue_state_params *cmd_params,
+				struct client_init_ramrod_data *data)
+{
+	bnx2x_q_fill_init_general_data(bp, cmd_params->q_obj,
+				       &cmd_params->params.setup.gen_params,
+				       &data->general,
+				       &cmd_params->params.setup.flags);
+
+	bnx2x_q_fill_init_tx_data(cmd_params->q_obj,
+				  &cmd_params->params.setup.txq_params,
+				  &data->tx,
+				  &cmd_params->params.setup.flags);
+
+	bnx2x_q_fill_init_rx_data(cmd_params->q_obj,
+				  &cmd_params->params.setup.rxq_params,
+				  &data->rx,
+				  &cmd_params->params.setup.flags);
+
+	bnx2x_q_fill_init_pause_data(cmd_params->q_obj,
+				     &cmd_params->params.setup.pause_params,
+				     &data->rx);
+}
+
+/* initialize the general and tx parts of the tx-only init ramrod data */
+static void bnx2x_q_fill_setup_tx_only(struct bnx2x *bp,
+				struct bnx2x_queue_state_params *cmd_params,
+				struct tx_queue_init_ramrod_data *data)
+{
+	bnx2x_q_fill_init_general_data(bp, cmd_params->q_obj,
+				       &cmd_params->params.tx_only.gen_params,
+				       &data->general,
+				       &cmd_params->params.tx_only.flags);
+
+	bnx2x_q_fill_init_tx_data(cmd_params->q_obj,
+				  &cmd_params->params.tx_only.txq_params,
+				  &data->tx,
+				  &cmd_params->params.tx_only.flags);
+
+	DP(BNX2X_MSG_SP, "cid %d, tx bd page lo %x hi %x",cmd_params->q_obj->cids[0],
+	   data->tx.tx_bd_page_base.lo, data->tx.tx_bd_page_base.hi);
+}
 
 /**
  * bnx2x_q_init - init HW/FW queue
@@ -4377,6 +4449,7 @@ static inline int bnx2x_q_init(struct bnx2x *bp,
 	struct bnx2x_queue_sp_obj *o = params->q_obj;
 	struct bnx2x_queue_init_params *init = &params->params.init;
 	u16 hc_usec;
+	u8 cos;
 
 	/* Tx HC configuration */
 	if (test_bit(BNX2X_Q_TYPE_HAS_TX, &o->type) &&
@@ -4401,7 +4474,12 @@ static inline int bnx2x_q_init(struct bnx2x *bp,
 	}
 
 	/* Set CDU context validation values */
-	bnx2x_set_ctx_validation(bp, init->cxt, o->cid);
+	for (cos = 0; cos < o->max_cos; cos++) {
+		DP(BNX2X_MSG_SP, "setting context validation. cid %d, cos %d",
+				 o->cids[cos], cos);
+		DP(BNX2X_MSG_SP, "context pointer %p", init->cxts[cos]);
+		bnx2x_set_ctx_validation(bp, init->cxts[cos], o->cids[cos]);
+	}
 
 	/* As no ramrod is sent, complete the command immediately  */
 	o->complete_cmd(bp, o, BNX2X_Q_CMD_INIT);
@@ -4429,7 +4507,8 @@ static inline int bnx2x_q_send_setup_e1x(struct bnx2x *bp,
 
 	mb();
 
-	return bnx2x_sp_post(bp, ramrod, o->cid, U64_HI(data_mapping),
+	return bnx2x_sp_post(bp, ramrod, o->cids[BNX2X_PRIMARY_CID_INDEX],
+			     U64_HI(data_mapping),
 			     U64_LO(data_mapping), ETH_CONNECTION_TYPE);
 }
 
@@ -4449,9 +4528,57 @@ static inline int bnx2x_q_send_setup_e2(struct bnx2x *bp,
 	bnx2x_q_fill_setup_data_cmn(bp, params, rdata);
 	bnx2x_q_fill_setup_data_e2(bp, params, rdata);
 
-	mb();
 
-	return bnx2x_sp_post(bp, ramrod, o->cid, U64_HI(data_mapping),
+	return bnx2x_sp_post(bp, ramrod, o->cids[BNX2X_PRIMARY_CID_INDEX],
+			     U64_HI(data_mapping),
+			     U64_LO(data_mapping), ETH_CONNECTION_TYPE);
+}
+
+static inline int bnx2x_q_send_setup_tx_only(struct bnx2x *bp,
+				  struct bnx2x_queue_state_params *params)
+{
+	struct bnx2x_queue_sp_obj *o = params->q_obj;
+	struct tx_queue_init_ramrod_data *rdata =
+		(struct tx_queue_init_ramrod_data *)o->rdata;
+	dma_addr_t data_mapping = o->rdata_mapping;
+	int ramrod = RAMROD_CMD_ID_ETH_TX_QUEUE_SETUP;
+	struct bnx2x_queue_setup_tx_only_params *tx_only_params =
+		&params->params.tx_only;
+	u8 cid_index = tx_only_params->cid_index;
+
+	if (cid_index >= o->max_cos) {
+		BNX2X_ERR("queue[%d]: cid_index (%d) is out of range\n",
+			  o->cl_id, cid_index);
+		return -EINVAL;
+	}
+
+	DP(BNX2X_MSG_SP, "parameters received: cos: %d sp-id: %d",
+			 tx_only_params->gen_params.cos,
+			 tx_only_params->gen_params.spcl_id);
+
+	/* Clear the ramrod data */
+	memset(rdata, 0, sizeof(*rdata));
+
+	/* Fill the ramrod data */
+	bnx2x_q_fill_setup_tx_only(bp, params, rdata);
+
+	DP(BNX2X_MSG_SP, "sending tx-only ramrod: cid %d, client-id %d,"
+			 "sp-client id %d, cos %d",
+			 o->cids[cid_index],
+			 rdata->general.client_id,
+			 rdata->general.sp_client_id, rdata->general.cos);
+
+	/*
+	 * No need for an explicit memory barrier here as long as we
+	 * ensure the ordering of writing to the SPQ element and updating
+	 * of the SPQ producer, which involves a memory read; we put the
+	 * full memory barrier there (inside bnx2x_sp_post()).
+	 */
+
+	return bnx2x_sp_post(bp, ramrod, o->cids[cid_index],
+			     U64_HI(data_mapping),
 			     U64_LO(data_mapping), ETH_CONNECTION_TYPE);
 }
 
@@ -4521,17 +4648,27 @@ static inline int bnx2x_q_send_update(struct bnx2x *bp,
 	struct client_update_ramrod_data *rdata =
 		(struct client_update_ramrod_data *)o->rdata;
 	dma_addr_t data_mapping = o->rdata_mapping;
+	struct bnx2x_queue_update_params *update_params =
+		&params->params.update;
+	u8 cid_index = update_params->cid_index;
+
+	if (cid_index >= o->max_cos) {
+		BNX2X_ERR("queue[%d]: cid_index (%d) is out of range\n",
+			  o->cl_id, cid_index);
+		return -EINVAL;
+	}
 
 	/* Clear the ramrod data */
 	memset(rdata, 0, sizeof(*rdata));
 
 	/* Fill the ramrod data */
-	bnx2x_q_fill_update_data(bp, o, &params->params.update, rdata);
+	bnx2x_q_fill_update_data(bp, o, update_params, rdata);
 
 	mb();
 
-	return bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_CLIENT_UPDATE, o->cid,
-			     U64_HI(data_mapping),
+	return bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_CLIENT_UPDATE,
+			     o->cids[cid_index], U64_HI(data_mapping),
 			     U64_LO(data_mapping), ETH_CONNECTION_TYPE);
 }
 
@@ -4588,7 +4725,8 @@ static inline int bnx2x_q_send_halt(struct bnx2x *bp,
 {
 	struct bnx2x_queue_sp_obj *o = params->q_obj;
 
-	return bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_HALT, o->cid, 0, o->cl_id,
+	return bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_HALT,
+			     o->cids[BNX2X_PRIMARY_CID_INDEX], 0, o->cl_id,
 			     ETH_CONNECTION_TYPE);
 }
 
@@ -4596,18 +4734,32 @@ static inline int bnx2x_q_send_cfc_del(struct bnx2x *bp,
 				       struct bnx2x_queue_state_params *params)
 {
 	struct bnx2x_queue_sp_obj *o = params->q_obj;
+	u8 cid_idx = params->params.cfc_del.cid_index;
 
-	return bnx2x_sp_post(bp, RAMROD_CMD_ID_COMMON_CFC_DEL, o->cid, 0, 0,
-			     NONE_CONNECTION_TYPE);
+	if (cid_idx >= o->max_cos) {
+		BNX2X_ERR("queue[%d]: cid_index (%d) is out of range\n",
+			  o->cl_id, cid_idx);
+		return -EINVAL;
+	}
+
+	return bnx2x_sp_post(bp, RAMROD_CMD_ID_COMMON_CFC_DEL,
+			     o->cids[cid_idx], 0, 0, NONE_CONNECTION_TYPE);
 }
 
 static inline int bnx2x_q_send_terminate(struct bnx2x *bp,
 					struct bnx2x_queue_state_params *params)
 {
 	struct bnx2x_queue_sp_obj *o = params->q_obj;
+	u8 cid_index = params->params.terminate.cid_index;
 
-	return bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_TERMINATE, o->cid, 0, 0,
-			     ETH_CONNECTION_TYPE);
+	if (cid_index >= o->max_cos) {
+		BNX2X_ERR("queue[%d]: cid_index (%d) is out of range\n",
+			  o->cl_id, cid_index);
+		return -EINVAL;
+	}
+
+	return bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_TERMINATE,
+			     o->cids[cid_index], 0, 0, ETH_CONNECTION_TYPE);
 }
 
 static inline int bnx2x_q_send_empty(struct bnx2x *bp,
@@ -4615,7 +4767,8 @@ static inline int bnx2x_q_send_empty(struct bnx2x *bp,
 {
 	struct bnx2x_queue_sp_obj *o = params->q_obj;
 
-	return bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_EMPTY, o->cid, 0, 0,
+	return bnx2x_sp_post(bp, RAMROD_CMD_ID_ETH_EMPTY,
+			     o->cids[BNX2X_PRIMARY_CID_INDEX], 0, 0,
 			     ETH_CONNECTION_TYPE);
 }
 
@@ -4625,6 +4778,8 @@ static inline int bnx2x_queue_send_cmd_cmn(struct bnx2x *bp,
 	switch (params->cmd) {
 	case BNX2X_Q_CMD_INIT:
 		return bnx2x_q_init(bp, params);
+	case BNX2X_Q_CMD_SETUP_TX_ONLY:
+		return bnx2x_q_send_setup_tx_only(bp, params);
 	case BNX2X_Q_CMD_DEACTIVATE:
 		return bnx2x_q_send_deactivate(bp, params);
 	case BNX2X_Q_CMD_ACTIVATE:
@@ -4654,6 +4809,7 @@ static int bnx2x_queue_send_cmd_e1x(struct bnx2x *bp,
 	case BNX2X_Q_CMD_SETUP:
 		return bnx2x_q_send_setup_e1x(bp, params);
 	case BNX2X_Q_CMD_INIT:
+	case BNX2X_Q_CMD_SETUP_TX_ONLY:
 	case BNX2X_Q_CMD_DEACTIVATE:
 	case BNX2X_Q_CMD_ACTIVATE:
 	case BNX2X_Q_CMD_UPDATE:
@@ -4676,6 +4832,7 @@ static int bnx2x_queue_send_cmd_e2(struct bnx2x *bp,
 	case BNX2X_Q_CMD_SETUP:
 		return bnx2x_q_send_setup_e2(bp, params);
 	case BNX2X_Q_CMD_INIT:
+	case BNX2X_Q_CMD_SETUP_TX_ONLY:
 	case BNX2X_Q_CMD_DEACTIVATE:
 	case BNX2X_Q_CMD_ACTIVATE:
 	case BNX2X_Q_CMD_UPDATE:
@@ -4713,6 +4870,9 @@ static int bnx2x_queue_chk_transition(struct bnx2x *bp,
 {
 	enum bnx2x_q_state state = o->state, next_state = BNX2X_Q_STATE_MAX;
 	enum bnx2x_queue_cmd cmd = params->cmd;
+	struct bnx2x_queue_update_params *update_params =
+		 &params->params.update;
+	u8 next_tx_only = o->num_tx_only;
 
 	switch (state) {
 	case BNX2X_Q_STATE_RESET:
@@ -4738,13 +4898,42 @@ static int bnx2x_queue_chk_transition(struct bnx2x *bp,
 			 (cmd == BNX2X_Q_CMD_UPDATE_TPA))
 			next_state = BNX2X_Q_STATE_ACTIVE;
 
+		else if (cmd == BNX2X_Q_CMD_SETUP_TX_ONLY) {
+			next_state = BNX2X_Q_STATE_MULTI_COS;
+			next_tx_only = 1;
+		}
+
 		else if (cmd == BNX2X_Q_CMD_HALT)
 			next_state = BNX2X_Q_STATE_STOPPED;
 
 		else if (cmd == BNX2X_Q_CMD_UPDATE) {
-			struct bnx2x_queue_update_params *update_params =
-				&params->params.update;
+			/* If "active" state change is requested, update the
+			 * state accordingly.
+			 */
+			if (test_bit(BNX2X_Q_UPDATE_ACTIVATE_CHNG,
+				     &update_params->update_flags) &&
+			    !test_bit(BNX2X_Q_UPDATE_ACTIVATE,
+				      &update_params->update_flags))
+				next_state = BNX2X_Q_STATE_INACTIVE;
+			else
+				next_state = BNX2X_Q_STATE_ACTIVE;
+		}
 
+		break;
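+	/*
+	 * Once a tx-only connection has been set up, the queue stays in
+	 * MULTI_COS until every tx-only cid has been cfc-deleted.
+	 */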
+	case BNX2X_Q_STATE_MULTI_COS:
+		if (cmd == BNX2X_Q_CMD_TERMINATE)
+			next_state = BNX2X_Q_STATE_MCOS_TERMINATED;
+
+		else if (cmd == BNX2X_Q_CMD_SETUP_TX_ONLY) {
+			next_state = BNX2X_Q_STATE_MULTI_COS;
+			next_tx_only = o->num_tx_only + 1;
+		}
+
+		else if ((cmd == BNX2X_Q_CMD_EMPTY) ||
+			 (cmd == BNX2X_Q_CMD_UPDATE_TPA))
+			next_state = BNX2X_Q_STATE_MULTI_COS;
+
+		else if (cmd == BNX2X_Q_CMD_UPDATE) {
 			/* If "active" state change is requested, update the
 			 *  state accordingly.
 			 */
@@ -4754,7 +4943,17 @@ static int bnx2x_queue_chk_transition(struct bnx2x *bp,
 				      &update_params->update_flags))
 				next_state = BNX2X_Q_STATE_INACTIVE;
 			else
+				next_state = BNX2X_Q_STATE_MULTI_COS;
+		}
+
+		break;
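+	/*
+	 * A CFC_DEL completion drops the tx-only count; the queue goes
+	 * back to ACTIVE once the last tx-only cid is deleted.
+	 */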
+	case BNX2X_Q_STATE_MCOS_TERMINATED:
+		if (cmd == BNX2X_Q_CMD_CFC_DEL) {
+			next_tx_only = o->num_tx_only - 1;
+			if (next_tx_only == 0)
 				next_state = BNX2X_Q_STATE_ACTIVE;
+			else
+				next_state = BNX2X_Q_STATE_MULTI_COS;
 		}
 
 		break;
@@ -4770,18 +4969,18 @@ static int bnx2x_queue_chk_transition(struct bnx2x *bp,
 			next_state = BNX2X_Q_STATE_STOPPED;
 
 		else if (cmd == BNX2X_Q_CMD_UPDATE) {
-			struct bnx2x_queue_update_params *update_params =
-				&params->params.update;
-
 			/* If "active" state change is requested, update the
 			 * state accordingly.
 			 */
 			if (test_bit(BNX2X_Q_UPDATE_ACTIVATE_CHNG,
 				     &update_params->update_flags) &&
 			    test_bit(BNX2X_Q_UPDATE_ACTIVATE,
-				     &update_params->update_flags))
-				next_state = BNX2X_Q_STATE_ACTIVE;
-			else
+				     &update_params->update_flags)) {
+				if (o->num_tx_only == 0)
+					next_state = BNX2X_Q_STATE_ACTIVE;
+				else /* tx only queues exist for this queue */
+					next_state = BNX2X_Q_STATE_MULTI_COS;
+			} else
 				next_state = BNX2X_Q_STATE_INACTIVE;
 		}
 
@@ -4805,6 +5004,7 @@ static int bnx2x_queue_chk_transition(struct bnx2x *bp,
 		DP(BNX2X_MSG_SP, "Good state transition: %d(%d)->%d\n",
 				 state, cmd, next_state);
 		o->next_state = next_state;
+		o->next_tx_only = next_tx_only;
 		return 0;
 	}
 
@@ -4815,12 +5015,17 @@ static int bnx2x_queue_chk_transition(struct bnx2x *bp,
 
 void bnx2x_init_queue_obj(struct bnx2x *bp,
 			  struct bnx2x_queue_sp_obj *obj,
-			  u8 cl_id, u32 cid, u8 func_id, void *rdata,
+			  u8 cl_id, u32 *cids, u8 cid_cnt, u8 func_id,
+			  void *rdata,
 			  dma_addr_t rdata_mapping, unsigned long type)
 {
 	memset(obj, 0, sizeof(*obj));
 
-	obj->cid = cid;
+	/* We support at most BNX2X_MULTI_TX_COS Tx CoSes at the moment */
+	BUG_ON(BNX2X_MULTI_TX_COS < cid_cnt);
+
+	memcpy(obj->cids, cids, sizeof(obj->cids[0]) * cid_cnt);
+	obj->max_cos = cid_cnt;
 	obj->cl_id = cl_id;
 	obj->func_id = func_id;
 	obj->rdata = rdata;
@@ -4840,6 +5045,13 @@ void bnx2x_init_queue_obj(struct bnx2x *bp,
 	obj->set_pending = bnx2x_queue_set_pending;
 }
 
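+/* override the cid of a single CoS entry in an initialized queue object */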
+void bnx2x_queue_set_cos_cid(struct bnx2x *bp,
+			     struct bnx2x_queue_sp_obj *obj,
+			     u32 cid, u8 index)
+{
+	obj->cids[index] = cid;
+}
+
 /********************** Function state object *********************************/
 
 static int bnx2x_func_wait_comp(struct bnx2x *bp,
diff --git a/drivers/net/bnx2x/bnx2x_sp.h b/drivers/net/bnx2x/bnx2x_sp.h
index 86eaa80721ea0556dc73a7c85517743914c147e4..83f3b0b4421162cef76da66fa0c785a3ba3949c5 100644
--- a/drivers/net/bnx2x/bnx2x_sp.h
+++ b/drivers/net/bnx2x/bnx2x_sp.h
@@ -721,6 +721,8 @@ enum bnx2x_q_state {
 	BNX2X_Q_STATE_RESET,
 	BNX2X_Q_STATE_INITIALIZED,
 	BNX2X_Q_STATE_ACTIVE,
+	BNX2X_Q_STATE_MULTI_COS,
+	BNX2X_Q_STATE_MCOS_TERMINATED,
 	BNX2X_Q_STATE_INACTIVE,
 	BNX2X_Q_STATE_STOPPED,
 	BNX2X_Q_STATE_TERMINATED,
@@ -732,6 +734,7 @@ enum bnx2x_q_state {
 enum bnx2x_queue_cmd {
 	BNX2X_Q_CMD_INIT,
 	BNX2X_Q_CMD_SETUP,
+	BNX2X_Q_CMD_SETUP_TX_ONLY,
 	BNX2X_Q_CMD_DEACTIVATE,
 	BNX2X_Q_CMD_ACTIVATE,
 	BNX2X_Q_CMD_UPDATE,
@@ -774,6 +777,13 @@ enum bnx2x_q_type {
 	BNX2X_Q_TYPE_HAS_TX,
 };
 
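+/*
+ * Number of Tx CoSes (and therefore per-queue cids) supported by each
+ * chip family: one for E1x, two for E2/E3 A0 and three for E3 B0.
+ */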
+#define BNX2X_PRIMARY_CID_INDEX			0
+#define BNX2X_MULTI_TX_COS_E1X			1
+#define BNX2X_MULTI_TX_COS_E2_E3A0		2
+#define BNX2X_MULTI_TX_COS_E3B0			3
+#define BNX2X_MULTI_TX_COS			BNX2X_MULTI_TX_COS_E3B0
+
 struct bnx2x_queue_init_params {
 	struct {
 		unsigned long	flags;
@@ -790,7 +800,20 @@ struct bnx2x_queue_init_params {
 	} rx;
 
 	/* CID context in the host memory */
-	struct eth_context *cxt;
+	struct eth_context *cxts[BNX2X_MULTI_TX_COS];
+
+	/* maximum number of cos supported by hardware */
+	u8 max_cos;
+};
+
+struct bnx2x_queue_terminate_params {
+	/* index within the tx_only cids of this queue object */
+	u8 cid_index;
+};
+
+struct bnx2x_queue_cfc_del_params {
+	/* index within the tx_only cids of this queue object */
+	u8 cid_index;
 };
 
 struct bnx2x_queue_update_params {
@@ -798,6 +821,8 @@ struct bnx2x_queue_update_params {
 	u16		def_vlan;
 	u16		silent_removal_value;
 	u16		silent_removal_mask;
+	/* index within the tx_only cids of this queue object */
+	u8		cid_index;
 };
 
 struct rxq_pause_params {
@@ -817,6 +842,7 @@ struct bnx2x_general_setup_params {
 
 	u8		spcl_id;
 	u16		mtu;
+	u8		cos;
 };
 
 struct bnx2x_rxq_setup_params {
@@ -863,13 +889,20 @@ struct bnx2x_txq_setup_params {
 };
 
 struct bnx2x_queue_setup_params {
-	struct rxq_pause_params pause;
 	struct bnx2x_general_setup_params gen_params;
-	struct bnx2x_rxq_setup_params rxq_params;
 	struct bnx2x_txq_setup_params txq_params;
+	struct bnx2x_rxq_setup_params rxq_params;
+	struct rxq_pause_params pause_params;
 	unsigned long flags;
 };
 
+struct bnx2x_queue_setup_tx_only_params {
+	struct bnx2x_general_setup_params	gen_params;
+	struct bnx2x_txq_setup_params		txq_params;
+	unsigned long				flags;
+	/* index within the tx_only cids of this queue object */
+	u8					cid_index;
+};
 
 struct bnx2x_queue_state_params {
 	struct bnx2x_queue_sp_obj *q_obj;
@@ -878,21 +911,36 @@ struct bnx2x_queue_state_params {
 	enum bnx2x_queue_cmd cmd;
 
 	/* may have RAMROD_COMP_WAIT set only */
-	unsigned long	ramrod_flags;
+	unsigned long ramrod_flags;
 
 	/* Params according to the current command */
 	union {
-		struct bnx2x_queue_update_params update;
-		struct bnx2x_queue_setup_params   setup;
-		struct bnx2x_queue_init_params	  init;
+		struct bnx2x_queue_update_params	update;
+		struct bnx2x_queue_setup_params		setup;
+		struct bnx2x_queue_init_params		init;
+		struct bnx2x_queue_setup_tx_only_params	tx_only;
+		struct bnx2x_queue_terminate_params	terminate;
+		struct bnx2x_queue_cfc_del_params	cfc_del;
 	} params;
 };
 
 struct bnx2x_queue_sp_obj {
-	u32		cid;
+	u32		cids[BNX2X_MULTI_TX_COS];
 	u8		cl_id;
 	u8		func_id;
 
+	/*
+	 * number of traffic classes supported by this queue.
+	 * The primary connection of the queue supports the first traffic
+	 * class. Any further traffic class is supported by a tx-only
+	 * connection.
+	 *
+	 * Therefore max_cos is also the number of valid entries in the
+	 * cids array.
+	 */
+	u8 max_cos;
+	u8 num_tx_only, next_tx_only;
+
 	enum bnx2x_q_state state, next_state;
 
 	/* bits from enum bnx2x_q_type */
@@ -1106,9 +1154,9 @@ int bnx2x_func_state_change(struct bnx2x *bp,
 
 /******************* Queue State **************/
 void bnx2x_init_queue_obj(struct bnx2x *bp,
-			  struct bnx2x_queue_sp_obj *obj, u8 cl_id, u32 cid,
-			  u8 func_id, void *rdata, dma_addr_t rdata_mapping,
-			  unsigned long type);
+			  struct bnx2x_queue_sp_obj *obj, u8 cl_id, u32 *cids,
+			  u8 cid_cnt, u8 func_id, void *rdata,
+			  dma_addr_t rdata_mapping, unsigned long type);
 
 int bnx2x_queue_state_change(struct bnx2x *bp,
 			     struct bnx2x_queue_state_params *params);
diff --git a/drivers/net/bnx2x/bnx2x_stats.c b/drivers/net/bnx2x/bnx2x_stats.c
index 54c07f557ad4cb4cd6c0132c718c3318618e4568..771f6803b23856f7ffe928a4d23545d6ac4be896 100644
--- a/drivers/net/bnx2x/bnx2x_stats.c
+++ b/drivers/net/bnx2x/bnx2x_stats.c
@@ -1185,7 +1185,7 @@ static void bnx2x_stats_update(struct bnx2x *bp)
 
 	if (netif_msg_timer(bp)) {
 		struct bnx2x_eth_stats *estats = &bp->eth_stats;
-		int i;
+		int i, cos;
 
 		netdev_dbg(bp->dev, "brb drops %u  brb truncate %u\n",
 		       estats->brb_drop_lo, estats->brb_truncate_lo);
@@ -1206,20 +1206,32 @@ static void bnx2x_stats_update(struct bnx2x *bp)
 
 		for_each_eth_queue(bp, i) {
 			struct bnx2x_fastpath *fp = &bp->fp[i];
+			struct bnx2x_fp_txdata *txdata;
 			struct bnx2x_eth_q_stats *qstats = &fp->eth_q_stats;
-			struct netdev_queue *txq =
-				netdev_get_tx_queue(bp->dev, i);
-
-			printk(KERN_DEBUG "%s: tx avail(%4u)  *tx_cons_sb(%u)"
-					  "  tx pkt(%lu) tx calls (%lu)"
-					  "  %s (Xoff events %u)\n",
-			       fp->name, bnx2x_tx_avail(fp),
-			       le16_to_cpu(*fp->tx_cons_sb),
-			       bnx2x_hilo(&qstats->
-					  total_unicast_packets_transmitted_hi),
-			       fp->tx_pkt,
-			       (netif_tx_queue_stopped(txq) ? "Xoff" : "Xon"),
-			       qstats->driver_xoff);
+			struct netdev_queue *txq;
+
+			printk(KERN_DEBUG "%s: tx pkt(%lu) (Xoff events %u)",
+				fp->name, bnx2x_hilo(
+				&qstats->total_unicast_packets_transmitted_hi),
+				qstats->driver_xoff);
+
+			for_each_cos_in_tx_queue(fp, cos) {
+				txdata = &fp->txdata[cos];
+				txq = netdev_get_tx_queue(bp->dev,
+						FP_COS_TO_TXQ(fp, cos));
+
+				printk(KERN_DEBUG "%d: tx avail(%4u)"
+				       "  *tx_cons_sb(%u)"
+				       "  tx calls (%lu)"
+				       "  %s\n",
+				       cos,
+				       bnx2x_tx_avail(bp, txdata),
+				       le16_to_cpu(*txdata->tx_cons_sb),
+				       txdata->tx_pkt,
+				       (netif_tx_queue_stopped(txq) ?
+					"Xoff" : "Xon"));
+			}
 		}
 	}