myri10ge: improve parity error detection and recovery

Improve myri10ge parity error detection and recovery:
1) Don't restore PCI config space to a rebooted NIC until AFTER the
   host is quiescent.
2) Let myri10ge_close() know the NIC is dead, so it won't waste time
   waiting for a dead NIC to respond to MXGEFW_CMD_ETHERNET_DOWN.
3) When the NIC is quiet (link down, or otherwise idle link) use
   a PCI config space read to detect a rebooted NIC.  Otherwise
   we might never notice that a NIC rebooted.

Signed-off-by: Andrew Gallatin <gallatin@myri.com>
Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Brice Goglin 2009-08-07 10:44:22 +00:00 committed by David S. Miller
parent c9145a2df0
commit d02342151c

View file

@ -75,7 +75,7 @@
#include "myri10ge_mcp.h" #include "myri10ge_mcp.h"
#include "myri10ge_mcp_gen_header.h" #include "myri10ge_mcp_gen_header.h"
#define MYRI10GE_VERSION_STR "1.5.0-1.418" #define MYRI10GE_VERSION_STR "1.5.0-1.432"
MODULE_DESCRIPTION("Myricom 10G driver (10GbE)"); MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
MODULE_AUTHOR("Maintainer: help@myri.com"); MODULE_AUTHOR("Maintainer: help@myri.com");
@ -188,6 +188,7 @@ struct myri10ge_slice_state {
dma_addr_t fw_stats_bus; dma_addr_t fw_stats_bus;
int watchdog_tx_done; int watchdog_tx_done;
int watchdog_tx_req; int watchdog_tx_req;
int watchdog_rx_done;
#ifdef CONFIG_MYRI10GE_DCA #ifdef CONFIG_MYRI10GE_DCA
int cached_dca_tag; int cached_dca_tag;
int cpu; int cpu;
@ -256,6 +257,7 @@ struct myri10ge_priv {
u32 link_changes; u32 link_changes;
u32 msg_enable; u32 msg_enable;
unsigned int board_number; unsigned int board_number;
int rebooted;
}; };
static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat"; static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat";
@ -2552,17 +2554,22 @@ static int myri10ge_close(struct net_device *dev)
netif_carrier_off(dev); netif_carrier_off(dev);
netif_tx_stop_all_queues(dev); netif_tx_stop_all_queues(dev);
old_down_cnt = mgp->down_cnt; if (mgp->rebooted == 0) {
mb(); old_down_cnt = mgp->down_cnt;
status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0); mb();
if (status) status =
printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n", myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
dev->name); if (status)
printk(KERN_ERR
wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ); "myri10ge: %s: Couldn't bring down link\n",
if (old_down_cnt == mgp->down_cnt) dev->name);
printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name);
wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt,
HZ);
if (old_down_cnt == mgp->down_cnt)
printk(KERN_ERR "myri10ge: %s never got down irq\n",
dev->name);
}
netif_tx_disable(dev); netif_tx_disable(dev);
myri10ge_free_irq(mgp); myri10ge_free_irq(mgp);
for (i = 0; i < mgp->num_slices; i++) for (i = 0; i < mgp->num_slices; i++)
@ -3427,12 +3434,13 @@ static void myri10ge_watchdog(struct work_struct *work)
container_of(work, struct myri10ge_priv, watchdog_work); container_of(work, struct myri10ge_priv, watchdog_work);
struct myri10ge_tx_buf *tx; struct myri10ge_tx_buf *tx;
u32 reboot; u32 reboot;
int status; int status, rebooted;
int i; int i;
u16 cmd, vendor; u16 cmd, vendor;
mgp->watchdog_resets++; mgp->watchdog_resets++;
pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd); pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
rebooted = 0;
if ((cmd & PCI_COMMAND_MASTER) == 0) { if ((cmd & PCI_COMMAND_MASTER) == 0) {
/* Bus master DMA disabled? Check to see /* Bus master DMA disabled? Check to see
* if the card rebooted due to a parity error * if the card rebooted due to a parity error
@ -3444,9 +3452,12 @@ static void myri10ge_watchdog(struct work_struct *work)
myri10ge_reset_recover ? " " : " not"); myri10ge_reset_recover ? " " : " not");
if (myri10ge_reset_recover == 0) if (myri10ge_reset_recover == 0)
return; return;
rtnl_lock();
mgp->rebooted = 1;
rebooted = 1;
myri10ge_close(mgp->dev);
myri10ge_reset_recover--; myri10ge_reset_recover--;
mgp->rebooted = 0;
/* /*
* A rebooted nic will come back with config space as * A rebooted nic will come back with config space as
* it was after power was applied to PCIe bus. * it was after power was applied to PCIe bus.
@ -3494,8 +3505,10 @@ static void myri10ge_watchdog(struct work_struct *work)
} }
} }
rtnl_lock(); if (!rebooted) {
myri10ge_close(mgp->dev); rtnl_lock();
myri10ge_close(mgp->dev);
}
status = myri10ge_load_firmware(mgp, 1); status = myri10ge_load_firmware(mgp, 1);
if (status != 0) if (status != 0)
printk(KERN_ERR "myri10ge: %s: failed to load firmware\n", printk(KERN_ERR "myri10ge: %s: failed to load firmware\n",
@ -3516,12 +3529,14 @@ static void myri10ge_watchdog_timer(unsigned long arg)
{ {
struct myri10ge_priv *mgp; struct myri10ge_priv *mgp;
struct myri10ge_slice_state *ss; struct myri10ge_slice_state *ss;
int i, reset_needed; int i, reset_needed, busy_slice_cnt;
u32 rx_pause_cnt; u32 rx_pause_cnt;
u16 cmd;
mgp = (struct myri10ge_priv *)arg; mgp = (struct myri10ge_priv *)arg;
rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause); rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
busy_slice_cnt = 0;
for (i = 0, reset_needed = 0; for (i = 0, reset_needed = 0;
i < mgp->num_slices && reset_needed == 0; ++i) { i < mgp->num_slices && reset_needed == 0; ++i) {
@ -3559,8 +3574,22 @@ static void myri10ge_watchdog_timer(unsigned long arg)
reset_needed = 1; reset_needed = 1;
} }
} }
if (ss->watchdog_tx_done != ss->tx.done ||
ss->watchdog_rx_done != ss->rx_done.cnt) {
busy_slice_cnt++;
}
ss->watchdog_tx_done = ss->tx.done; ss->watchdog_tx_done = ss->tx.done;
ss->watchdog_tx_req = ss->tx.req; ss->watchdog_tx_req = ss->tx.req;
ss->watchdog_rx_done = ss->rx_done.cnt;
}
/* if we've sent or received no traffic, poll the NIC to
* ensure it is still there. Otherwise, we risk not noticing
* an error in a timely fashion */
if (busy_slice_cnt == 0) {
pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
if ((cmd & PCI_COMMAND_MASTER) == 0) {
reset_needed = 1;
}
} }
mgp->watchdog_pause = rx_pause_cnt; mgp->watchdog_pause = rx_pause_cnt;