Diffstat (limited to 'arch/powerpc/platforms/pseries')
-rw-r--r-- | arch/powerpc/platforms/pseries/Makefile | 5
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh.c | 543
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_cache.c | 59
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_dev.c | 14
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_driver.c | 310
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_event.c | 54
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_pe.c | 652
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_pseries.c | 247
-rw-r--r-- | arch/powerpc/platforms/pseries/eeh_sysfs.c | 9
-rw-r--r-- | arch/powerpc/platforms/pseries/hotplug-memory.c | 13
-rw-r--r-- | arch/powerpc/platforms/pseries/iommu.c | 12
-rw-r--r-- | arch/powerpc/platforms/pseries/lpar.c | 77
-rw-r--r-- | arch/powerpc/platforms/pseries/msi.c | 26
-rw-r--r-- | arch/powerpc/platforms/pseries/pci.c | 2
-rw-r--r-- | arch/powerpc/platforms/pseries/pci_dlpar.c | 32
-rw-r--r-- | arch/powerpc/platforms/pseries/setup.c | 22 |
16 files changed, 1276 insertions, 801 deletions
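Taken together, the series moves EEH error handling from per-device-node state (struct eeh_dev plus the old EEH_MODE_* flags) to per Partitionable Endpoint state (the new struct eeh_pe in eeh_pe.c): the platform ops (set_option, get_state, reset, configure_bridge, get_log) now take a PE, and walking the devices of a PE is done through callbacks passed to eeh_pe_dev_traverse() instead of pci_walk_bus(). As a rough illustration of that callback convention, the following standalone C sketch models eeh_pe_dev_traverse() and eeh_set_dev_freset() from the patch; the types and helpers are simplified stand-ins, not the kernel's:

/*
 * Minimal userspace model of the traversal-callback pattern the patch
 * introduces: the walker calls fn(device, flag) for every device of a
 * PE and stops early if the callback returns non-NULL.
 */
#include <stdio.h>
#include <stddef.h>

struct edev { int needs_freset; };            /* stand-in for struct eeh_dev/pci_dev */

typedef void *(*traverse_func)(void *data, void *flag);

/* Model of eeh_pe_dev_traverse(): apply fn to every device of the PE. */
static void *pe_dev_traverse(struct edev *devs, int n,
                             traverse_func fn, void *flag)
{
        for (int i = 0; i < n; i++) {
                void *ret = fn(&devs[i], flag);
                if (ret)
                        return ret;
        }
        return NULL;
}

/* Mirrors eeh_set_dev_freset(): OR together every device's preference
 * so that one request for a fundamental reset wins over hot reset. */
static void *set_dev_freset(void *data, void *flag)
{
        struct edev *edev = data;
        unsigned int *freset = flag;

        *freset |= edev->needs_freset;
        return NULL;
}

int main(void)
{
        struct edev devs[] = { { 0 }, { 1 }, { 0 } };
        unsigned int freset = 0;

        pe_dev_traverse(devs, 3, set_dev_freset, &freset);
        printf("%s reset\n", freset ? "fundamental" : "hot");
        return 0;
}

The same data/flag signature is what lets eeh_report_error(), eeh_report_mmio_enabled(), eeh_report_reset(), eeh_report_resume() and eeh_report_failure() in eeh_driver.c all be driven by the one traversal helper.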
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index c222189f5bb2..890622b87c8f 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -6,8 +6,9 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \ firmware.o power.o dlpar.o mobility.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SCANLOG) += scanlog.o -obj-$(CONFIG_EEH) += eeh.o eeh_dev.o eeh_cache.o eeh_driver.o \ - eeh_event.o eeh_sysfs.o eeh_pseries.o +obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ + eeh_driver.o eeh_event.o eeh_sysfs.o \ + eeh_pseries.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_PCI) += pci.o pci_dlpar.o obj-$(CONFIG_PSERIES_MSI) += msi.o diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index ecd394cf34e6..9a04322b1736 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -92,6 +92,20 @@ struct eeh_ops *eeh_ops = NULL; int eeh_subsystem_enabled; EXPORT_SYMBOL(eeh_subsystem_enabled); +/* + * EEH probe mode support. The intention is to support multiple + * platforms for EEH. Some platforms like pSeries do PCI emunation + * based on device tree. However, other platforms like powernv probe + * PCI devices from hardware. The flag is used to distinguish that. + * In addition, struct eeh_ops::probe would be invoked for particular + * OF node or PCI device so that the corresponding PE would be created + * there. + */ +int eeh_probe_mode; + +/* Global EEH mutex */ +DEFINE_MUTEX(eeh_mutex); + /* Lock to avoid races due to multiple reports of an error */ static DEFINE_RAW_SPINLOCK(confirm_error_lock); @@ -204,22 +218,12 @@ static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) } } - /* Gather status on devices under the bridge */ - if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) { - struct device_node *child; - - for_each_child_of_node(dn, child) { - if (of_node_to_eeh_dev(child)) - n += eeh_gather_pci_data(of_node_to_eeh_dev(child), buf+n, len-n); - } - } - return n; } /** * eeh_slot_error_detail - Generate combined log including driver log and error log - * @edev: device to report error log for + * @pe: EEH PE * @severity: temporary or permanent error log * * This routine should be called to generate the combined log, which @@ -227,17 +231,22 @@ static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) * out from the config space of the corresponding PCI device, while * the error log is fetched through platform dependent function call. 
*/ -void eeh_slot_error_detail(struct eeh_dev *edev, int severity) +void eeh_slot_error_detail(struct eeh_pe *pe, int severity) { size_t loglen = 0; - pci_regs_buf[0] = 0; + struct eeh_dev *edev; - eeh_pci_enable(edev, EEH_OPT_THAW_MMIO); - eeh_ops->configure_bridge(eeh_dev_to_of_node(edev)); - eeh_restore_bars(edev); - loglen = eeh_gather_pci_data(edev, pci_regs_buf, EEH_PCI_REGS_LOG_LEN); + eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + eeh_ops->configure_bridge(pe); + eeh_pe_restore_bars(pe); - eeh_ops->get_log(eeh_dev_to_of_node(edev), severity, pci_regs_buf, loglen); + pci_regs_buf[0] = 0; + eeh_pe_for_each_dev(pe, edev) { + loglen += eeh_gather_pci_data(edev, pci_regs_buf, + EEH_PCI_REGS_LOG_LEN); + } + + eeh_ops->get_log(pe, severity, pci_regs_buf, loglen); } /** @@ -261,126 +270,8 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) } /** - * eeh_find_device_pe - Retrieve the PE for the given device - * @dn: device node - * - * Return the PE under which this device lies - */ -struct device_node *eeh_find_device_pe(struct device_node *dn) -{ - while (dn->parent && of_node_to_eeh_dev(dn->parent) && - (of_node_to_eeh_dev(dn->parent)->mode & EEH_MODE_SUPPORTED)) { - dn = dn->parent; - } - return dn; -} - -/** - * __eeh_mark_slot - Mark all child devices as failed - * @parent: parent device - * @mode_flag: failure flag - * - * Mark all devices that are children of this device as failed. - * Mark the device driver too, so that it can see the failure - * immediately; this is critical, since some drivers poll - * status registers in interrupts ... If a driver is polling, - * and the slot is frozen, then the driver can deadlock in - * an interrupt context, which is bad. - */ -static void __eeh_mark_slot(struct device_node *parent, int mode_flag) -{ - struct device_node *dn; - - for_each_child_of_node(parent, dn) { - if (of_node_to_eeh_dev(dn)) { - /* Mark the pci device driver too */ - struct pci_dev *dev = of_node_to_eeh_dev(dn)->pdev; - - of_node_to_eeh_dev(dn)->mode |= mode_flag; - - if (dev && dev->driver) - dev->error_state = pci_channel_io_frozen; - - __eeh_mark_slot(dn, mode_flag); - } - } -} - -/** - * eeh_mark_slot - Mark the indicated device and its children as failed - * @dn: parent device - * @mode_flag: failure flag - * - * Mark the indicated device and its child devices as failed. - * The device drivers are marked as failed as well. - */ -void eeh_mark_slot(struct device_node *dn, int mode_flag) -{ - struct pci_dev *dev; - dn = eeh_find_device_pe(dn); - - /* Back up one, since config addrs might be shared */ - if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent)) - dn = dn->parent; - - of_node_to_eeh_dev(dn)->mode |= mode_flag; - - /* Mark the pci device too */ - dev = of_node_to_eeh_dev(dn)->pdev; - if (dev) - dev->error_state = pci_channel_io_frozen; - - __eeh_mark_slot(dn, mode_flag); -} - -/** - * __eeh_clear_slot - Clear failure flag for the child devices - * @parent: parent device - * @mode_flag: flag to be cleared - * - * Clear failure flag for the child devices. 
- */ -static void __eeh_clear_slot(struct device_node *parent, int mode_flag) -{ - struct device_node *dn; - - for_each_child_of_node(parent, dn) { - if (of_node_to_eeh_dev(dn)) { - of_node_to_eeh_dev(dn)->mode &= ~mode_flag; - of_node_to_eeh_dev(dn)->check_count = 0; - __eeh_clear_slot(dn, mode_flag); - } - } -} - -/** - * eeh_clear_slot - Clear failure flag for the indicated device and its children - * @dn: parent device - * @mode_flag: flag to be cleared - * - * Clear failure flag for the indicated device and its children. - */ -void eeh_clear_slot(struct device_node *dn, int mode_flag) -{ - unsigned long flags; - raw_spin_lock_irqsave(&confirm_error_lock, flags); - - dn = eeh_find_device_pe(dn); - - /* Back up one, since config addrs might be shared */ - if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent)) - dn = dn->parent; - - of_node_to_eeh_dev(dn)->mode &= ~mode_flag; - of_node_to_eeh_dev(dn)->check_count = 0; - __eeh_clear_slot(dn, mode_flag); - raw_spin_unlock_irqrestore(&confirm_error_lock, flags); -} - -/** - * eeh_dn_check_failure - Check if all 1's data is due to EEH slot freeze - * @dn: device node - * @dev: pci device, if known + * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze + * @edev: eeh device * * Check for an EEH failure for the given device node. Call this * routine if the result of a read was all 0xff's and you want to @@ -392,11 +283,13 @@ void eeh_clear_slot(struct device_node *dn, int mode_flag) * * It is safe to call this routine in an interrupt context. */ -int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) +int eeh_dev_check_failure(struct eeh_dev *edev) { int ret; unsigned long flags; - struct eeh_dev *edev; + struct device_node *dn; + struct pci_dev *dev; + struct eeh_pe *pe; int rc = 0; const char *location; @@ -405,23 +298,23 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) if (!eeh_subsystem_enabled) return 0; - if (!dn) { + if (!edev) { eeh_stats.no_dn++; return 0; } - dn = eeh_find_device_pe(dn); - edev = of_node_to_eeh_dev(dn); + dn = eeh_dev_to_of_node(edev); + dev = eeh_dev_to_pci_dev(edev); + pe = edev->pe; /* Access to IO BARs might get this far and still not want checking. */ - if (!(edev->mode & EEH_MODE_SUPPORTED) || - edev->mode & EEH_MODE_NOCHECK) { + if (!pe) { eeh_stats.ignored_check++; - pr_debug("EEH: Ignored check (%x) for %s %s\n", - edev->mode, eeh_pci_name(dev), dn->full_name); + pr_debug("EEH: Ignored check for %s %s\n", + eeh_pci_name(dev), dn->full_name); return 0; } - if (!edev->config_addr && !edev->pe_config_addr) { + if (!pe->addr && !pe->config_addr) { eeh_stats.no_cfg_addr++; return 0; } @@ -434,13 +327,13 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) */ raw_spin_lock_irqsave(&confirm_error_lock, flags); rc = 1; - if (edev->mode & EEH_MODE_ISOLATED) { - edev->check_count++; - if (edev->check_count % EEH_MAX_FAILS == 0) { + if (pe->state & EEH_PE_ISOLATED) { + pe->check_count++; + if (pe->check_count % EEH_MAX_FAILS == 0) { location = of_get_property(dn, "ibm,loc-code", NULL); printk(KERN_ERR "EEH: %d reads ignored for recovering device at " "location=%s driver=%s pci addr=%s\n", - edev->check_count, location, + pe->check_count, location, eeh_driver_name(dev), eeh_pci_name(dev)); printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n", eeh_driver_name(dev)); @@ -456,7 +349,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) * function zero of a multi-function device. 
* In any case they must share a common PHB. */ - ret = eeh_ops->get_state(dn, NULL); + ret = eeh_ops->get_state(pe, NULL); /* Note that config-io to empty slots may fail; * they are empty when they don't have children. @@ -469,7 +362,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) { eeh_stats.false_positives++; - edev->false_positives ++; + pe->false_positives++; rc = 0; goto dn_unlock; } @@ -480,10 +373,10 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) * with other functions on this device, and functions under * bridges. */ - eeh_mark_slot(dn, EEH_MODE_ISOLATED); + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); raw_spin_unlock_irqrestore(&confirm_error_lock, flags); - eeh_send_failure_event(edev); + eeh_send_failure_event(pe); /* Most EEH events are due to device driver bugs. Having * a stack trace will help the device-driver authors figure @@ -497,7 +390,7 @@ dn_unlock: return rc; } -EXPORT_SYMBOL_GPL(eeh_dn_check_failure); +EXPORT_SYMBOL_GPL(eeh_dev_check_failure); /** * eeh_check_failure - Check if all 1's data is due to EEH slot freeze @@ -514,21 +407,19 @@ EXPORT_SYMBOL_GPL(eeh_dn_check_failure); unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) { unsigned long addr; - struct pci_dev *dev; - struct device_node *dn; + struct eeh_dev *edev; /* Finding the phys addr + pci device; this is pretty quick. */ addr = eeh_token_to_phys((unsigned long __force) token); - dev = pci_addr_cache_get_device(addr); - if (!dev) { + edev = eeh_addr_cache_get_dev(addr); + if (!edev) { eeh_stats.no_device++; return val; } - dn = pci_device_to_OF_node(dev); - eeh_dn_check_failure(dn, dev); + eeh_dev_check_failure(edev); - pci_dev_put(dev); + pci_dev_put(eeh_dev_to_pci_dev(edev)); return val; } @@ -537,23 +428,22 @@ EXPORT_SYMBOL(eeh_check_failure); /** * eeh_pci_enable - Enable MMIO or DMA transfers for this slot - * @edev: pci device node + * @pe: EEH PE * * This routine should be called to reenable frozen MMIO or DMA * so that it would work correctly again. It's useful while doing * recovery or log collection on the indicated device. 
*/ -int eeh_pci_enable(struct eeh_dev *edev, int function) +int eeh_pci_enable(struct eeh_pe *pe, int function) { int rc; - struct device_node *dn = eeh_dev_to_of_node(edev); - rc = eeh_ops->set_option(dn, function); + rc = eeh_ops->set_option(pe, function); if (rc) - printk(KERN_WARNING "EEH: Unexpected state change %d, err=%d dn=%s\n", - function, rc, dn->full_name); + pr_warning("%s: Unexpected state change %d on PHB#%d-PE#%x, err=%d\n", + __func__, function, pe->phb->global_number, pe->addr, rc); - rc = eeh_ops->wait_state(dn, PCI_BUS_RESET_WAIT_MSEC); + rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (rc > 0 && (rc & EEH_STATE_MMIO_ENABLED) && (function == EEH_OPT_THAW_MMIO)) return 0; @@ -571,17 +461,24 @@ int eeh_pci_enable(struct eeh_dev *edev, int function) */ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) { - struct device_node *dn = pci_device_to_OF_node(dev); + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + struct eeh_pe *pe = edev->pe; + + if (!pe) { + pr_err("%s: No PE found on PCI device %s\n", + __func__, pci_name(dev)); + return -EINVAL; + } switch (state) { case pcie_deassert_reset: - eeh_ops->reset(dn, EEH_RESET_DEACTIVATE); + eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); break; case pcie_hot_reset: - eeh_ops->reset(dn, EEH_RESET_HOT); + eeh_ops->reset(pe, EEH_RESET_HOT); break; case pcie_warm_reset: - eeh_ops->reset(dn, EEH_RESET_FUNDAMENTAL); + eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); break; default: return -EINVAL; @@ -591,66 +488,37 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat } /** - * __eeh_set_pe_freset - Check the required reset for child devices - * @parent: parent device - * @freset: return value - * - * Each device might have its preferred reset type: fundamental or - * hot reset. The routine is used to collect the information from - * the child devices so that they could be reset accordingly. - */ -void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset) -{ - struct device_node *dn; - - for_each_child_of_node(parent, dn) { - if (of_node_to_eeh_dev(dn)) { - struct pci_dev *dev = of_node_to_eeh_dev(dn)->pdev; - - if (dev && dev->driver) - *freset |= dev->needs_freset; - - __eeh_set_pe_freset(dn, freset); - } - } -} - -/** - * eeh_set_pe_freset - Check the required reset for the indicated device and its children - * @dn: parent device - * @freset: return value + * eeh_set_pe_freset - Check the required reset for the indicated device + * @data: EEH device + * @flag: return value * * Each device might have its preferred reset type: fundamental or * hot reset. The routine is used to collected the information for * the indicated device and its children so that the bunch of the * devices could be reset properly. */ -void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset) +static void *eeh_set_dev_freset(void *data, void *flag) { struct pci_dev *dev; - dn = eeh_find_device_pe(dn); - - /* Back up one, since config addrs might be shared */ - if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent)) - dn = dn->parent; + unsigned int *freset = (unsigned int *)flag; + struct eeh_dev *edev = (struct eeh_dev *)data; - dev = of_node_to_eeh_dev(dn)->pdev; + dev = eeh_dev_to_pci_dev(edev); if (dev) *freset |= dev->needs_freset; - __eeh_set_pe_freset(dn, freset); + return NULL; } /** * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second - * @edev: pci device node to be reset. + * @pe: EEH PE * * Assert the PCI #RST line for 1/4 second. 
*/ -static void eeh_reset_pe_once(struct eeh_dev *edev) +static void eeh_reset_pe_once(struct eeh_pe *pe) { unsigned int freset = 0; - struct device_node *dn = eeh_dev_to_of_node(edev); /* Determine type of EEH reset required for * Partitionable Endpoint, a hot-reset (1) @@ -658,12 +526,12 @@ static void eeh_reset_pe_once(struct eeh_dev *edev) * A fundamental reset required by any device under * Partitionable Endpoint trumps hot-reset. */ - eeh_set_pe_freset(dn, &freset); + eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset); if (freset) - eeh_ops->reset(dn, EEH_RESET_FUNDAMENTAL); + eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); else - eeh_ops->reset(dn, EEH_RESET_HOT); + eeh_ops->reset(pe, EEH_RESET_HOT); /* The PCI bus requires that the reset be held high for at least * a 100 milliseconds. We wait a bit longer 'just in case'. @@ -675,9 +543,9 @@ static void eeh_reset_pe_once(struct eeh_dev *edev) * pci slot reset line is dropped. Make sure we don't miss * these, and clear the flag now. */ - eeh_clear_slot(dn, EEH_MODE_ISOLATED); + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); - eeh_ops->reset(dn, EEH_RESET_DEACTIVATE); + eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); /* After a PCI slot has been reset, the PCI Express spec requires * a 1.5 second idle time for the bus to stabilize, before starting @@ -689,116 +557,36 @@ static void eeh_reset_pe_once(struct eeh_dev *edev) /** * eeh_reset_pe - Reset the indicated PE - * @edev: PCI device associated EEH device + * @pe: EEH PE * * This routine should be called to reset indicated device, including * PE. A PE might include multiple PCI devices and sometimes PCI bridges * might be involved as well. */ -int eeh_reset_pe(struct eeh_dev *edev) +int eeh_reset_pe(struct eeh_pe *pe) { int i, rc; - struct device_node *dn = eeh_dev_to_of_node(edev); /* Take three shots at resetting the bus */ for (i=0; i<3; i++) { - eeh_reset_pe_once(edev); + eeh_reset_pe_once(pe); - rc = eeh_ops->wait_state(dn, PCI_BUS_RESET_WAIT_MSEC); + rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); if (rc == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) return 0; if (rc < 0) { - printk(KERN_ERR "EEH: unrecoverable slot failure %s\n", - dn->full_name); + pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x", + __func__, pe->phb->global_number, pe->addr); return -1; } - printk(KERN_ERR "EEH: bus reset %d failed on slot %s, rc=%d\n", - i+1, dn->full_name, rc); + pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n", + i+1, pe->phb->global_number, pe->addr, rc); } return -1; } -/** Save and restore of PCI BARs - * - * Although firmware will set up BARs during boot, it doesn't - * set up device BAR's after a device reset, although it will, - * if requested, set up bridge configuration. Thus, we need to - * configure the PCI devices ourselves. - */ - -/** - * eeh_restore_one_device_bars - Restore the Base Address Registers for one device - * @edev: PCI device associated EEH device - * - * Loads the PCI configuration space base address registers, - * the expansion ROM base address, the latency timer, and etc. - * from the saved values in the device node. 
- */ -static inline void eeh_restore_one_device_bars(struct eeh_dev *edev) -{ - int i; - u32 cmd; - struct device_node *dn = eeh_dev_to_of_node(edev); - - if (!edev->phb) - return; - - for (i=4; i<10; i++) { - eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); - } - - /* 12 == Expansion ROM Address */ - eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]); - -#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) -#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)]) - - eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, - SAVED_BYTE(PCI_CACHE_LINE_SIZE)); - - eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, - SAVED_BYTE(PCI_LATENCY_TIMER)); - - /* max latency, min grant, interrupt pin and line */ - eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); - - /* Restore PERR & SERR bits, some devices require it, - * don't touch the other command bits - */ - eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd); - if (edev->config_space[1] & PCI_COMMAND_PARITY) - cmd |= PCI_COMMAND_PARITY; - else - cmd &= ~PCI_COMMAND_PARITY; - if (edev->config_space[1] & PCI_COMMAND_SERR) - cmd |= PCI_COMMAND_SERR; - else - cmd &= ~PCI_COMMAND_SERR; - eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd); -} - -/** - * eeh_restore_bars - Restore the PCI config space info - * @edev: EEH device - * - * This routine performs a recursive walk to the children - * of this device as well. - */ -void eeh_restore_bars(struct eeh_dev *edev) -{ - struct device_node *dn; - if (!edev) - return; - - if ((edev->mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(edev->class_code)) - eeh_restore_one_device_bars(edev); - - for_each_child_of_node(eeh_dev_to_of_node(edev), dn) - eeh_restore_bars(of_node_to_eeh_dev(dn)); -} - /** * eeh_save_bars - Save device bars * @edev: PCI device associated EEH device @@ -808,7 +596,7 @@ void eeh_restore_bars(struct eeh_dev *edev) * PCI devices are added individually; but, for the restore, * an entire slot is reset at a time. */ -static void eeh_save_bars(struct eeh_dev *edev) +void eeh_save_bars(struct eeh_dev *edev) { int i; struct device_node *dn; @@ -822,102 +610,6 @@ static void eeh_save_bars(struct eeh_dev *edev) } /** - * eeh_early_enable - Early enable EEH on the indicated device - * @dn: device node - * @data: BUID - * - * Enable EEH functionality on the specified PCI device. The function - * is expected to be called before real PCI probing is done. However, - * the PHBs have been initialized at this point. - */ -static void *eeh_early_enable(struct device_node *dn, void *data) -{ - int ret; - const u32 *class_code = of_get_property(dn, "class-code", NULL); - const u32 *vendor_id = of_get_property(dn, "vendor-id", NULL); - const u32 *device_id = of_get_property(dn, "device-id", NULL); - const u32 *regs; - int enable; - struct eeh_dev *edev = of_node_to_eeh_dev(dn); - - edev->class_code = 0; - edev->mode = 0; - edev->check_count = 0; - edev->freeze_count = 0; - edev->false_positives = 0; - - if (!of_device_is_available(dn)) - return NULL; - - /* Ignore bad nodes. */ - if (!class_code || !vendor_id || !device_id) - return NULL; - - /* There is nothing to check on PCI to ISA bridges */ - if (dn->type && !strcmp(dn->type, "isa")) { - edev->mode |= EEH_MODE_NOCHECK; - return NULL; - } - edev->class_code = *class_code; - - /* Ok... see if this device supports EEH. Some do, some don't, - * and the only way to find out is to check each and every one. 
- */ - regs = of_get_property(dn, "reg", NULL); - if (regs) { - /* First register entry is addr (00BBSS00) */ - /* Try to enable eeh */ - ret = eeh_ops->set_option(dn, EEH_OPT_ENABLE); - - enable = 0; - if (ret == 0) { - edev->config_addr = regs[0]; - - /* If the newer, better, ibm,get-config-addr-info is supported, - * then use that instead. - */ - edev->pe_config_addr = eeh_ops->get_pe_addr(dn); - - /* Some older systems (Power4) allow the - * ibm,set-eeh-option call to succeed even on nodes - * where EEH is not supported. Verify support - * explicitly. - */ - ret = eeh_ops->get_state(dn, NULL); - if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT) - enable = 1; - } - - if (enable) { - eeh_subsystem_enabled = 1; - edev->mode |= EEH_MODE_SUPPORTED; - - pr_debug("EEH: %s: eeh enabled, config=%x pe_config=%x\n", - dn->full_name, edev->config_addr, - edev->pe_config_addr); - } else { - - /* This device doesn't support EEH, but it may have an - * EEH parent, in which case we mark it as supported. - */ - if (dn->parent && of_node_to_eeh_dev(dn->parent) && - (of_node_to_eeh_dev(dn->parent)->mode & EEH_MODE_SUPPORTED)) { - /* Parent supports EEH. */ - edev->mode |= EEH_MODE_SUPPORTED; - edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr; - return NULL; - } - } - } else { - printk(KERN_WARNING "EEH: %s: unable to get reg property.\n", - dn->full_name); - } - - eeh_save_bars(edev); - return NULL; -} - -/** * eeh_ops_register - Register platform dependent EEH operations * @ops: platform dependent EEH operations * @@ -982,7 +674,7 @@ int __exit eeh_ops_unregister(const char *name) * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. */ -void __init eeh_init(void) +static int __init eeh_init(void) { struct pci_controller *hose, *tmp; struct device_node *phb; @@ -992,27 +684,34 @@ void __init eeh_init(void) if (!eeh_ops) { pr_warning("%s: Platform EEH operation not found\n", __func__); - return; + return -EEXIST; } else if ((ret = eeh_ops->init())) { pr_warning("%s: Failed to call platform init function (%d)\n", __func__, ret); - return; + return ret; } raw_spin_lock_init(&confirm_error_lock); /* Enable EEH for all adapters */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - phb = hose->dn; - traverse_pci_devices(phb, eeh_early_enable, NULL); + if (eeh_probe_mode_devtree()) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb = hose->dn; + traverse_pci_devices(phb, eeh_ops->of_probe, NULL); + } } if (eeh_subsystem_enabled) - printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n"); + pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); else - printk(KERN_WARNING "EEH: No capable adapters found\n"); + pr_warning("EEH: No capable adapters found\n"); + + return ret; } +core_initcall_sync(eeh_init); + /** * eeh_add_device_early - Enable EEH for the indicated device_node * @dn: device node for which to set up EEH @@ -1029,7 +728,7 @@ static void eeh_add_device_early(struct device_node *dn) { struct pci_controller *phb; - if (!dn || !of_node_to_eeh_dev(dn)) + if (!of_node_to_eeh_dev(dn)) return; phb = of_node_to_eeh_dev(dn)->phb; @@ -1037,7 +736,8 @@ static void eeh_add_device_early(struct device_node *dn) if (NULL == phb || 0 == phb->buid) return; - eeh_early_enable(dn, NULL); + /* FIXME: hotplug support on POWERNV */ + eeh_ops->of_probe(dn, NULL); } /** @@ -1087,7 +787,7 @@ static void eeh_add_device_late(struct pci_dev *dev) edev->pdev = dev; dev->dev.archdata.edev = edev; - 
pci_addr_cache_insert_device(dev); + eeh_addr_cache_insert_dev(dev); eeh_sysfs_add_device(dev); } @@ -1117,6 +817,7 @@ EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); /** * eeh_remove_device - Undo EEH setup for the indicated pci device * @dev: pci device to be removed + * @purge_pe: remove the PE or not * * This routine should be called when a device is removed from * a running system (e.g. by hotplug or dlpar). It unregisters @@ -1124,7 +825,7 @@ EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); * this device will no longer be detected after this call; thus, * i/o errors affecting this slot may leave this device unusable. */ -static void eeh_remove_device(struct pci_dev *dev) +static void eeh_remove_device(struct pci_dev *dev, int purge_pe) { struct eeh_dev *edev; @@ -1143,28 +844,30 @@ static void eeh_remove_device(struct pci_dev *dev) dev->dev.archdata.edev = NULL; pci_dev_put(dev); - pci_addr_cache_remove_device(dev); + eeh_rmv_from_parent_pe(edev, purge_pe); + eeh_addr_cache_rmv_dev(dev); eeh_sysfs_remove_device(dev); } /** * eeh_remove_bus_device - Undo EEH setup for the indicated PCI device * @dev: PCI device + * @purge_pe: remove the corresponding PE or not * * This routine must be called when a device is removed from the * running system through hotplug or dlpar. The corresponding * PCI address cache will be removed. */ -void eeh_remove_bus_device(struct pci_dev *dev) +void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) { struct pci_bus *bus = dev->subordinate; struct pci_dev *child, *tmp; - eeh_remove_device(dev); + eeh_remove_device(dev, purge_pe); if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { list_for_each_entry_safe(child, tmp, &bus->devices, bus_list) - eeh_remove_bus_device(child); + eeh_remove_bus_device(child, purge_pe); } } EXPORT_SYMBOL_GPL(eeh_remove_bus_device); diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c index e5ae1c687c66..5a4c87903057 100644 --- a/arch/powerpc/platforms/pseries/eeh_cache.c +++ b/arch/powerpc/platforms/pseries/eeh_cache.c @@ -50,6 +50,7 @@ struct pci_io_addr_range { struct rb_node rb_node; unsigned long addr_lo; unsigned long addr_hi; + struct eeh_dev *edev; struct pci_dev *pcidev; unsigned int flags; }; @@ -59,7 +60,7 @@ static struct pci_io_addr_cache { spinlock_t piar_lock; } pci_io_addr_cache_root; -static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr) +static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr) { struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; @@ -74,7 +75,7 @@ static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr) n = n->rb_right; } else { pci_dev_get(piar->pcidev); - return piar->pcidev; + return piar->edev; } } } @@ -83,7 +84,7 @@ static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr) } /** - * pci_addr_cache_get_device - Get device, given only address + * eeh_addr_cache_get_dev - Get device, given only address * @addr: mmio (PIO) phys address or i/o port number * * Given an mmio phys address, or a port number, find a pci device @@ -92,15 +93,15 @@ static inline struct pci_dev *__pci_addr_cache_get_device(unsigned long addr) * from zero (that is, they do *not* have pci_io_addr added in). * It is safe to call this function within an interrupt. 
*/ -struct pci_dev *pci_addr_cache_get_device(unsigned long addr) +struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr) { - struct pci_dev *dev; + struct eeh_dev *edev; unsigned long flags; spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - dev = __pci_addr_cache_get_device(addr); + edev = __eeh_addr_cache_get_device(addr); spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); - return dev; + return edev; } #ifdef DEBUG @@ -108,7 +109,7 @@ struct pci_dev *pci_addr_cache_get_device(unsigned long addr) * Handy-dandy debug print routine, does nothing more * than print out the contents of our addr cache. */ -static void pci_addr_cache_print(struct pci_io_addr_cache *cache) +static void eeh_addr_cache_print(struct pci_io_addr_cache *cache) { struct rb_node *n; int cnt = 0; @@ -117,7 +118,7 @@ static void pci_addr_cache_print(struct pci_io_addr_cache *cache) while (n) { struct pci_io_addr_range *piar; piar = rb_entry(n, struct pci_io_addr_range, rb_node); - printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n", + pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n", (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); cnt++; @@ -128,7 +129,7 @@ static void pci_addr_cache_print(struct pci_io_addr_cache *cache) /* Insert address range into the rb tree. */ static struct pci_io_addr_range * -pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo, +eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo, unsigned long ahi, unsigned int flags) { struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; @@ -146,23 +147,24 @@ pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo, } else { if (dev != piar->pcidev || alo != piar->addr_lo || ahi != piar->addr_hi) { - printk(KERN_WARNING "PIAR: overlapping address range\n"); + pr_warning("PIAR: overlapping address range\n"); } return piar; } } - piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); + piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); if (!piar) return NULL; pci_dev_get(dev); piar->addr_lo = alo; piar->addr_hi = ahi; + piar->edev = pci_dev_to_eeh_dev(dev); piar->pcidev = dev; piar->flags = flags; #ifdef DEBUG - printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", + pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n", alo, ahi, pci_name(dev)); #endif @@ -172,7 +174,7 @@ pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo, return piar; } -static void __pci_addr_cache_insert_device(struct pci_dev *dev) +static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) { struct device_node *dn; struct eeh_dev *edev; @@ -180,7 +182,7 @@ static void __pci_addr_cache_insert_device(struct pci_dev *dev) dn = pci_device_to_OF_node(dev); if (!dn) { - printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev)); + pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev)); return; } @@ -192,8 +194,7 @@ static void __pci_addr_cache_insert_device(struct pci_dev *dev) } /* Skip any devices for which EEH is not enabled. 
*/ - if (!(edev->mode & EEH_MODE_SUPPORTED) || - edev->mode & EEH_MODE_NOCHECK) { + if (!edev->pe) { #ifdef DEBUG pr_info("PCI: skip building address cache for=%s - %s\n", pci_name(dev), dn->full_name); @@ -212,19 +213,19 @@ static void __pci_addr_cache_insert_device(struct pci_dev *dev) continue; if (start == 0 || ~start == 0 || end == 0 || ~end == 0) continue; - pci_addr_cache_insert(dev, start, end, flags); + eeh_addr_cache_insert(dev, start, end, flags); } } /** - * pci_addr_cache_insert_device - Add a device to the address cache + * eeh_addr_cache_insert_dev - Add a device to the address cache * @dev: PCI device whose I/O addresses we are interested in. * * In order to support the fast lookup of devices based on addresses, * we maintain a cache of devices that can be quickly searched. * This routine adds a device to that cache. */ -void pci_addr_cache_insert_device(struct pci_dev *dev) +void eeh_addr_cache_insert_dev(struct pci_dev *dev) { unsigned long flags; @@ -233,11 +234,11 @@ void pci_addr_cache_insert_device(struct pci_dev *dev) return; spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - __pci_addr_cache_insert_device(dev); + __eeh_addr_cache_insert_dev(dev); spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); } -static inline void __pci_addr_cache_remove_device(struct pci_dev *dev) +static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev) { struct rb_node *n; @@ -258,7 +259,7 @@ restart: } /** - * pci_addr_cache_remove_device - remove pci device from addr cache + * eeh_addr_cache_rmv_dev - remove pci device from addr cache * @dev: device to remove * * Remove a device from the addr-cache tree. @@ -266,17 +267,17 @@ restart: * the tree multiple times (once per resource). * But so what; device removal doesn't need to be that fast. */ -void pci_addr_cache_remove_device(struct pci_dev *dev) +void eeh_addr_cache_rmv_dev(struct pci_dev *dev) { unsigned long flags; spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); - __pci_addr_cache_remove_device(dev); + __eeh_addr_cache_rmv_dev(dev); spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); } /** - * pci_addr_cache_build - Build a cache of I/O addresses + * eeh_addr_cache_build - Build a cache of I/O addresses * * Build a cache of pci i/o addresses. This cache will be used to * find the pci device that corresponds to a given address. @@ -284,7 +285,7 @@ void pci_addr_cache_remove_device(struct pci_dev *dev) * Must be run late in boot process, after the pci controllers * have been scanned for devices (after all device resources are known). */ -void __init pci_addr_cache_build(void) +void __init eeh_addr_cache_build(void) { struct device_node *dn; struct eeh_dev *edev; @@ -293,7 +294,7 @@ void __init pci_addr_cache_build(void) spin_lock_init(&pci_io_addr_cache_root.piar_lock); for_each_pci_dev(dev) { - pci_addr_cache_insert_device(dev); + eeh_addr_cache_insert_dev(dev); dn = pci_device_to_OF_node(dev); if (!dn) @@ -312,7 +313,7 @@ void __init pci_addr_cache_build(void) #ifdef DEBUG /* Verify tree built up above, echo back the list of addrs. 
*/ - pci_addr_cache_print(&pci_io_addr_cache_root); + eeh_addr_cache_print(&pci_io_addr_cache_root); #endif } diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/platforms/pseries/eeh_dev.c index c4507d095900..66442341d3a6 100644 --- a/arch/powerpc/platforms/pseries/eeh_dev.c +++ b/arch/powerpc/platforms/pseries/eeh_dev.c @@ -55,7 +55,7 @@ void * __devinit eeh_dev_init(struct device_node *dn, void *data) struct eeh_dev *edev; /* Allocate EEH device */ - edev = zalloc_maybe_bootmem(sizeof(*edev), GFP_KERNEL); + edev = kzalloc(sizeof(*edev), GFP_KERNEL); if (!edev) { pr_warning("%s: out of memory\n", __func__); return NULL; @@ -65,6 +65,7 @@ void * __devinit eeh_dev_init(struct device_node *dn, void *data) PCI_DN(dn)->edev = edev; edev->dn = dn; edev->phb = phb; + INIT_LIST_HEAD(&edev->list); return NULL; } @@ -80,6 +81,9 @@ void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb) { struct device_node *dn = phb->dn; + /* EEH PE for PHB */ + eeh_phb_pe_create(phb); + /* EEH device for PHB */ eeh_dev_init(dn, phb); @@ -93,10 +97,16 @@ void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb) * Scan all the existing PHBs and create EEH devices for their OF * nodes and their children OF nodes */ -void __init eeh_dev_phb_init(void) +static int __init eeh_dev_phb_init(void) { struct pci_controller *phb, *tmp; list_for_each_entry_safe(phb, tmp, &hose_list, list_node) eeh_dev_phb_init_dynamic(phb); + + pr_info("EEH: devices created\n"); + + return 0; } + +core_initcall(eeh_dev_phb_init); diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index baf92cd9dfab..a3fefb61097c 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -25,6 +25,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/irq.h> +#include <linux/module.h> #include <linux/pci.h> #include <asm/eeh.h> #include <asm/eeh_event.h> @@ -47,6 +48,41 @@ static inline const char *eeh_pcid_name(struct pci_dev *pdev) return ""; } +/** + * eeh_pcid_get - Get the PCI device driver + * @pdev: PCI device + * + * The function is used to retrieve the PCI device driver for + * the indicated PCI device. Besides, we will increase the reference + * of the PCI device driver to prevent that being unloaded on + * the fly. Otherwise, kernel crash would be seen. + */ +static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return NULL; + + if (!try_module_get(pdev->driver->driver.owner)) + return NULL; + + return pdev->driver; +} + +/** + * eeh_pcid_put - Dereference on the PCI device driver + * @pdev: PCI device + * + * The function is called to do dereference on the PCI device + * driver of the indicated PCI device. 
+ */ +static inline void eeh_pcid_put(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return; + + module_put(pdev->driver->driver.owner); +} + #if 0 static void print_device_node_tree(struct pci_dn *pdn, int dent) { @@ -93,7 +129,7 @@ static void eeh_disable_irq(struct pci_dev *dev) if (!irq_has_action(dev->irq)) return; - edev->mode |= EEH_MODE_IRQ_DISABLED; + edev->mode |= EEH_DEV_IRQ_DISABLED; disable_irq_nosync(dev->irq); } @@ -108,36 +144,44 @@ static void eeh_enable_irq(struct pci_dev *dev) { struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); - if ((edev->mode) & EEH_MODE_IRQ_DISABLED) { - edev->mode &= ~EEH_MODE_IRQ_DISABLED; + if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { + edev->mode &= ~EEH_DEV_IRQ_DISABLED; enable_irq(dev->irq); } } /** * eeh_report_error - Report pci error to each device driver - * @dev: PCI device + * @data: eeh device * @userdata: return value * * Report an EEH error to each device driver, collect up and * merge the device driver responses. Cumulative response * passed back in "userdata". */ -static int eeh_report_error(struct pci_dev *dev, void *userdata) +static void *eeh_report_error(void *data, void *userdata) { + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; + struct pci_driver *driver; + /* We might not have the associated PCI device, + * then we should continue for next one. + */ + if (!dev) return NULL; dev->error_state = pci_channel_io_frozen; - if (!driver) - return 0; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; eeh_disable_irq(dev); if (!driver->err_handler || - !driver->err_handler->error_detected) - return 0; + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); + return NULL; + } rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); @@ -145,27 +189,34 @@ static int eeh_report_error(struct pci_dev *dev, void *userdata) if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; - return 0; + eeh_pcid_put(dev); + return NULL; } /** * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled - * @dev: PCI device + * @data: eeh device * @userdata: return value * * Tells each device driver that IO ports, MMIO and config space I/O * are now enabled. Collects up and merges the device driver responses. * Cumulative response passed back in "userdata". 
*/ -static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) +static void *eeh_report_mmio_enabled(void *data, void *userdata) { + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; + struct pci_driver *driver; - if (!driver || - !driver->err_handler || - !driver->err_handler->mmio_enabled) - return 0; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + if (!driver->err_handler || + !driver->err_handler->mmio_enabled) { + eeh_pcid_put(dev); + return NULL; + } rc = driver->err_handler->mmio_enabled(dev); @@ -173,12 +224,13 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; - return 0; + eeh_pcid_put(dev); + return NULL; } /** * eeh_report_reset - Tell device that slot has been reset - * @dev: PCI device + * @data: eeh device * @userdata: return value * * This routine must be called while EEH tries to reset particular @@ -186,21 +238,26 @@ static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) * some actions, usually to save data the driver needs so that the * driver can work again while the device is recovered. */ -static int eeh_report_reset(struct pci_dev *dev, void *userdata) +static void *eeh_report_reset(void *data, void *userdata) { + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); enum pci_ers_result rc, *res = userdata; - struct pci_driver *driver = dev->driver; - - if (!driver) - return 0; + struct pci_driver *driver; + if (!dev) return NULL; dev->error_state = pci_channel_io_normal; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + eeh_enable_irq(dev); if (!driver->err_handler || - !driver->err_handler->slot_reset) - return 0; + !driver->err_handler->slot_reset) { + eeh_pcid_put(dev); + return NULL; + } rc = driver->err_handler->slot_reset(dev); if ((*res == PCI_ERS_RESULT_NONE) || @@ -208,109 +265,115 @@ static int eeh_report_reset(struct pci_dev *dev, void *userdata) if (*res == PCI_ERS_RESULT_DISCONNECT && rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; - return 0; + eeh_pcid_put(dev); + return NULL; } /** * eeh_report_resume - Tell device to resume normal operations - * @dev: PCI device + * @data: eeh device * @userdata: return value * * This routine must be called to notify the device driver that it * could resume so that the device driver can do some initialization * to make the recovered device work again. */ -static int eeh_report_resume(struct pci_dev *dev, void *userdata) +static void *eeh_report_resume(void *data, void *userdata) { - struct pci_driver *driver = dev->driver; + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + if (!dev) return NULL; dev->error_state = pci_channel_io_normal; - if (!driver) - return 0; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; eeh_enable_irq(dev); if (!driver->err_handler || - !driver->err_handler->resume) - return 0; + !driver->err_handler->resume) { + eeh_pcid_put(dev); + return NULL; + } driver->err_handler->resume(dev); - return 0; + eeh_pcid_put(dev); + return NULL; } /** * eeh_report_failure - Tell device driver that device is dead. 
- * @dev: PCI device + * @data: eeh device * @userdata: return value * * This informs the device driver that the device is permanently * dead, and that no further recovery attempts will be made on it. */ -static int eeh_report_failure(struct pci_dev *dev, void *userdata) +static void *eeh_report_failure(void *data, void *userdata) { - struct pci_driver *driver = dev->driver; + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + if (!dev) return NULL; dev->error_state = pci_channel_io_perm_failure; - if (!driver) - return 0; + driver = eeh_pcid_get(dev); + if (!driver) return NULL; eeh_disable_irq(dev); if (!driver->err_handler || - !driver->err_handler->error_detected) - return 0; + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); + return NULL; + } driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); - return 0; + eeh_pcid_put(dev); + return NULL; } /** * eeh_reset_device - Perform actual reset of a pci slot - * @edev: PE associated EEH device + * @pe: EEH PE * @bus: PCI bus corresponding to the isolcated slot * * This routine must be called to do reset on the indicated PE. * During the reset, udev might be invoked because those affected * PCI devices will be removed and then added. */ -static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus) +static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) { - struct device_node *dn; int cnt, rc; /* pcibios will clear the counter; save the value */ - cnt = edev->freeze_count; + cnt = pe->freeze_count; + /* + * We don't remove the corresponding PE instances because + * we need the information afterwords. The attached EEH + * devices are expected to be attached soon when calling + * into pcibios_add_pci_devices(). + */ if (bus) - pcibios_remove_pci_devices(bus); + __pcibios_remove_pci_devices(bus, 0); /* Reset the pci controller. (Asserts RST#; resets config space). * Reconfigure bridges and devices. Don't try to bring the system * up if the reset failed for some reason. */ - rc = eeh_reset_pe(edev); + rc = eeh_reset_pe(pe); if (rc) return rc; - /* Walk over all functions on this device. */ - dn = eeh_dev_to_of_node(edev); - if (!pcibios_find_pci_bus(dn) && of_node_to_eeh_dev(dn->parent)) - dn = dn->parent->child; - - while (dn) { - struct eeh_dev *pedev = of_node_to_eeh_dev(dn); - - /* On Power4, always true because eeh_pe_config_addr=0 */ - if (edev->pe_config_addr == pedev->pe_config_addr) { - eeh_ops->configure_bridge(dn); - eeh_restore_bars(pedev); - } - dn = dn->sibling; - } + /* Restore PE */ + eeh_ops->configure_bridge(pe); + eeh_pe_restore_bars(pe); /* Give the system 5 seconds to finish running the user-space * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, @@ -322,7 +385,7 @@ static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus) ssleep(5); pcibios_add_pci_devices(bus); } - edev->freeze_count = cnt; + pe->freeze_count = cnt; return 0; } @@ -334,7 +397,7 @@ static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus) /** * eeh_handle_event - Reset a PCI device after hard lockup. - * @event: EEH event + * @pe: EEH PE * * While PHB detects address or data parity errors on particular PCI * slot, the associated PE will be frozen. Besides, DMA's occurring @@ -349,69 +412,24 @@ static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus) * drivers (which cause a second set of hotplug events to go out to * userspace). 
*/ -struct eeh_dev *handle_eeh_events(struct eeh_event *event) +void eeh_handle_event(struct eeh_pe *pe) { - struct device_node *frozen_dn; - struct eeh_dev *frozen_edev; struct pci_bus *frozen_bus; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; - const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str; - - frozen_dn = eeh_find_device_pe(eeh_dev_to_of_node(event->edev)); - if (!frozen_dn) { - location = of_get_property(eeh_dev_to_of_node(event->edev), "ibm,loc-code", NULL); - location = location ? location : "unknown"; - printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " - "for location=%s pci addr=%s\n", - location, eeh_pci_name(eeh_dev_to_pci_dev(event->edev))); - return NULL; - } - - frozen_bus = pcibios_find_pci_bus(frozen_dn); - location = of_get_property(frozen_dn, "ibm,loc-code", NULL); - location = location ? location : "unknown"; - - /* There are two different styles for coming up with the PE. - * In the old style, it was the highest EEH-capable device - * which was always an EADS pci bridge. In the new style, - * there might not be any EADS bridges, and even when there are, - * the firmware marks them as "EEH incapable". So another - * two-step is needed to find the pci bus.. - */ - if (!frozen_bus) - frozen_bus = pcibios_find_pci_bus(frozen_dn->parent); + frozen_bus = eeh_pe_bus_get(pe); if (!frozen_bus) { - printk(KERN_ERR "EEH: Cannot find PCI bus " - "for location=%s dn=%s\n", - location, frozen_dn->full_name); - return NULL; + pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->addr); + return; } - frozen_edev = of_node_to_eeh_dev(frozen_dn); - frozen_edev->freeze_count++; - pci_str = eeh_pci_name(eeh_dev_to_pci_dev(event->edev)); - drv_str = eeh_pcid_name(eeh_dev_to_pci_dev(event->edev)); - - if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES) + pe->freeze_count++; + if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) goto excess_failures; - - printk(KERN_WARNING - "EEH: This PCI device has failed %d times in the last hour:\n", - frozen_edev->freeze_count); - - if (frozen_edev->pdev) { - bus_pci_str = pci_name(frozen_edev->pdev); - bus_drv_str = eeh_pcid_name(frozen_edev->pdev); - printk(KERN_WARNING - "EEH: Bus location=%s driver=%s pci addr=%s\n", - location, bus_drv_str, bus_pci_str); - } - - printk(KERN_WARNING - "EEH: Device location=%s driver=%s pci addr=%s\n", - location, drv_str, pci_str); + pr_warning("EEH: This PCI device has failed %d times in the last hour\n", + pe->freeze_count); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs @@ -419,12 +437,12 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event) * status ... if any child can't handle the reset, then the entire * slot is dlpar removed and added. */ - pci_walk_bus(frozen_bus, eeh_report_error, &result); + eeh_pe_dev_traverse(pe, eeh_report_error, &result); /* Get the current PCI slot state. This can take a long time, * sometimes over 3 seconds for certain systems. */ - rc = eeh_ops->wait_state(eeh_dev_to_of_node(frozen_edev), MAX_WAIT_FOR_RECOVERY*1000); + rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { printk(KERN_WARNING "EEH: Permanent failure\n"); goto hard_fail; @@ -434,14 +452,14 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event) * don't post the error log until after all dev drivers * have been informed. 
*/ - eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they * go down willingly, without panicing the system. */ if (result == PCI_ERS_RESULT_NONE) { - rc = eeh_reset_device(frozen_edev, frozen_bus); + rc = eeh_reset_device(pe, frozen_bus); if (rc) { printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); goto hard_fail; @@ -450,7 +468,7 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event) /* If all devices reported they can proceed, then re-enable MMIO */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { - rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO); + rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); if (rc < 0) goto hard_fail; @@ -458,13 +476,13 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event) result = PCI_ERS_RESULT_NEED_RESET; } else { result = PCI_ERS_RESULT_NONE; - pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result); + eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result); } } /* If all devices reported they can proceed, then re-enable DMA */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { - rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA); + rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); if (rc < 0) goto hard_fail; @@ -482,13 +500,13 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event) /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { - rc = eeh_reset_device(frozen_edev, NULL); + rc = eeh_reset_device(pe, NULL); if (rc) { printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); goto hard_fail; } result = PCI_ERS_RESULT_NONE; - pci_walk_bus(frozen_bus, eeh_report_reset, &result); + eeh_pe_dev_traverse(pe, eeh_report_reset, &result); } /* All devices should claim they have recovered by now. */ @@ -499,9 +517,9 @@ struct eeh_dev *handle_eeh_events(struct eeh_event *event) } /* Tell all device drivers that they can resume operations */ - pci_walk_bus(frozen_bus, eeh_report_resume, NULL); + eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); - return frozen_edev; + return; excess_failures: /* @@ -509,30 +527,26 @@ excess_failures: * are due to poorly seated PCI cards. Only 10% or so are * due to actual, failed cards. */ - printk(KERN_ERR - "EEH: PCI device at location=%s driver=%s pci addr=%s\n" - "has failed %d times in the last hour " - "and has been permanently disabled.\n" - "Please try reseating this device or replacing it.\n", - location, drv_str, pci_str, frozen_edev->freeze_count); + pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n" + "last hour and has been permanently disabled.\n" + "Please try reseating or replacing it.\n", + pe->phb->global_number, pe->addr, + pe->freeze_count); goto perm_error; hard_fail: - printk(KERN_ERR - "EEH: Unable to recover from failure of PCI device " - "at location=%s driver=%s pci addr=%s\n" - "Please try reseating this device or replacing it.\n", - location, drv_str, pci_str); + pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); perm_error: - eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM); + eeh_slot_error_detail(pe, EEH_LOG_PERM); /* Notify all devices that they're about to go down. */ - pci_walk_bus(frozen_bus, eeh_report_failure, NULL); + eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); /* Shut down the device drivers for good. 
*/ - pcibios_remove_pci_devices(frozen_bus); - - return NULL; + if (frozen_bus) + pcibios_remove_pci_devices(frozen_bus); } diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c index fb506317ebb0..51faaac8abe6 100644 --- a/arch/powerpc/platforms/pseries/eeh_event.c +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -57,7 +57,7 @@ static int eeh_event_handler(void * dummy) { unsigned long flags; struct eeh_event *event; - struct eeh_dev *edev; + struct eeh_pe *pe; set_task_comm(current, "eehd"); @@ -76,28 +76,23 @@ static int eeh_event_handler(void * dummy) /* Serialize processing of EEH events */ mutex_lock(&eeh_event_mutex); - edev = event->edev; - eeh_mark_slot(eeh_dev_to_of_node(edev), EEH_MODE_RECOVERING); - - printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", - eeh_pci_name(edev->pdev)); + pe = event->pe; + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n", + pe->phb->global_number, pe->addr); set_current_state(TASK_INTERRUPTIBLE); /* Don't add to load average */ - edev = handle_eeh_events(event); - - if (edev) { - eeh_clear_slot(eeh_dev_to_of_node(edev), EEH_MODE_RECOVERING); - pci_dev_put(edev->pdev); - } + eeh_handle_event(pe); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); kfree(event); mutex_unlock(&eeh_event_mutex); /* If there are no new errors after an hour, clear the counter. */ - if (edev && edev->freeze_count>0) { + if (pe && pe->freeze_count > 0) { msleep_interruptible(3600*1000); - if (edev->freeze_count>0) - edev->freeze_count--; + if (pe->freeze_count > 0) + pe->freeze_count--; } @@ -119,36 +114,23 @@ static void eeh_thread_launcher(struct work_struct *dummy) /** * eeh_send_failure_event - Generate a PCI error event - * @edev: EEH device + * @pe: EEH PE * * This routine can be called within an interrupt context; * the actual event will be delivered in a normal context * (from a workqueue). */ -int eeh_send_failure_event(struct eeh_dev *edev) +int eeh_send_failure_event(struct eeh_pe *pe) { unsigned long flags; struct eeh_event *event; - struct device_node *dn = eeh_dev_to_of_node(edev); - const char *location; - - if (!mem_init_done) { - printk(KERN_ERR "EEH: event during early boot not handled\n"); - location = of_get_property(dn, "ibm,loc-code", NULL); - printk(KERN_ERR "EEH: device node = %s\n", dn->full_name); - printk(KERN_ERR "EEH: PCI location = %s\n", location); - return 1; - } - event = kmalloc(sizeof(*event), GFP_ATOMIC); - if (event == NULL) { - printk(KERN_ERR "EEH: out of memory, event not handled\n"); - return 1; - } - - if (edev->pdev) - pci_dev_get(edev->pdev); - event->edev = edev; + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (!event) { + pr_err("EEH: out of memory, event not handled\n"); + return -ENOMEM; + } + event->pe = pe; /* We may or may not be called in an interrupt context */ spin_lock_irqsave(&eeh_eventlist_lock, flags); diff --git a/arch/powerpc/platforms/pseries/eeh_pe.c b/arch/powerpc/platforms/pseries/eeh_pe.c new file mode 100644 index 000000000000..797cd181dc3f --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_pe.c @@ -0,0 +1,652 @@ +/* + * The file intends to implement PE based on the information from + * platforms. Basically, there have 3 types of PEs: PHB/Bus/Device. + * All the PEs should be organized as hierarchy tree. The first level + * of the tree will be associated to existing PHBs since the particular + * PE is only meaningful in one PHB domain. 
+ * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + +static LIST_HEAD(eeh_phb_pe); + +/** + * eeh_pe_alloc - Allocate PE + * @phb: PCI controller + * @type: PE type + * + * Allocate PE instance dynamically. + */ +static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) +{ + struct eeh_pe *pe; + + /* Allocate PHB PE */ + pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL); + if (!pe) return NULL; + + /* Initialize PHB PE */ + pe->type = type; + pe->phb = phb; + INIT_LIST_HEAD(&pe->child_list); + INIT_LIST_HEAD(&pe->child); + INIT_LIST_HEAD(&pe->edevs); + + return pe; +} + +/** + * eeh_phb_pe_create - Create PHB PE + * @phb: PCI controller + * + * The function should be called while the PHB is detected during + * system boot or PCI hotplug in order to create PHB PE. + */ +int __devinit eeh_phb_pe_create(struct pci_controller *phb) +{ + struct eeh_pe *pe; + + /* Allocate PHB PE */ + pe = eeh_pe_alloc(phb, EEH_PE_PHB); + if (!pe) { + pr_err("%s: out of memory!\n", __func__); + return -ENOMEM; + } + + /* Put it into the list */ + eeh_lock(); + list_add_tail(&pe->child, &eeh_phb_pe); + eeh_unlock(); + + pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number); + + return 0; +} + +/** + * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB + * @phb: PCI controller + * + * The overall PEs form hierarchy tree. The first layer of the + * hierarchy tree is composed of PHB PEs. The function is used + * to retrieve the corresponding PHB PE according to the given PHB. + */ +static struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb) +{ + struct eeh_pe *pe; + + list_for_each_entry(pe, &eeh_phb_pe, child) { + /* + * Actually, we needn't check the type since + * the PE for PHB has been determined when that + * was created. + */ + if ((pe->type & EEH_PE_PHB) && pe->phb == phb) + return pe; + } + + return NULL; +} + +/** + * eeh_pe_next - Retrieve the next PE in the tree + * @pe: current PE + * @root: root PE + * + * The function is used to retrieve the next PE in the + * hierarchy PE tree. 
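The code below manipulates pe->type, pe->phb, pe->parent, pe->child_list, pe->child and pe->edevs, but the structure itself is defined in a header that is not part of this diff. A hedged reconstruction of the fields this file appears to rely on; ordering and comments are guesses for readability only.

/* Hedged reconstruction of struct eeh_pe as used by eeh_pe.c; the real
 * definition lives in a header outside this diff. */
#include <linux/list.h>

struct pci_controller;

struct eeh_pe {
        int type;                       /* EEH_PE_PHB / EEH_PE_BUS / EEH_PE_DEVICE */
        int state;                      /* EEH_PE_RECOVERING, ... */
        int config_addr;                /* traditional BDF config address */
        int addr;                       /* PE configuration address */
        int check_count;                /* times this PE has been checked */
        int freeze_count;               /* freezes seen in the last hour */
        struct pci_controller *phb;     /* owning PHB */
        struct eeh_pe *parent;          /* parent PE in the hierarchy */
        struct list_head child_list;    /* head of this PE's children */
        struct list_head child;         /* link in parent's child_list (or eeh_phb_pe) */
        struct list_head edevs;         /* EEH devices attached to this PE */
};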
+ */ +static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, + struct eeh_pe *root) +{ + struct list_head *next = pe->child_list.next; + + if (next == &pe->child_list) { + while (1) { + if (pe == root) + return NULL; + next = pe->child.next; + if (next != &pe->parent->child_list) + break; + pe = pe->parent; + } + } + + return list_entry(next, struct eeh_pe, child); +} + +/** + * eeh_pe_traverse - Traverse PEs in the specified PHB + * @root: root PE + * @fn: callback + * @flag: extra parameter to callback + * + * The function is used to traverse the specified PE and its + * child PEs. The traversing is to be terminated once the + * callback returns something other than NULL, or no more PEs + * to be traversed. + */ +static void *eeh_pe_traverse(struct eeh_pe *root, + eeh_traverse_func fn, void *flag) +{ + struct eeh_pe *pe; + void *ret; + + for (pe = root; pe; pe = eeh_pe_next(pe, root)) { + ret = fn(pe, flag); + if (ret) return ret; + } + + return NULL; +} + +/** + * eeh_pe_dev_traverse - Traverse the devices from the PE + * @root: EEH PE + * @fn: function callback + * @flag: extra parameter to callback + * + * The function is used to traverse the devices of the specified + * PE and its child PEs. + */ +void *eeh_pe_dev_traverse(struct eeh_pe *root, + eeh_traverse_func fn, void *flag) +{ + struct eeh_pe *pe; + struct eeh_dev *edev; + void *ret; + + if (!root) { + pr_warning("%s: Invalid PE %p\n", __func__, root); + return NULL; + } + + eeh_lock(); + + /* Traverse root PE */ + for (pe = root; pe; pe = eeh_pe_next(pe, root)) { + eeh_pe_for_each_dev(pe, edev) { + ret = fn(edev, flag); + if (ret) { + eeh_unlock(); + return ret; + } + } + } + + eeh_unlock(); + + return NULL; +} + +/** + * __eeh_pe_get - Check the PE address + * @data: EEH PE + * @flag: EEH device + * + * For one particular PE, it can be identified by PE address + * or tranditional BDF address. BDF address is composed of + * Bus/Device/Function number. The extra data referred by flag + * indicates which type of address should be used. + */ +static void *__eeh_pe_get(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + struct eeh_dev *edev = (struct eeh_dev *)flag; + + /* Unexpected PHB PE */ + if (pe->type & EEH_PE_PHB) + return NULL; + + /* We prefer PE address */ + if (edev->pe_config_addr && + (edev->pe_config_addr == pe->addr)) + return pe; + + /* Try BDF address */ + if (edev->pe_config_addr && + (edev->config_addr == pe->config_addr)) + return pe; + + return NULL; +} + +/** + * eeh_pe_get - Search PE based on the given address + * @edev: EEH device + * + * Search the corresponding PE based on the specified address which + * is included in the eeh device. The function is used to check if + * the associated PE has been created against the PE address. It's + * notable that the PE address has 2 format: traditional PE address + * which is composed of PCI bus/device/function number, or unified + * PE address. + */ +static struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) +{ + struct eeh_pe *root = eeh_phb_pe_get(edev->phb); + struct eeh_pe *pe; + + pe = eeh_pe_traverse(root, __eeh_pe_get, edev); + + return pe; +} + +/** + * eeh_pe_get_parent - Retrieve the parent PE + * @edev: EEH device + * + * The whole PEs existing in the system are organized as hierarchy + * tree. The function is used to retrieve the parent PE according + * to the parent EEH device. 
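eeh_pe_dev_traverse() above returns the first non-NULL value produced by the callback, which is how later callers short-circuit a walk. A minimal usage sketch with an invented callback that simply counts the devices under a PE:

/* Illustrative caller only: count_dev() is an invented callback showing the
 * traversal contract -- return NULL to keep walking, non-NULL to stop and
 * have that value bubble back out of eeh_pe_dev_traverse(). */
static void *count_dev(void *data, void *flag)
{
        int *cnt = flag;

        (*cnt)++;
        return NULL;            /* never stop early, just count */
}

static int eeh_pe_dev_count(struct eeh_pe *pe)
{
        int cnt = 0;

        eeh_pe_dev_traverse(pe, count_dev, &cnt);
        return cnt;
}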
+ */ +static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) +{ + struct device_node *dn; + struct eeh_dev *parent; + + /* + * It might have the case for the indirect parent + * EEH device already having associated PE, but + * the direct parent EEH device doesn't have yet. + */ + dn = edev->dn->parent; + while (dn) { + /* We're poking out of PCI territory */ + if (!PCI_DN(dn)) return NULL; + + parent = of_node_to_eeh_dev(dn); + /* We're poking out of PCI territory */ + if (!parent) return NULL; + + if (parent->pe) + return parent->pe; + + dn = dn->parent; + } + + return NULL; +} + +/** + * eeh_add_to_parent_pe - Add EEH device to parent PE + * @edev: EEH device + * + * Add EEH device to the parent PE. If the parent PE already + * exists, the PE type will be changed to EEH_PE_BUS. Otherwise, + * we have to create new PE to hold the EEH device and the new + * PE will be linked to its parent PE as well. + */ +int eeh_add_to_parent_pe(struct eeh_dev *edev) +{ + struct eeh_pe *pe, *parent; + + eeh_lock(); + + /* + * Search the PE has been existing or not according + * to the PE address. If that has been existing, the + * PE should be composed of PCI bus and its subordinate + * components. + */ + pe = eeh_pe_get(edev); + if (pe && !(pe->type & EEH_PE_INVALID)) { + if (!edev->pe_config_addr) { + eeh_unlock(); + pr_err("%s: PE with addr 0x%x already exists\n", + __func__, edev->config_addr); + return -EEXIST; + } + + /* Mark the PE as type of PCI bus */ + pe->type = EEH_PE_BUS; + edev->pe = pe; + + /* Put the edev to PE */ + list_add_tail(&edev->list, &pe->edevs); + eeh_unlock(); + pr_debug("EEH: Add %s to Bus PE#%x\n", + edev->dn->full_name, pe->addr); + + return 0; + } else if (pe && (pe->type & EEH_PE_INVALID)) { + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + /* + * We're running to here because of PCI hotplug caused by + * EEH recovery. We need clear EEH_PE_INVALID until the top. + */ + parent = pe; + while (parent) { + if (!(parent->type & EEH_PE_INVALID)) + break; + parent->type &= ~EEH_PE_INVALID; + parent = parent->parent; + } + eeh_unlock(); + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; + } + + /* Create a new EEH PE */ + pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); + if (!pe) { + eeh_unlock(); + pr_err("%s: out of memory!\n", __func__); + return -ENOMEM; + } + pe->addr = edev->pe_config_addr; + pe->config_addr = edev->config_addr; + + /* + * Put the new EEH PE into hierarchy tree. If the parent + * can't be found, the newly created PE will be attached + * to PHB directly. Otherwise, we have to associate the + * PE with its parent. + */ + parent = eeh_pe_get_parent(edev); + if (!parent) { + parent = eeh_phb_pe_get(edev->phb); + if (!parent) { + eeh_unlock(); + pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", + __func__, edev->phb->global_number); + edev->pe = NULL; + kfree(pe); + return -EEXIST; + } + } + pe->parent = parent; + + /* + * Put the newly created PE into the child list and + * link the EEH device accordingly. + */ + list_add_tail(&pe->child, &parent->child_list); + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + eeh_unlock(); + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; +} + +/** + * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE + * @edev: EEH device + * @purge_pe: remove PE or not + * + * The PE hierarchy tree might be changed when doing PCI hotplug. 
+ * Also, the PCI devices or buses could be removed from the system + * during EEH recovery. So we have to call the function remove the + * corresponding PE accordingly if necessary. + */ +int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe) +{ + struct eeh_pe *pe, *parent, *child; + int cnt; + + if (!edev->pe) { + pr_warning("%s: No PE found for EEH device %s\n", + __func__, edev->dn->full_name); + return -EEXIST; + } + + eeh_lock(); + + /* Remove the EEH device */ + pe = edev->pe; + edev->pe = NULL; + list_del(&edev->list); + + /* + * Check if the parent PE includes any EEH devices. + * If not, we should delete that. Also, we should + * delete the parent PE if it doesn't have associated + * child PEs and EEH devices. + */ + while (1) { + parent = pe->parent; + if (pe->type & EEH_PE_PHB) + break; + + if (purge_pe) { + if (list_empty(&pe->edevs) && + list_empty(&pe->child_list)) { + list_del(&pe->child); + kfree(pe); + } else { + break; + } + } else { + if (list_empty(&pe->edevs)) { + cnt = 0; + list_for_each_entry(child, &pe->child_list, child) { + if (!(pe->type & EEH_PE_INVALID)) { + cnt++; + break; + } + } + + if (!cnt) + pe->type |= EEH_PE_INVALID; + else + break; + } + } + + pe = parent; + } + + eeh_unlock(); + + return 0; +} + +/** + * __eeh_pe_state_mark - Mark the state for the PE + * @data: EEH PE + * @flag: state + * + * The function is used to mark the indicated state for the given + * PE. Also, the associated PCI devices will be put into IO frozen + * state as well. + */ +static void *__eeh_pe_state_mark(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int state = *((int *)flag); + struct eeh_dev *tmp; + struct pci_dev *pdev; + + /* + * Mark the PE with the indicated state. Also, + * the associated PCI device will be put into + * I/O frozen state to avoid I/O accesses from + * the PCI device driver. + */ + pe->state |= state; + eeh_pe_for_each_dev(pe, tmp) { + pdev = eeh_dev_to_pci_dev(tmp); + if (pdev) + pdev->error_state = pci_channel_io_frozen; + } + + return NULL; +} + +/** + * eeh_pe_state_mark - Mark specified state for PE and its associated device + * @pe: EEH PE + * + * EEH error affects the current PE and its child PEs. The function + * is used to mark appropriate state for the affected PEs and the + * associated devices. + */ +void eeh_pe_state_mark(struct eeh_pe *pe, int state) +{ + eeh_lock(); + eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); + eeh_unlock(); +} + +/** + * __eeh_pe_state_clear - Clear state for the PE + * @data: EEH PE + * @flag: state + * + * The function is used to clear the indicated state from the + * given PE. Besides, we also clear the check count of the PE + * as well. + */ +static void *__eeh_pe_state_clear(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int state = *((int *)flag); + + pe->state &= ~state; + pe->check_count = 0; + + return NULL; +} + +/** + * eeh_pe_state_clear - Clear state for the PE and its children + * @pe: PE + * @state: state to be cleared + * + * When the PE and its children has been recovered from error, + * we need clear the error state for that. The function is used + * for the purpose. 
+ */ +void eeh_pe_state_clear(struct eeh_pe *pe, int state) +{ + eeh_lock(); + eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); + eeh_unlock(); +} + +/** + * eeh_restore_one_device_bars - Restore the Base Address Registers for one device + * @data: EEH device + * @flag: Unused + * + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static void *eeh_restore_one_device_bars(void *data, void *flag) +{ + int i; + u32 cmd; + struct eeh_dev *edev = (struct eeh_dev *)data; + struct device_node *dn = eeh_dev_to_of_node(edev); + + for (i = 4; i < 10; i++) + eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); + /* 12 == Expansion ROM Address */ + eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]); + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)]) + + eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + + /* max latency, min grant, interrupt pin and line */ + eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); + + /* + * Restore PERR & SERR bits, some devices require it, + * don't touch the other command bits + */ + eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd); + if (edev->config_space[1] & PCI_COMMAND_PARITY) + cmd |= PCI_COMMAND_PARITY; + else + cmd &= ~PCI_COMMAND_PARITY; + if (edev->config_space[1] & PCI_COMMAND_SERR) + cmd |= PCI_COMMAND_SERR; + else + cmd &= ~PCI_COMMAND_SERR; + eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd); + + return NULL; +} + +/** + * eeh_pe_restore_bars - Restore the PCI config space info + * @pe: EEH PE + * + * This routine performs a recursive walk to the children + * of this device as well. + */ +void eeh_pe_restore_bars(struct eeh_pe *pe) +{ + /* + * We needn't take the EEH lock since eeh_pe_dev_traverse() + * will take that. + */ + eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL); +} + +/** + * eeh_pe_bus_get - Retrieve PCI bus according to the given PE + * @pe: EEH PE + * + * Retrieve the PCI bus according to the given PE. Basically, + * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the + * primary PCI bus will be retrieved. The parent bus will be + * returned for BUS PE. However, we don't have associated PCI + * bus for DEVICE PE. + */ +struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) +{ + struct pci_bus *bus = NULL; + struct eeh_dev *edev; + struct pci_dev *pdev; + + eeh_lock(); + + if (pe->type & EEH_PE_PHB) { + bus = pe->phb->bus; + } else if (pe->type & EEH_PE_BUS) { + edev = list_first_entry(&pe->edevs, struct eeh_dev, list); + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + bus = pdev->bus; + } + + eeh_unlock(); + + return bus; +} diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c index c33360ec4f4f..19506f935737 100644 --- a/arch/powerpc/platforms/pseries/eeh_pseries.c +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c @@ -129,27 +129,117 @@ static int pseries_eeh_init(void) eeh_error_buf_size = RTAS_ERROR_LOG_MAX; } + /* Set EEH probe mode */ + eeh_probe_mode_set(EEH_PROBE_MODE_DEVTREE); + return 0; } /** + * pseries_eeh_of_probe - EEH probe on the given device + * @dn: OF node + * @flag: Unused + * + * When EEH module is installed during system boot, all PCI devices + * are checked one by one to see if it supports EEH. 
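eeh_restore_one_device_bars() below assumes edev->config_space[] holds the first 16 configuration-space dwords captured at probe time. The save side lives in eeh.c and is not shown in this section; a hedged approximation of what it presumably does:

/* Hedged sketch of the save side that BAR restoration depends on: capture
 * the first 16 config-space dwords at probe time so BARs, cache line size,
 * latency timer and command bits can be replayed after a reset.  The real
 * helper (eeh_save_bars() in eeh.c) is not shown in this diff. */
static void eeh_save_bars_sketch(struct eeh_dev *edev)
{
        int i;
        struct device_node *dn = eeh_dev_to_of_node(edev);

        for (i = 0; i < 16; i++)
                eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]);
}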
The function + * is introduced for the purpose. + */ +static void *pseries_eeh_of_probe(struct device_node *dn, void *flag) +{ + struct eeh_dev *edev; + struct eeh_pe pe; + const u32 *class_code, *vendor_id, *device_id; + const u32 *regs; + int enable = 0; + int ret; + + /* Retrieve OF node and eeh device */ + edev = of_node_to_eeh_dev(dn); + if (!of_device_is_available(dn)) + return NULL; + + /* Retrieve class/vendor/device IDs */ + class_code = of_get_property(dn, "class-code", NULL); + vendor_id = of_get_property(dn, "vendor-id", NULL); + device_id = of_get_property(dn, "device-id", NULL); + + /* Skip for bad OF node or PCI-ISA bridge */ + if (!class_code || !vendor_id || !device_id) + return NULL; + if (dn->type && !strcmp(dn->type, "isa")) + return NULL; + + /* Update class code and mode of eeh device */ + edev->class_code = *class_code; + edev->mode = 0; + + /* Retrieve the device address */ + regs = of_get_property(dn, "reg", NULL); + if (!regs) { + pr_warning("%s: OF node property %s::reg not found\n", + __func__, dn->full_name); + return NULL; + } + + /* Initialize the fake PE */ + memset(&pe, 0, sizeof(struct eeh_pe)); + pe.phb = edev->phb; + pe.config_addr = regs[0]; + + /* Enable EEH on the device */ + ret = eeh_ops->set_option(&pe, EEH_OPT_ENABLE); + if (!ret) { + edev->config_addr = regs[0]; + /* Retrieve PE address */ + edev->pe_config_addr = eeh_ops->get_pe_addr(&pe); + pe.addr = edev->pe_config_addr; + + /* Some older systems (Power4) allow the ibm,set-eeh-option + * call to succeed even on nodes where EEH is not supported. + * Verify support explicitly. + */ + ret = eeh_ops->get_state(&pe, NULL); + if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT) + enable = 1; + + if (enable) { + eeh_subsystem_enabled = 1; + eeh_add_to_parent_pe(edev); + + pr_debug("%s: EEH enabled on %s PHB#%d-PE#%x, config addr#%x\n", + __func__, dn->full_name, pe.phb->global_number, + pe.addr, pe.config_addr); + } else if (dn->parent && of_node_to_eeh_dev(dn->parent) && + (of_node_to_eeh_dev(dn->parent))->pe) { + /* This device doesn't support EEH, but it may have an + * EEH parent, in which case we mark it as supported. + */ + edev->config_addr = of_node_to_eeh_dev(dn->parent)->config_addr; + edev->pe_config_addr = of_node_to_eeh_dev(dn->parent)->pe_config_addr; + eeh_add_to_parent_pe(edev); + } + } + + /* Save memory bars */ + eeh_save_bars(edev); + + return NULL; +} + +/** * pseries_eeh_set_option - Initialize EEH or MMIO/DMA reenable - * @dn: device node + * @pe: EEH PE * @option: operation to be issued * * The function is used to control the EEH functionality globally. 
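pseries_eeh_of_probe() is only the per-node hook; with eeh_probe_mode set to EEH_PROBE_MODE_DEVTREE, the EEH core is expected to apply eeh_ops->of_probe to every device node under each PHB. The walk itself is not part of this hunk, so the recursion below is purely illustrative:

/* Hedged sketch of the caller side: apply eeh_ops->of_probe to a device
 * node and all of its children.  The real walk is done by the EEH core,
 * not by this file. */
static void eeh_of_probe_tree(struct device_node *dn, void *flag)
{
        struct device_node *child;

        if (eeh_ops->of_probe)
                eeh_ops->of_probe(dn, flag);

        for_each_child_of_node(dn, child)
                eeh_of_probe_tree(child, flag);
}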
* Currently, following options are support according to PAPR: * Enable EEH, Disable EEH, Enable MMIO and Enable DMA */ -static int pseries_eeh_set_option(struct device_node *dn, int option) +static int pseries_eeh_set_option(struct eeh_pe *pe, int option) { int ret = 0; - struct eeh_dev *edev; - const u32 *reg; int config_addr; - edev = of_node_to_eeh_dev(dn); - /* * When we're enabling or disabling EEH functioality on * the particular PE, the PE config address is possibly @@ -159,15 +249,11 @@ static int pseries_eeh_set_option(struct device_node *dn, int option) switch (option) { case EEH_OPT_DISABLE: case EEH_OPT_ENABLE: - reg = of_get_property(dn, "reg", NULL); - config_addr = reg[0]; - break; - case EEH_OPT_THAW_MMIO: case EEH_OPT_THAW_DMA: - config_addr = edev->config_addr; - if (edev->pe_config_addr) - config_addr = edev->pe_config_addr; + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; break; default: @@ -177,15 +263,15 @@ static int pseries_eeh_set_option(struct device_node *dn, int option) } ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, - config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid), option); + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); return ret; } /** * pseries_eeh_get_pe_addr - Retrieve PE address - * @dn: device node + * @pe: EEH PE * * Retrieve the assocated PE address. Actually, there're 2 RTAS * function calls dedicated for the purpose. We need implement @@ -196,14 +282,11 @@ static int pseries_eeh_set_option(struct device_node *dn, int option) * It's notable that zero'ed return value means invalid PE config * address. */ -static int pseries_eeh_get_pe_addr(struct device_node *dn) +static int pseries_eeh_get_pe_addr(struct eeh_pe *pe) { - struct eeh_dev *edev; int ret = 0; int rets[3]; - edev = of_node_to_eeh_dev(dn); - if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { /* * First of all, we need to make sure there has one PE @@ -211,18 +294,18 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn) * meaningless. 
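Every RTAS call in this file splits the PHB's 64-bit bus unit ID with BUID_HI()/BUID_LO() because RTAS arguments are 32-bit cells. A stand-alone illustration; the macro bodies here are assumptions matching their obvious intent rather than copies of the kernel headers:

/* Stand-alone illustration of the BUID split used in the RTAS calls above. */
#include <stdint.h>
#include <stdio.h>

#define BUID_HI(buid)   ((uint32_t)((buid) >> 32))
#define BUID_LO(buid)   ((uint32_t)((buid) & 0xffffffffULL))

int main(void)
{
        uint64_t buid = 0x800000020000001ULL;   /* made-up PHB BUID */

        printf("hi=0x%08x lo=0x%08x\n", BUID_HI(buid), BUID_LO(buid));
        return 0;
}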
*/ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, - edev->config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid), 1); + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 1); if (ret || (rets[0] == 0)) return 0; /* Retrieve the associated PE config address */ ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, - edev->config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid), 0); + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 0); if (ret) { - pr_warning("%s: Failed to get PE address for %s\n", - __func__, dn->full_name); + pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->config_addr); return 0; } @@ -231,11 +314,11 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn) if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, - edev->config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid), 0); + pe->config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), 0); if (ret) { - pr_warning("%s: Failed to get PE address for %s\n", - __func__, dn->full_name); + pr_warning("%s: Failed to get address for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->config_addr); return 0; } @@ -247,7 +330,7 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn) /** * pseries_eeh_get_state - Retrieve PE state - * @dn: PE associated device node + * @pe: EEH PE * @state: return value * * Retrieve the state of the specified PE. On RTAS compliant @@ -258,30 +341,28 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn) * RTAS calls for the purpose, we need to try the new one and back * to the old one if the new one couldn't work properly. */ -static int pseries_eeh_get_state(struct device_node *dn, int *state) +static int pseries_eeh_get_state(struct eeh_pe *pe, int *state) { - struct eeh_dev *edev; int config_addr; int ret; int rets[4]; int result; /* Figure out PE config address if possible */ - edev = of_node_to_eeh_dev(dn); - config_addr = edev->config_addr; - if (edev->pe_config_addr) - config_addr = edev->pe_config_addr; + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets, - config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid)); + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); } else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) { /* Fake PE unavailable info */ rets[2] = 0; ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets, - config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid)); + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); } else { return EEH_STATE_NOT_SUPPORT; } @@ -333,34 +414,32 @@ static int pseries_eeh_get_state(struct device_node *dn, int *state) /** * pseries_eeh_reset - Reset the specified PE - * @dn: PE associated device node + * @pe: EEH PE * @option: reset option * * Reset the specified PE */ -static int pseries_eeh_reset(struct device_node *dn, int option) +static int pseries_eeh_reset(struct eeh_pe *pe, int option) { - struct eeh_dev *edev; int config_addr; int ret; /* Figure out PE address */ - edev = of_node_to_eeh_dev(dn); - config_addr = edev->config_addr; - if (edev->pe_config_addr) - config_addr = edev->pe_config_addr; + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; /* Reset PE through RTAS call */ ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, 
- config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid), option); + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), option); /* If fundamental-reset not supported, try hot-reset */ if (option == EEH_RESET_FUNDAMENTAL && ret == -8) { ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL, - config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid), EEH_RESET_HOT); + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid), EEH_RESET_HOT); } return ret; @@ -368,13 +447,13 @@ static int pseries_eeh_reset(struct device_node *dn, int option) /** * pseries_eeh_wait_state - Wait for PE state - * @dn: PE associated device node + * @pe: EEH PE * @max_wait: maximal period in microsecond * * Wait for the state of associated PE. It might take some time * to retrieve the PE's state. */ -static int pseries_eeh_wait_state(struct device_node *dn, int max_wait) +static int pseries_eeh_wait_state(struct eeh_pe *pe, int max_wait) { int ret; int mwait; @@ -391,7 +470,7 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait) #define EEH_STATE_MAX_WAIT_TIME (300 * 1000) while (1) { - ret = pseries_eeh_get_state(dn, &mwait); + ret = pseries_eeh_get_state(pe, &mwait); /* * If the PE's state is temporarily unavailable, @@ -426,7 +505,7 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait) /** * pseries_eeh_get_log - Retrieve error log - * @dn: device node + * @pe: EEH PE * @severity: temporary or permanent error log * @drv_log: driver log to be combined with retrieved error log * @len: length of driver log @@ -435,24 +514,22 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait) * Actually, the error will be retrieved through the dedicated * RTAS call. */ -static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_log, unsigned long len) +static int pseries_eeh_get_log(struct eeh_pe *pe, int severity, char *drv_log, unsigned long len) { - struct eeh_dev *edev; int config_addr; unsigned long flags; int ret; - edev = of_node_to_eeh_dev(dn); spin_lock_irqsave(&slot_errbuf_lock, flags); memset(slot_errbuf, 0, eeh_error_buf_size); /* Figure out the PE address */ - config_addr = edev->config_addr; - if (edev->pe_config_addr) - config_addr = edev->pe_config_addr; + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr, - BUID_HI(edev->phb->buid), BUID_LO(edev->phb->buid), + BUID_HI(pe->phb->buid), BUID_LO(pe->phb->buid), virt_to_phys(drv_log), len, virt_to_phys(slot_errbuf), eeh_error_buf_size, severity); @@ -465,40 +542,38 @@ static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_l /** * pseries_eeh_configure_bridge - Configure PCI bridges in the indicated PE - * @dn: PE associated device node + * @pe: EEH PE * * The function will be called to reconfigure the bridges included * in the specified PE so that the mulfunctional PE would be recovered * again. 
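pseries_eeh_wait_state() polls pseries_eeh_get_state() until the PE stops reporting itself as temporarily unavailable, sleeping for the interval the firmware suggests and giving up once the overall budget is spent; most of that loop falls between the hunks above. A stand-alone sketch of the pattern with invented scaffolding:

/* Stand-alone sketch of the polling pattern: retry while the state is
 * unavailable, sleep for the firmware-suggested interval (clamped), stop
 * when the max_wait budget runs out.  fake_get_state() is test scaffolding. */
#include <stdio.h>

#define STATE_UNAVAILABLE       (-1)
#define MIN_WAIT                1000            /* ms */
#define MAX_WAIT                (300 * 1000)

static int fake_get_state(int *mwait, int *calls)
{
        *mwait = 2000;                          /* firmware-suggested retry interval */
        return (*calls)++ < 3 ? STATE_UNAVAILABLE : 0x5 /* pretend "active" */;
}

static int wait_state(int max_wait)
{
        int state, mwait, calls = 0;

        while (1) {
                state = fake_get_state(&mwait, &calls);
                if (state != STATE_UNAVAILABLE)
                        return state;
                if (max_wait <= 0)
                        return STATE_UNAVAILABLE;
                if (mwait <= 0)
                        mwait = MIN_WAIT;
                else if (mwait > MAX_WAIT)
                        mwait = MAX_WAIT;
                max_wait -= mwait;
                /* msleep(mwait) in the kernel; elided here */
        }
}

int main(void)
{
        printf("final state = 0x%x\n", wait_state(MAX_WAIT));
        return 0;
}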
*/ -static int pseries_eeh_configure_bridge(struct device_node *dn) +static int pseries_eeh_configure_bridge(struct eeh_pe *pe) { - struct eeh_dev *edev; int config_addr; int ret; /* Figure out the PE address */ - edev = of_node_to_eeh_dev(dn); - config_addr = edev->config_addr; - if (edev->pe_config_addr) - config_addr = edev->pe_config_addr; + config_addr = pe->config_addr; + if (pe->addr) + config_addr = pe->addr; /* Use new configure-pe function, if supported */ if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) { ret = rtas_call(ibm_configure_pe, 3, 1, NULL, - config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid)); + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); } else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) { ret = rtas_call(ibm_configure_bridge, 3, 1, NULL, - config_addr, BUID_HI(edev->phb->buid), - BUID_LO(edev->phb->buid)); + config_addr, BUID_HI(pe->phb->buid), + BUID_LO(pe->phb->buid)); } else { return -EFAULT; } if (ret) - pr_warning("%s: Unable to configure bridge %d for %s\n", - __func__, ret, dn->full_name); + pr_warning("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n", + __func__, pe->phb->global_number, pe->addr, ret); return ret; } @@ -542,6 +617,8 @@ static int pseries_eeh_write_config(struct device_node *dn, int where, int size, static struct eeh_ops pseries_eeh_ops = { .name = "pseries", .init = pseries_eeh_init, + .of_probe = pseries_eeh_of_probe, + .dev_probe = NULL, .set_option = pseries_eeh_set_option, .get_pe_addr = pseries_eeh_get_pe_addr, .get_state = pseries_eeh_get_state, @@ -559,7 +636,21 @@ static struct eeh_ops pseries_eeh_ops = { * EEH initialization on pseries platform. This function should be * called before any EEH related functions. */ -int __init eeh_pseries_init(void) +static int __init eeh_pseries_init(void) { - return eeh_ops_register(&pseries_eeh_ops); + int ret = -EINVAL; + + if (!machine_is(pseries)) + return ret; + + ret = eeh_ops_register(&pseries_eeh_ops); + if (!ret) + pr_info("EEH: pSeries platform initialized\n"); + else + pr_info("EEH: pSeries platform initialization failure (%d)\n", + ret); + + return ret; } + +early_initcall(eeh_pseries_init); diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c index 243b3510d70f..d37708360f2e 100644 --- a/arch/powerpc/platforms/pseries/eeh_sysfs.c +++ b/arch/powerpc/platforms/pseries/eeh_sysfs.c @@ -53,9 +53,6 @@ static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); EEH_SHOW_ATTR(eeh_mode, mode, "0x%x"); EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x"); EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x"); -EEH_SHOW_ATTR(eeh_check_count, check_count, "%d" ); -EEH_SHOW_ATTR(eeh_freeze_count, freeze_count, "%d" ); -EEH_SHOW_ATTR(eeh_false_positives, false_positives, "%d" ); void eeh_sysfs_add_device(struct pci_dev *pdev) { @@ -64,9 +61,6 @@ void eeh_sysfs_add_device(struct pci_dev *pdev) rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_check_count); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_false_positives); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_freeze_count); if (rc) printk(KERN_WARNING "EEH: Unable to create sysfs entries\n"); @@ -77,8 +71,5 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev) device_remove_file(&pdev->dev, &dev_attr_eeh_mode); device_remove_file(&pdev->dev, 
&dev_attr_eeh_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); - device_remove_file(&pdev->dev, &dev_attr_eeh_check_count); - device_remove_file(&pdev->dev, &dev_attr_eeh_false_positives); - device_remove_file(&pdev->dev, &dev_attr_eeh_freeze_count); } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 11d8e0544ac0..dc0a035e63bb 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -77,7 +77,8 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz { unsigned long start, start_pfn; struct zone *zone; - int ret; + int i, ret; + int sections_to_remove; start_pfn = base >> PAGE_SHIFT; @@ -97,9 +98,13 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz * to sysfs "state" file and we can't remove sysfs entries * while writing to it. So we have to defer it to here. */ - ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT); - if (ret) - return ret; + sections_to_remove = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION; + for (i = 0; i < sections_to_remove; i++) { + unsigned long pfn = start_pfn + i * PAGES_PER_SECTION; + ret = __remove_pages(zone, start_pfn, PAGES_PER_SECTION); + if (ret) + return ret; + } /* * Update memory regions for memory remove diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index bca220f2873c..6153eea27ce7 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -28,6 +28,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/memblock.h> #include <linux/spinlock.h> #include <linux/sched.h> /* for show_stack */ #include <linux/string.h> @@ -41,7 +42,6 @@ #include <asm/iommu.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> -#include <asm/abs_addr.h> #include <asm/pSeries_reconfig.h> #include <asm/firmware.h> #include <asm/tce.h> @@ -99,7 +99,7 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + rpn = __pa(uaddr) >> TCE_SHIFT; *tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; uaddr += TCE_PAGE_SIZE; @@ -148,7 +148,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, int ret = 0; long tcenum_start = tcenum, npages_start = npages; - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + rpn = __pa(uaddr) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -217,7 +217,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, __get_cpu_var(tce_page) = tcep; } - rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + rpn = __pa(uaddr) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -237,7 +237,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, rc = plpar_tce_put_indirect((u64)tbl->it_index, (u64)tcenum << 12, - (u64)virt_to_abs(tcep), + (u64)__pa(tcep), limit); npages -= limit; @@ -441,7 +441,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, rc = plpar_tce_put_indirect(liobn, dma_offset, - (u64)virt_to_abs(tcep), + (u64)__pa(tcep), limit); num_tce -= limit; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 5f3ef876ded2..0da39fed355a 100644 --- 
a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -31,7 +31,6 @@ #include <asm/page.h> #include <asm/pgtable.h> #include <asm/machdep.h> -#include <asm/abs_addr.h> #include <asm/mmu_context.h> #include <asm/iommu.h> #include <asm/tlbflush.h> @@ -108,9 +107,9 @@ void vpa_init(int cpu) } static long pSeries_lpar_hpte_insert(unsigned long hpte_group, - unsigned long va, unsigned long pa, - unsigned long rflags, unsigned long vflags, - int psize, int ssize) + unsigned long vpn, unsigned long pa, + unsigned long rflags, unsigned long vflags, + int psize, int ssize) { unsigned long lpar_rc; unsigned long flags; @@ -118,11 +117,11 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long hpte_v, hpte_r; if (!(vflags & HPTE_V_BOLTED)) - pr_devel("hpte_insert(group=%lx, va=%016lx, pa=%016lx, " - "rflags=%lx, vflags=%lx, psize=%d)\n", - hpte_group, va, pa, rflags, vflags, psize); + pr_devel("hpte_insert(group=%lx, vpn=%016lx, " + "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, vpn, pa, rflags, vflags, psize); - hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; + hpte_v = hpte_encode_v(vpn, psize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize) | rflags; if (!(vflags & HPTE_V_BOLTED)) @@ -227,22 +226,6 @@ static void pSeries_lpar_hptab_clear(void) } /* - * This computes the AVPN and B fields of the first dword of a HPTE, - * for use when we want to match an existing PTE. The bottom 7 bits - * of the returned value are zero. - */ -static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, - int ssize) -{ - unsigned long v; - - v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm); - v <<= HPTE_V_AVPN_SHIFT; - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; - return v; -} - -/* * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and * the low 3 bits of flags happen to line up. So no transform is needed. 
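Returning briefly to the hotplug-memory.c hunk further up: a memblock is now torn down one memory section at a time, with the per-iteration pfn being what each __remove_pages() call operates on. A stand-alone sketch of that arithmetic with illustrative constants:

/* Stand-alone arithmetic behind per-section removal: split a memblock into
 * memory sections and remove each section starting at its own pfn.  The
 * constants are placeholders, only the loop shape mirrors the hunk. */
#include <stdio.h>

#define PAGE_SHIFT              12
#define PAGES_PER_SECTION       (1UL << 12)     /* placeholder: 16MB sections, 4K pages */

int main(void)
{
        unsigned long base = 0x10000000UL;              /* made-up memblock base */
        unsigned long memblock_size = 0x10000000UL;     /* 256MB */
        unsigned long start_pfn = base >> PAGE_SHIFT;
        unsigned long sections = (memblock_size >> PAGE_SHIFT) / PAGES_PER_SECTION;

        for (unsigned long i = 0; i < sections; i++) {
                unsigned long pfn = start_pfn + i * PAGES_PER_SECTION;
                /* __remove_pages(zone, pfn, PAGES_PER_SECTION) in the kernel */
                printf("remove section %lu: pfn 0x%lx..0x%lx\n",
                       i, pfn, pfn + PAGES_PER_SECTION - 1);
        }
        return 0;
}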
* We can probably optimize here and assume the high bits of newpp are @@ -250,14 +233,14 @@ static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, */ static long pSeries_lpar_hpte_updatepp(unsigned long slot, unsigned long newpp, - unsigned long va, + unsigned long vpn, int psize, int ssize, int local) { unsigned long lpar_rc; unsigned long flags = (newpp & 7) | H_AVPN; unsigned long want_v; - want_v = hpte_encode_avpn(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", want_v, slot, flags, psize); @@ -295,15 +278,15 @@ static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot) return dword0; } -static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize) +static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) { unsigned long hash; unsigned long i; long slot; unsigned long want_v, hpte_v; - hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(va, psize, ssize); + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); /* Bolted entries are always in the primary group */ slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; @@ -323,12 +306,13 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - unsigned long lpar_rc, slot, vsid, va, flags; + unsigned long vpn; + unsigned long lpar_rc, slot, vsid, flags; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = pSeries_lpar_hpte_find(va, psize, ssize); + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); flags = newpp & 7; @@ -337,17 +321,17 @@ static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, BUG_ON(lpar_rc != H_SUCCESS); } -static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, +static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn, int psize, int ssize, int local) { unsigned long want_v; unsigned long lpar_rc; unsigned long dummy1, dummy2; - pr_devel(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n", - slot, va, psize, local); + pr_devel(" inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n", + slot, vpn, psize, local); - want_v = hpte_encode_avpn(va, psize, ssize); + want_v = hpte_encode_avpn(vpn, psize, ssize); lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2); if (lpar_rc == H_NOT_FOUND) return; @@ -358,15 +342,16 @@ static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, static void pSeries_lpar_hpte_removebolted(unsigned long ea, int psize, int ssize) { - unsigned long slot, vsid, va; + unsigned long vpn; + unsigned long slot, vsid; vsid = get_kernel_vsid(ea, ssize); - va = hpt_va(ea, vsid, ssize); + vpn = hpt_vpn(ea, vsid, ssize); - slot = pSeries_lpar_hpte_find(va, psize, ssize); + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); BUG_ON(slot == -1); - pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0); + pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, 0); } /* Flag bits for H_BULK_REMOVE */ @@ -382,12 +367,12 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea, */ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) { + unsigned long vpn; unsigned long i, pix, rc; unsigned long flags = 0; struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); unsigned long 
param[9]; - unsigned long va; unsigned long hash, index, shift, hidx, slot; real_pte_t pte; int psize, ssize; @@ -399,21 +384,21 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) ssize = batch->ssize; pix = 0; for (i = 0; i < number; i++) { - va = batch->vaddr[i]; + vpn = batch->vpn[i]; pte = batch->pte[i]; - pte_iterate_hashed_subpages(pte, psize, va, index, shift) { - hash = hpt_hash(va, shift, ssize); + pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { + hash = hpt_hash(vpn, shift, ssize); hidx = __rpte_to_hidx(pte, index); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; slot += hidx & _PTEIDX_GROUP_IX; if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { - pSeries_lpar_hpte_invalidate(slot, va, psize, + pSeries_lpar_hpte_invalidate(slot, vpn, psize, ssize, local); } else { param[pix] = HBR_REQUEST | HBR_AVPN | slot; - param[pix+1] = hpte_encode_avpn(va, psize, + param[pix+1] = hpte_encode_avpn(vpn, psize, ssize); pix += 2; if (pix == 8) { diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 109fdb75578d..d19f4977c834 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -210,6 +210,7 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) { struct device_node *dn; + struct eeh_dev *edev; /* Found our PE and assume 8 at that point. */ @@ -217,7 +218,10 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) if (!dn) return NULL; - dn = eeh_find_device_pe(dn); + /* Get the top level device in the PE */ + edev = of_node_to_eeh_dev(dn); + edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list); + dn = eeh_dev_to_of_node(edev); if (!dn) return NULL; @@ -387,12 +391,13 @@ static int check_msix_entries(struct pci_dev *pdev) return 0; } -static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) { struct pci_dn *pdn; int hwirq, virq, i, rc; struct msi_desc *entry; struct msi_msg msg; + int nvec = nvec_in; pdn = get_pdn(pdev); if (!pdn) @@ -402,10 +407,23 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) return -EINVAL; /* + * Firmware currently refuse any non power of two allocation + * so we round up if the quota will allow it. + */ + if (type == PCI_CAP_ID_MSIX) { + int m = roundup_pow_of_two(nvec); + int quota = msi_quota_for_device(pdev, m); + + if (quota >= m) + nvec = m; + } + + /* * Try the new more explicit firmware interface, if that fails fall * back to the old interface. The old interface is known to never * return MSI-Xs. 
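In pSeries_lpar_flush_hash_range() above, (flags, AVPN) pairs are packed into an 8-cell parameter block and flushed through H_BULK_REMOVE whenever the block fills; the flush call and the final partial-batch flush fall outside the hunk. A stand-alone sketch of that batching shape with a stub standing in for the hcall:

/* Stand-alone illustration of the bulk-remove batching shape; flush_batch()
 * is a stub for the H_BULK_REMOVE hcall, values are made up. */
#include <stdio.h>

static void flush_batch(unsigned long *param, int pix)
{
        printf("H_BULK_REMOVE with %d cells (%d HPTEs)\n", pix, pix / 2);
}

int main(void)
{
        unsigned long param[9];
        int pix = 0;

        for (int i = 0; i < 11; i++) {          /* 11 made-up HPTEs to invalidate */
                param[pix++] = 0x1000 + i;      /* request | avpn flag | slot */
                param[pix++] = 0x2000 + i;      /* encoded AVPN */
                if (pix == 8) {
                        flush_batch(param, pix);
                        pix = 0;
                }
        }
        if (pix)
                flush_batch(param, pix);        /* flush the partial batch */
        return 0;
}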
*/ +again: if (type == PCI_CAP_ID_MSI) { rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec); @@ -417,6 +435,10 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec); if (rc != nvec) { + if (nvec != nvec_in) { + nvec = nvec_in; + goto again; + } pr_debug("rtas_msi: rtas_change_msi() failed\n"); return rc; } diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 2c6ded29f73d..56b864d777ee 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -73,7 +73,7 @@ void __init pSeries_final_fixup(void) { pSeries_request_regions(); - pci_addr_cache_build(); + eeh_addr_cache_build(); } /* diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index 3ccebc83dc02..261a577a3dd2 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -65,27 +65,43 @@ pcibios_find_pci_bus(struct device_node *dn) EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); /** - * pcibios_remove_pci_devices - remove all devices under this bus + * __pcibios_remove_pci_devices - remove all devices under this bus + * @bus: the indicated PCI bus + * @purge_pe: destroy the PE on removal of PCI devices * * Remove all of the PCI devices under this bus both from the * linux pci device tree, and from the powerpc EEH address cache. + * By default, the corresponding PE will be destroied during the + * normal PCI hotplug path. For PCI hotplug during EEH recovery, + * the corresponding PE won't be destroied and deallocated. */ -void pcibios_remove_pci_devices(struct pci_bus *bus) +void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe) { - struct pci_dev *dev, *tmp; + struct pci_dev *dev, *tmp; struct pci_bus *child_bus; /* First go down child busses */ list_for_each_entry(child_bus, &bus->children, node) - pcibios_remove_pci_devices(child_bus); + __pcibios_remove_pci_devices(child_bus, purge_pe); pr_debug("PCI: Removing devices on bus %04x:%02x\n", - pci_domain_nr(bus), bus->number); + pci_domain_nr(bus), bus->number); list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { pr_debug(" * Removing %s...\n", pci_name(dev)); - eeh_remove_bus_device(dev); - pci_stop_and_remove_bus_device(dev); - } + eeh_remove_bus_device(dev, purge_pe); + pci_stop_and_remove_bus_device(dev); + } +} + +/** + * pcibios_remove_pci_devices - remove all devices under this bus + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. 
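The MSI-X change above first rounds the request up to a power of two when the quota allows it, then falls back to the caller's original count if firmware rejects the rounded value. A stand-alone sketch of that decision with stub quota and firmware checks:

/* Stand-alone sketch of the MSI-X sizing decision; quota_for_device() and
 * firmware_accepts() are invented stubs for illustration. */
#include <stdio.h>

static int roundup_pow_of_two(int n)
{
        int m = 1;

        while (m < n)
                m <<= 1;
        return m;
}

static int quota_for_device(int request)       { return 16; }          /* pretend quota */
static int firmware_accepts(int nvec)          { return nvec <= 12; }  /* pretend firmware */

int main(void)
{
        int nvec_in = 10, nvec = nvec_in;
        int m = roundup_pow_of_two(nvec);

        if (quota_for_device(m) >= m)
                nvec = m;                       /* try the rounded-up count first */

        if (!firmware_accepts(nvec) && nvec != nvec_in)
                nvec = nvec_in;                 /* retry with the original request */

        printf("allocate %d MSI-X vectors (requested %d)\n", nvec, nvec_in);
        return 0;
}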
+ */ +void pcibios_remove_pci_devices(struct pci_bus *bus) +{ + __pcibios_remove_pci_devices(bus, 1); } EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 51ecac920dd8..e3cb7ae61658 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -388,10 +388,8 @@ static void __init pSeries_setup_arch(void) /* Find and initialize PCI host bridges */ init_pci_config_tokens(); - eeh_pseries_init(); find_and_init_phbs(); pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb); - eeh_init(); pSeries_nvram_init(); @@ -416,16 +414,20 @@ static int __init pSeries_init_panel(void) } machine_arch_initcall(pseries, pSeries_init_panel); -static int pseries_set_dabr(unsigned long dabr) +static int pseries_set_dabr(unsigned long dabr, unsigned long dabrx) { return plpar_hcall_norets(H_SET_DABR, dabr); } -static int pseries_set_xdabr(unsigned long dabr) +static int pseries_set_xdabr(unsigned long dabr, unsigned long dabrx) { - /* We want to catch accesses from kernel and userspace */ - return plpar_hcall_norets(H_SET_XDABR, dabr, - H_DABRX_KERNEL | H_DABRX_USER); + /* Have to set at least one bit in the DABRX according to PAPR */ + if (dabrx == 0 && dabr == 0) + dabrx = DABRX_USER; + /* PAPR says we can only set kernel and user bits */ + dabrx &= DABRX_KERNEL | DABRX_USER; + + return plpar_hcall_norets(H_SET_XDABR, dabr, dabrx); } #define CMO_CHARACTERISTICS_TOKEN 44 @@ -529,10 +531,10 @@ static void __init pSeries_init_early(void) if (firmware_has_feature(FW_FEATURE_LPAR)) hvc_vio_init_early(); #endif - if (firmware_has_feature(FW_FEATURE_DABR)) - ppc_md.set_dabr = pseries_set_dabr; - else if (firmware_has_feature(FW_FEATURE_XDABR)) + if (firmware_has_feature(FW_FEATURE_XDABR)) ppc_md.set_dabr = pseries_set_xdabr; + else if (firmware_has_feature(FW_FEATURE_DABR)) + ppc_md.set_dabr = pseries_set_dabr; pSeries_cmo_feature_init(); iommu_init_early_pSeries(); |
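The reworked pseries_set_xdabr() above enforces two PAPR rules: at least one DABRX bit must be set, and only the kernel and user bits may be passed to the hypervisor. A stand-alone sketch of that sanitisation; the bit values are illustrative rather than the architected ones.

/* Stand-alone sketch of the DABRX sanitisation; constants are illustrative. */
#include <stdio.h>

#define DABRX_USER      0x1
#define DABRX_KERNEL    0x2
#define DABRX_HYP       0x4     /* would be stripped below */

static unsigned long sanitize_dabrx(unsigned long dabr, unsigned long dabrx)
{
        if (dabrx == 0 && dabr == 0)
                dabrx = DABRX_USER;             /* must set at least one bit */
        return dabrx & (DABRX_KERNEL | DABRX_USER);
}

int main(void)
{
        printf("0x%lx\n", sanitize_dabrx(0, 0));                             /* -> 0x1 */
        printf("0x%lx\n", sanitize_dabrx(0x1000, DABRX_HYP | DABRX_KERNEL)); /* -> 0x2 */
        return 0;
}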