Diffstat (limited to 'arch/powerpc/kernel')
35 files changed, 3944 insertions, 157 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f960a7944553..a8619bfe879e 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -58,6 +58,8 @@ obj-$(CONFIG_RTAS_PROC) += rtas-proc.o obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o +obj-$(CONFIG_EEH) += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \ + eeh_driver.o eeh_event.o eeh_sysfs.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_FA_DUMP) += fadump.o @@ -100,7 +102,7 @@ obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o -pci64-$(CONFIG_PPC64) += pci_dn.o isa-bridge.o +pci64-$(CONFIG_PPC64) += pci_dn.o pci-hotplug.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \ pci-common.o pci_of_scan.o obj-$(CONFIG_PCI_MSI) += msi.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6f16ffafa6f0..c7e8afc2ead0 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -105,9 +105,6 @@ int main(void) DEFINE(KSP_VSID, offsetof(struct thread_struct, ksp_vsid)); #else /* CONFIG_PPC64 */ DEFINE(PGDIR, offsetof(struct thread_struct, pgdir)); -#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) - DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0)); -#endif #ifdef CONFIG_SPE DEFINE(THREAD_EVR0, offsetof(struct thread_struct, evr[0])); DEFINE(THREAD_ACC, offsetof(struct thread_struct, acc)); @@ -115,6 +112,9 @@ int main(void) DEFINE(THREAD_USED_SPE, offsetof(struct thread_struct, used_spe)); #endif /* CONFIG_SPE */ #endif /* CONFIG_PPC64 */ +#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE) + DEFINE(THREAD_DBCR0, offsetof(struct thread_struct, dbcr0)); +#endif #ifdef CONFIG_KVM_BOOK3S_32_HANDLER DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu)); #endif @@ -132,7 +132,6 @@ int main(void) DEFINE(THREAD_SIER, offsetof(struct thread_struct, sier)); DEFINE(THREAD_MMCR0, offsetof(struct thread_struct, mmcr0)); DEFINE(THREAD_MMCR2, offsetof(struct thread_struct, mmcr2)); - DEFINE(THREAD_MMCRA, offsetof(struct thread_struct, mmcra)); #endif #ifdef CONFIG_PPC_TRANSACTIONAL_MEM DEFINE(PACATMSCRATCH, offsetof(struct paca_struct, tm_scratch)); diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 92c6b008dd2b..9262cf2bec4b 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -131,7 +131,8 @@ static const char *cache_type_string(const struct cache *cache) return cache_type_info[cache->type].name; } -static void __cpuinit cache_init(struct cache *cache, int type, int level, struct device_node *ofnode) +static void cache_init(struct cache *cache, int type, int level, + struct device_node *ofnode) { cache->type = type; cache->level = level; @@ -140,7 +141,7 @@ static void __cpuinit cache_init(struct cache *cache, int type, int level, struc list_add(&cache->list, &cache_list); } -static struct cache *__cpuinit new_cache(int type, int level, struct device_node *ofnode) +static struct cache *new_cache(int type, int level, struct device_node *ofnode) { struct cache *cache; @@ -324,7 +325,8 @@ static bool cache_node_is_unified(const struct device_node *np) return of_get_property(np, "cache-unified", NULL); } -static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node *node, int level) +static struct cache 
*cache_do_one_devnode_unified(struct device_node *node, + int level) { struct cache *cache; @@ -335,7 +337,8 @@ static struct cache *__cpuinit cache_do_one_devnode_unified(struct device_node * return cache; } -static struct cache *__cpuinit cache_do_one_devnode_split(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_split(struct device_node *node, + int level) { struct cache *dcache, *icache; @@ -357,7 +360,7 @@ err: return NULL; } -static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, int level) +static struct cache *cache_do_one_devnode(struct device_node *node, int level) { struct cache *cache; @@ -369,7 +372,8 @@ static struct cache *__cpuinit cache_do_one_devnode(struct device_node *node, in return cache; } -static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *node, int level) +static struct cache *cache_lookup_or_instantiate(struct device_node *node, + int level) { struct cache *cache; @@ -385,7 +389,7 @@ static struct cache *__cpuinit cache_lookup_or_instantiate(struct device_node *n return cache; } -static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigger) +static void link_cache_lists(struct cache *smaller, struct cache *bigger) { while (smaller->next_local) { if (smaller->next_local == bigger) @@ -396,13 +400,13 @@ static void __cpuinit link_cache_lists(struct cache *smaller, struct cache *bigg smaller->next_local = bigger; } -static void __cpuinit do_subsidiary_caches_debugcheck(struct cache *cache) +static void do_subsidiary_caches_debugcheck(struct cache *cache) { WARN_ON_ONCE(cache->level != 1); WARN_ON_ONCE(strcmp(cache->ofnode->type, "cpu")); } -static void __cpuinit do_subsidiary_caches(struct cache *cache) +static void do_subsidiary_caches(struct cache *cache) { struct device_node *subcache_node; int level = cache->level; @@ -423,7 +427,7 @@ static void __cpuinit do_subsidiary_caches(struct cache *cache) } } -static struct cache *__cpuinit cache_chain_instantiate(unsigned int cpu_id) +static struct cache *cache_chain_instantiate(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cpu_cache = NULL; @@ -448,7 +452,7 @@ out: return cpu_cache; } -static struct cache_dir *__cpuinit cacheinfo_create_cache_dir(unsigned int cpu_id) +static struct cache_dir *cacheinfo_create_cache_dir(unsigned int cpu_id) { struct cache_dir *cache_dir; struct device *dev; @@ -653,7 +657,7 @@ static struct kobj_type cache_index_type = { .default_attrs = cache_index_default_attrs, }; -static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) +static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) { const char *cache_name; const char *cache_type; @@ -696,7 +700,8 @@ static void __cpuinit cacheinfo_create_index_opt_attrs(struct cache_index_dir *d kfree(buf); } -static void __cpuinit cacheinfo_create_index_dir(struct cache *cache, int index, struct cache_dir *cache_dir) +static void cacheinfo_create_index_dir(struct cache *cache, int index, + struct cache_dir *cache_dir) { struct cache_index_dir *index_dir; int rc; @@ -722,7 +727,8 @@ err: kfree(index_dir); } -static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache *cache_list) +static void cacheinfo_sysfs_populate(unsigned int cpu_id, + struct cache *cache_list) { struct cache_dir *cache_dir; struct cache *cache; @@ -740,7 +746,7 @@ static void __cpuinit cacheinfo_sysfs_populate(unsigned int cpu_id, struct cache } } -void __cpuinit cacheinfo_cpu_online(unsigned 
int cpu_id)
+void cacheinfo_cpu_online(unsigned int cpu_id)
 {
 	struct cache *cache;
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
new file mode 100644
index 000000000000..39954fe941b8
--- /dev/null
+++ b/arch/powerpc/kernel/eeh.c
@@ -0,0 +1,1070 @@
+/*
+ * Copyright IBM Corporation 2001, 2005, 2006
+ * Copyright Dave Engebretsen & Todd Inglett 2001
+ * Copyright Linas Vepstas 2005, 2006
+ * Copyright 2001-2012 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com>
+ */
+
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/proc_fs.h>
+#include <linux/rbtree.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/of.h>
+
+#include <linux/atomic.h>
+#include <asm/eeh.h>
+#include <asm/eeh_event.h>
+#include <asm/io.h>
+#include <asm/machdep.h>
+#include <asm/ppc-pci.h>
+#include <asm/rtas.h>
+
+
+/** Overview:
+ * EEH, or "Extended Error Handling" is a PCI bridge technology for
+ * dealing with PCI bus errors that can't be dealt with within the
+ * usual PCI framework, except by check-stopping the CPU. Systems
+ * that are designed for high-availability/reliability cannot afford
+ * to crash due to a "mere" PCI error, thus the need for EEH.
+ * An EEH-capable bridge operates by converting a detected error
+ * into a "slot freeze", taking the PCI adapter off-line, making
+ * the slot behave, from the OS's point of view, as if the slot
+ * were "empty": all reads return 0xff's and all writes are silently
+ * ignored. EEH slot isolation events can be triggered by parity
+ * errors on the address or data busses (e.g. during posted writes),
+ * which in turn might be caused by low voltage on the bus, dust,
+ * vibration, humidity, radioactivity or plain-old failed hardware.
+ *
+ * Note, however, that one of the leading causes of EEH slot
+ * freeze events is buggy device drivers, buggy device microcode,
+ * or buggy device hardware. This is because any attempt by the
+ * device to bus-master data to a memory address that is not
+ * assigned to the device will trigger a slot freeze. (The idea
+ * is to prevent devices-gone-wild from corrupting system memory.)
+ * Buggy hardware/drivers will have a miserable time co-existing
+ * with EEH.
+ *
+ * Ideally, a PCI device driver, when suspecting that an isolation
+ * event has occurred (e.g. by reading 0xff's), will then ask EEH
+ * whether this is the case, and then take appropriate steps to
+ * reset the PCI slot, the PCI device, and then resume operations.
+ * However, until that day, the checking is done here, with the
+ * eeh_check_failure() routine embedded in the MMIO macros. If
+ * the slot is found to be isolated, an "EEH Event" is synthesized
+ * and sent out for processing.
+ */
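[Editor's note: the overview above describes the driver-visible pattern worth making concrete. The sketch below is an editorial aside, not part of the patch; the wrapper name and register offset are hypothetical, while eeh_dev_check_failure() and pci_dev_to_eeh_dev() are interfaces this patch provides.]

        #include <linux/io.h>
        #include <linux/pci.h>
        #include <asm/eeh.h>

        /* Hypothetical MMIO read wrapper: consult EEH on all-1s data. */
        static u32 demo_readl(struct pci_dev *pdev, void __iomem *base,
                              unsigned int off)
        {
                u32 val = readl(base + off);

                /*
                 * 0xffffffff may be a perfectly valid register value, so
                 * ask EEH whether the slot has actually been isolated
                 * before reacting to it.
                 */
                if (val == 0xffffffff)
                        eeh_dev_check_failure(pci_dev_to_eeh_dev(pdev));

                return val;
        }

[If the slot is frozen, eeh_dev_check_failure() marks the PE isolated and queues an EEH event, exactly as the code below implements.]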
+
+/* If a device driver keeps reading an MMIO register in an interrupt
+ * handler after a slot isolation event, it might be broken.
+ * This sets the threshold for how many read attempts we allow
+ * before printing an error message.
+ */
+#define EEH_MAX_FAILS	2100000
+
+/* Time to wait for a PCI slot to report status, in milliseconds */
+#define PCI_BUS_RESET_WAIT_MSEC	(60*1000)
+
+/* Platform dependent EEH operations */
+struct eeh_ops *eeh_ops = NULL;
+
+int eeh_subsystem_enabled;
+EXPORT_SYMBOL(eeh_subsystem_enabled);
+
+/*
+ * EEH probe mode support. The intention is to support multiple
+ * platforms for EEH. Some platforms like pSeries do PCI enumeration
+ * based on the device tree. However, other platforms like powernv probe
+ * PCI devices from hardware. The flag is used to distinguish that.
+ * In addition, struct eeh_ops::probe would be invoked for a particular
+ * OF node or PCI device so that the corresponding PE would be created
+ * there.
+ */
+int eeh_probe_mode;
+
+/* Lock to avoid races due to multiple reports of an error */
+DEFINE_RAW_SPINLOCK(confirm_error_lock);
+
+/* Buffer for reporting pci register dumps. It's here in BSS, and
+ * not dynamically allocated, so that it ends up in RMO where RTAS
+ * can access it.
+ */
+#define EEH_PCI_REGS_LOG_LEN	4096
+static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];
+
+/*
+ * The struct is used to maintain the EEH global statistic
+ * information. Besides, the EEH global statistics will be
+ * exported to user space through procfs
+ */
+struct eeh_stats {
+	u64 no_device;		/* PCI device not found		*/
+	u64 no_dn;		/* OF node not found		*/
+	u64 no_cfg_addr;	/* Config address not found	*/
+	u64 ignored_check;	/* EEH check skipped		*/
+	u64 total_mmio_ffs;	/* Total EEH checks		*/
+	u64 false_positives;	/* Unnecessary EEH checks	*/
+	u64 slot_resets;	/* PE reset			*/
+};
+
+static struct eeh_stats eeh_stats;
+
+#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
+
+/**
+ * eeh_gather_pci_data - Copy assorted PCI config space registers to buf
+ * @edev: device to report data for
+ * @buf: pointer to buffer in which to log
+ * @len: amount of room in buffer
+ *
+ * This routine captures assorted PCI configuration space data,
+ * and puts them into a buffer for RTAS error logging.
+ */ +static size_t eeh_gather_pci_data(struct eeh_dev *edev, char * buf, size_t len) +{ + struct device_node *dn = eeh_dev_to_of_node(edev); + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + u32 cfg; + int cap, i; + int n = 0; + + n += scnprintf(buf+n, len-n, "%s\n", dn->full_name); + printk(KERN_WARNING "EEH: of node=%s\n", dn->full_name); + + eeh_ops->read_config(dn, PCI_VENDOR_ID, 4, &cfg); + n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); + printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg); + + eeh_ops->read_config(dn, PCI_COMMAND, 4, &cfg); + n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); + printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg); + + if (!dev) { + printk(KERN_WARNING "EEH: no PCI device for this of node\n"); + return n; + } + + /* Gather bridge-specific registers */ + if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) { + eeh_ops->read_config(dn, PCI_SEC_STATUS, 2, &cfg); + n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); + printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg); + + eeh_ops->read_config(dn, PCI_BRIDGE_CONTROL, 2, &cfg); + n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); + printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg); + } + + /* Dump out the PCI-X command and status regs */ + cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (cap) { + eeh_ops->read_config(dn, cap, 4, &cfg); + n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg); + printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg); + + eeh_ops->read_config(dn, cap+4, 4, &cfg); + n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg); + printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg); + } + + /* If PCI-E capable, dump PCI-E cap 10, and the AER */ + cap = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (cap) { + n += scnprintf(buf+n, len-n, "pci-e cap10:\n"); + printk(KERN_WARNING + "EEH: PCI-E capabilities and status follow:\n"); + + for (i=0; i<=8; i++) { + eeh_ops->read_config(dn, cap+4*i, 4, &cfg); + n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); + printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg); + } + + cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + if (cap) { + n += scnprintf(buf+n, len-n, "pci-e AER:\n"); + printk(KERN_WARNING + "EEH: PCI-E AER capability register set follows:\n"); + + for (i=0; i<14; i++) { + eeh_ops->read_config(dn, cap+4*i, 4, &cfg); + n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); + printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg); + } + } + } + + return n; +} + +/** + * eeh_slot_error_detail - Generate combined log including driver log and error log + * @pe: EEH PE + * @severity: temporary or permanent error log + * + * This routine should be called to generate the combined log, which + * is comprised of driver log and error log. The driver log is figured + * out from the config space of the corresponding PCI device, while + * the error log is fetched through platform dependent function call. + */ +void eeh_slot_error_detail(struct eeh_pe *pe, int severity) +{ + size_t loglen = 0; + struct eeh_dev *edev; + bool valid_cfg_log = true; + + /* + * When the PHB is fenced or dead, it's pointless to collect + * the data from PCI config space because it should return + * 0xFF's. For ER, we still retrieve the data from the PCI + * config space. 
+ */
+	if (eeh_probe_mode_dev() &&
+	    (pe->type & EEH_PE_PHB) &&
+	    (pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)))
+		valid_cfg_log = false;
+
+	if (valid_cfg_log) {
+		eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
+		eeh_ops->configure_bridge(pe);
+		eeh_pe_restore_bars(pe);
+
+		pci_regs_buf[0] = 0;
+		eeh_pe_for_each_dev(pe, edev) {
+			loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen,
+						      EEH_PCI_REGS_LOG_LEN - loglen);
+		}
+	}
+
+	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
+}
+
+/**
+ * eeh_token_to_phys - Convert EEH address token to phys address
+ * @token: I/O token, should be address in the form 0xA....
+ *
+ * This routine should be called to convert virtual I/O address
+ * to physical one.
+ */
+static inline unsigned long eeh_token_to_phys(unsigned long token)
+{
+	pte_t *ptep;
+	unsigned long pa;
+	int hugepage_shift;
+
+	/*
+	 * We won't find hugepages here, iomem
+	 */
+	ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+	if (!ptep)
+		return token;
+	WARN_ON(hugepage_shift);
+	pa = pte_pfn(*ptep) << PAGE_SHIFT;
+
+	return pa | (token & (PAGE_SIZE-1));
+}
+
+/*
+ * On the PowerNV platform, we might already have a fenced PHB.
+ * In that case it's meaningless to recover the frozen PE; instead,
+ * we have to handle the fenced PHB first.
+ */
+static int eeh_phb_check_failure(struct eeh_pe *pe)
+{
+	struct eeh_pe *phb_pe;
+	unsigned long flags;
+	int ret;
+
+	if (!eeh_probe_mode_dev())
+		return -EPERM;
+
+	/* Find the PHB PE */
+	phb_pe = eeh_phb_pe_get(pe->phb);
+	if (!phb_pe) {
+		pr_warning("%s Can't find PE for PHB#%d\n",
+			   __func__, pe->phb->global_number);
+		return -EEXIST;
+	}
+
+	/* If the PHB has been in problematic state */
+	eeh_serialize_lock(&flags);
+	if (phb_pe->state & (EEH_PE_ISOLATED | EEH_PE_PHB_DEAD)) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Check PHB state */
+	ret = eeh_ops->get_state(phb_pe, NULL);
+	if ((ret < 0) ||
+	    (ret == EEH_STATE_NOT_SUPPORT) ||
+	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
+	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
+		ret = 0;
+		goto out;
+	}
+
+	/* Isolate the PHB and send event */
+	eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
+	eeh_serialize_unlock(flags);
+	eeh_send_failure_event(phb_pe);
+
+	pr_err("EEH: PHB#%x failure detected\n",
+	       phb_pe->phb->global_number);
+	dump_stack();
+
+	return 1;
+out:
+	eeh_serialize_unlock(flags);
+	return ret;
+}
+
+/**
+ * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @edev: eeh device
+ *
+ * Check for an EEH failure for the given device node. Call this
+ * routine if the result of a read was all 0xff's and you want to
+ * find out if this is due to an EEH slot freeze. This routine
+ * will query firmware for the EEH status.
+ *
+ * Returns 0 if there has not been an EEH error; otherwise returns
+ * a non-zero value and queues up a slot isolation event notification.
+ *
+ * It is safe to call this routine in an interrupt context.
+ */
+int eeh_dev_check_failure(struct eeh_dev *edev)
+{
+	int ret;
+	unsigned long flags;
+	struct device_node *dn;
+	struct pci_dev *dev;
+	struct eeh_pe *pe;
+	int rc = 0;
+	const char *location;
+
+	eeh_stats.total_mmio_ffs++;
+
+	if (!eeh_subsystem_enabled)
+		return 0;
+
+	if (!edev) {
+		eeh_stats.no_dn++;
+		return 0;
+	}
+	dn = eeh_dev_to_of_node(edev);
+	dev = eeh_dev_to_pci_dev(edev);
+	pe = edev->pe;
+
+	/* Access to IO BARs might get this far and still not want checking.
*/ + if (!pe) { + eeh_stats.ignored_check++; + pr_debug("EEH: Ignored check for %s %s\n", + eeh_pci_name(dev), dn->full_name); + return 0; + } + + if (!pe->addr && !pe->config_addr) { + eeh_stats.no_cfg_addr++; + return 0; + } + + /* + * On PowerNV platform, we might already have fenced PHB + * there and we need take care of that firstly. + */ + ret = eeh_phb_check_failure(pe); + if (ret > 0) + return ret; + + /* If we already have a pending isolation event for this + * slot, we know it's bad already, we don't need to check. + * Do this checking under a lock; as multiple PCI devices + * in one slot might report errors simultaneously, and we + * only want one error recovery routine running. + */ + eeh_serialize_lock(&flags); + rc = 1; + if (pe->state & EEH_PE_ISOLATED) { + pe->check_count++; + if (pe->check_count % EEH_MAX_FAILS == 0) { + location = of_get_property(dn, "ibm,loc-code", NULL); + printk(KERN_ERR "EEH: %d reads ignored for recovering device at " + "location=%s driver=%s pci addr=%s\n", + pe->check_count, location, + eeh_driver_name(dev), eeh_pci_name(dev)); + printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n", + eeh_driver_name(dev)); + dump_stack(); + } + goto dn_unlock; + } + + /* + * Now test for an EEH failure. This is VERY expensive. + * Note that the eeh_config_addr may be a parent device + * in the case of a device behind a bridge, or it may be + * function zero of a multi-function device. + * In any case they must share a common PHB. + */ + ret = eeh_ops->get_state(pe, NULL); + + /* Note that config-io to empty slots may fail; + * they are empty when they don't have children. + * We will punt with the following conditions: Failure to get + * PE's state, EEH not support and Permanently unavailable + * state, PE is in good state. + */ + if ((ret < 0) || + (ret == EEH_STATE_NOT_SUPPORT) || + (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) == + (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) { + eeh_stats.false_positives++; + pe->false_positives++; + rc = 0; + goto dn_unlock; + } + + eeh_stats.slot_resets++; + + /* Avoid repeated reports of this failure, including problems + * with other functions on this device, and functions under + * bridges. + */ + eeh_pe_state_mark(pe, EEH_PE_ISOLATED); + eeh_serialize_unlock(flags); + + eeh_send_failure_event(pe); + + /* Most EEH events are due to device driver bugs. Having + * a stack trace will help the device-driver authors figure + * out what happened. So print that out. + */ + pr_err("EEH: Frozen PE#%x detected on PHB#%x\n", + pe->addr, pe->phb->global_number); + dump_stack(); + + return 1; + +dn_unlock: + eeh_serialize_unlock(flags); + return rc; +} + +EXPORT_SYMBOL_GPL(eeh_dev_check_failure); + +/** + * eeh_check_failure - Check if all 1's data is due to EEH slot freeze + * @token: I/O token, should be address in the form 0xA.... + * @val: value, should be all 1's (XXX why do we need this arg??) + * + * Check for an EEH failure at the given token address. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze event. This routine + * will query firmware for the EEH status. + * + * Note this routine is safe to call in an interrupt context. + */ +unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) +{ + unsigned long addr; + struct eeh_dev *edev; + + /* Finding the phys addr + pci device; this is pretty quick. 
*/ + addr = eeh_token_to_phys((unsigned long __force) token); + edev = eeh_addr_cache_get_dev(addr); + if (!edev) { + eeh_stats.no_device++; + return val; + } + + eeh_dev_check_failure(edev); + + pci_dev_put(eeh_dev_to_pci_dev(edev)); + return val; +} + +EXPORT_SYMBOL(eeh_check_failure); + + +/** + * eeh_pci_enable - Enable MMIO or DMA transfers for this slot + * @pe: EEH PE + * + * This routine should be called to reenable frozen MMIO or DMA + * so that it would work correctly again. It's useful while doing + * recovery or log collection on the indicated device. + */ +int eeh_pci_enable(struct eeh_pe *pe, int function) +{ + int rc; + + rc = eeh_ops->set_option(pe, function); + if (rc) + pr_warning("%s: Unexpected state change %d on PHB#%d-PE#%x, err=%d\n", + __func__, function, pe->phb->global_number, pe->addr, rc); + + rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + if (rc > 0 && (rc & EEH_STATE_MMIO_ENABLED) && + (function == EEH_OPT_THAW_MMIO)) + return 0; + + return rc; +} + +/** + * pcibios_set_pcie_slot_reset - Set PCI-E reset state + * @dev: pci device struct + * @state: reset state to enter + * + * Return value: + * 0 if success + */ +int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + struct eeh_pe *pe = edev->pe; + + if (!pe) { + pr_err("%s: No PE found on PCI device %s\n", + __func__, pci_name(dev)); + return -EINVAL; + } + + switch (state) { + case pcie_deassert_reset: + eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); + break; + case pcie_hot_reset: + eeh_ops->reset(pe, EEH_RESET_HOT); + break; + case pcie_warm_reset: + eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); + break; + default: + return -EINVAL; + }; + + return 0; +} + +/** + * eeh_set_pe_freset - Check the required reset for the indicated device + * @data: EEH device + * @flag: return value + * + * Each device might have its preferred reset type: fundamental or + * hot reset. The routine is used to collected the information for + * the indicated device and its children so that the bunch of the + * devices could be reset properly. + */ +static void *eeh_set_dev_freset(void *data, void *flag) +{ + struct pci_dev *dev; + unsigned int *freset = (unsigned int *)flag; + struct eeh_dev *edev = (struct eeh_dev *)data; + + dev = eeh_dev_to_pci_dev(edev); + if (dev) + *freset |= dev->needs_freset; + + return NULL; +} + +/** + * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second + * @pe: EEH PE + * + * Assert the PCI #RST line for 1/4 second. + */ +static void eeh_reset_pe_once(struct eeh_pe *pe) +{ + unsigned int freset = 0; + + /* Determine type of EEH reset required for + * Partitionable Endpoint, a hot-reset (1) + * or a fundamental reset (3). + * A fundamental reset required by any device under + * Partitionable Endpoint trumps hot-reset. + */ + eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset); + + if (freset) + eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); + else + eeh_ops->reset(pe, EEH_RESET_HOT); + + /* The PCI bus requires that the reset be held high for at least + * a 100 milliseconds. We wait a bit longer 'just in case'. + */ +#define PCI_BUS_RST_HOLD_TIME_MSEC 250 + msleep(PCI_BUS_RST_HOLD_TIME_MSEC); + + /* We might get hit with another EEH freeze as soon as the + * pci slot reset line is dropped. Make sure we don't miss + * these, and clear the flag now. 
+ */ + eeh_pe_state_clear(pe, EEH_PE_ISOLATED); + + eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); + + /* After a PCI slot has been reset, the PCI Express spec requires + * a 1.5 second idle time for the bus to stabilize, before starting + * up traffic. + */ +#define PCI_BUS_SETTLE_TIME_MSEC 1800 + msleep(PCI_BUS_SETTLE_TIME_MSEC); +} + +/** + * eeh_reset_pe - Reset the indicated PE + * @pe: EEH PE + * + * This routine should be called to reset indicated device, including + * PE. A PE might include multiple PCI devices and sometimes PCI bridges + * might be involved as well. + */ +int eeh_reset_pe(struct eeh_pe *pe) +{ + int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); + int i, rc; + + /* Take three shots at resetting the bus */ + for (i=0; i<3; i++) { + eeh_reset_pe_once(pe); + + rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC); + if ((rc & flags) == flags) + return 0; + + if (rc < 0) { + pr_err("%s: Unrecoverable slot failure on PHB#%d-PE#%x", + __func__, pe->phb->global_number, pe->addr); + return -1; + } + pr_err("EEH: bus reset %d failed on PHB#%d-PE#%x, rc=%d\n", + i+1, pe->phb->global_number, pe->addr, rc); + } + + return -1; +} + +/** + * eeh_save_bars - Save device bars + * @edev: PCI device associated EEH device + * + * Save the values of the device bars. Unlike the restore + * routine, this routine is *not* recursive. This is because + * PCI devices are added individually; but, for the restore, + * an entire slot is reset at a time. + */ +void eeh_save_bars(struct eeh_dev *edev) +{ + int i; + struct device_node *dn; + + if (!edev) + return; + dn = eeh_dev_to_of_node(edev); + + for (i = 0; i < 16; i++) + eeh_ops->read_config(dn, i * 4, 4, &edev->config_space[i]); +} + +/** + * eeh_ops_register - Register platform dependent EEH operations + * @ops: platform dependent EEH operations + * + * Register the platform dependent EEH operation callback + * functions. The platform should call this function before + * any other EEH operations. + */ +int __init eeh_ops_register(struct eeh_ops *ops) +{ + if (!ops->name) { + pr_warning("%s: Invalid EEH ops name for %p\n", + __func__, ops); + return -EINVAL; + } + + if (eeh_ops && eeh_ops != ops) { + pr_warning("%s: EEH ops of platform %s already existing (%s)\n", + __func__, eeh_ops->name, ops->name); + return -EEXIST; + } + + eeh_ops = ops; + + return 0; +} + +/** + * eeh_ops_unregister - Unreigster platform dependent EEH operations + * @name: name of EEH platform operations + * + * Unregister the platform dependent EEH operation callback + * functions. + */ +int __exit eeh_ops_unregister(const char *name) +{ + if (!name || !strlen(name)) { + pr_warning("%s: Invalid EEH ops name\n", + __func__); + return -EINVAL; + } + + if (eeh_ops && !strcmp(eeh_ops->name, name)) { + eeh_ops = NULL; + return 0; + } + + return -EEXIST; +} + +/** + * eeh_init - EEH initialization + * + * Initialize EEH by trying to enable it for all of the adapters in the system. + * As a side effect we can determine here if eeh is supported at all. + * Note that we leave EEH on so failed config cycles won't cause a machine + * check. If a user turns off EEH for a particular adapter they are really + * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't + * grant access to a slot if EEH isn't enabled, and so we always enable + * EEH for all slots/all devices. + * + * The eeh-force-off option disables EEH checking globally, for all slots. + * Even if force-off is set, the EEH hardware is still enabled, so that + * newer systems can boot. 
+ */ +int eeh_init(void) +{ + struct pci_controller *hose, *tmp; + struct device_node *phb; + static int cnt = 0; + int ret = 0; + + /* + * We have to delay the initialization on PowerNV after + * the PCI hierarchy tree has been built because the PEs + * are figured out based on PCI devices instead of device + * tree nodes + */ + if (machine_is(powernv) && cnt++ <= 0) + return ret; + + /* call platform initialization function */ + if (!eeh_ops) { + pr_warning("%s: Platform EEH operation not found\n", + __func__); + return -EEXIST; + } else if ((ret = eeh_ops->init())) { + pr_warning("%s: Failed to call platform init function (%d)\n", + __func__, ret); + return ret; + } + + /* Initialize EEH event */ + ret = eeh_event_init(); + if (ret) + return ret; + + /* Enable EEH for all adapters */ + if (eeh_probe_mode_devtree()) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb = hose->dn; + traverse_pci_devices(phb, eeh_ops->of_probe, NULL); + } + } else if (eeh_probe_mode_dev()) { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) + pci_walk_bus(hose->bus, eeh_ops->dev_probe, NULL); + } else { + pr_warning("%s: Invalid probe mode %d\n", + __func__, eeh_probe_mode); + return -EINVAL; + } + + /* + * Call platform post-initialization. Actually, It's good chance + * to inform platform that EEH is ready to supply service if the + * I/O cache stuff has been built up. + */ + if (eeh_ops->post_init) { + ret = eeh_ops->post_init(); + if (ret) + return ret; + } + + if (eeh_subsystem_enabled) + pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + pr_warning("EEH: No capable adapters found\n"); + + return ret; +} + +core_initcall_sync(eeh_init); + +/** + * eeh_add_device_early - Enable EEH for the indicated device_node + * @dn: device node for which to set up EEH + * + * This routine must be used to perform EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + * This routine must be called before any i/o is performed to the + * adapter (inluding any config-space i/o). + * Whether this actually enables EEH or not for this device depends + * on the CEC architecture, type of the device, on earlier boot + * command-line arguments & etc. + */ +static void eeh_add_device_early(struct device_node *dn) +{ + struct pci_controller *phb; + + /* + * If we're doing EEH probe based on PCI device, we + * would delay the probe until late stage because + * the PCI device isn't available this moment. + */ + if (!eeh_probe_mode_devtree()) + return; + + if (!of_node_to_eeh_dev(dn)) + return; + phb = of_node_to_eeh_dev(dn)->phb; + + /* USB Bus children of PCI devices will not have BUID's */ + if (NULL == phb || 0 == phb->buid) + return; + + eeh_ops->of_probe(dn, NULL); +} + +/** + * eeh_add_device_tree_early - Enable EEH for the indicated device + * @dn: device node + * + * This routine must be used to perform EEH initialization for the + * indicated PCI device that was added after system boot (e.g. + * hotplug, dlpar). + */ +void eeh_add_device_tree_early(struct device_node *dn) +{ + struct device_node *sib; + + for_each_child_of_node(dn, sib) + eeh_add_device_tree_early(sib); + eeh_add_device_early(dn); +} +EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); + +/** + * eeh_add_device_late - Perform EEH initialization for the indicated pci device + * @dev: pci device for which to set up EEH + * + * This routine must be used to complete EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). 
+ */ +static void eeh_add_device_late(struct pci_dev *dev) +{ + struct device_node *dn; + struct eeh_dev *edev; + + if (!dev || !eeh_subsystem_enabled) + return; + + pr_debug("EEH: Adding device %s\n", pci_name(dev)); + + dn = pci_device_to_OF_node(dev); + edev = of_node_to_eeh_dev(dn); + if (edev->pdev == dev) { + pr_debug("EEH: Already referenced !\n"); + return; + } + WARN_ON(edev->pdev); + + pci_dev_get(dev); + edev->pdev = dev; + dev->dev.archdata.edev = edev; + + /* + * We have to do the EEH probe here because the PCI device + * hasn't been created yet in the early stage. + */ + if (eeh_probe_mode_dev()) + eeh_ops->dev_probe(dev, NULL); + + eeh_addr_cache_insert_dev(dev); +} + +/** + * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus + * @bus: PCI bus + * + * This routine must be used to perform EEH initialization for PCI + * devices which are attached to the indicated PCI bus. The PCI bus + * is added after system boot through hotplug or dlpar. + */ +void eeh_add_device_tree_late(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + eeh_add_device_late(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (subbus) + eeh_add_device_tree_late(subbus); + } + } +} +EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); + +/** + * eeh_add_sysfs_files - Add EEH sysfs files for the indicated PCI bus + * @bus: PCI bus + * + * This routine must be used to add EEH sysfs files for PCI + * devices which are attached to the indicated PCI bus. The PCI bus + * is added after system boot through hotplug or dlpar. + */ +void eeh_add_sysfs_files(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + eeh_sysfs_add_device(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (subbus) + eeh_add_sysfs_files(subbus); + } + } +} +EXPORT_SYMBOL_GPL(eeh_add_sysfs_files); + +/** + * eeh_remove_device - Undo EEH setup for the indicated pci device + * @dev: pci device to be removed + * @purge_pe: remove the PE or not + * + * This routine should be called when a device is removed from + * a running system (e.g. by hotplug or dlpar). It unregisters + * the PCI device from the EEH subsystem. I/O errors affecting + * this device will no longer be detected after this call; thus, + * i/o errors affecting this slot may leave this device unusable. + */ +static void eeh_remove_device(struct pci_dev *dev, int purge_pe) +{ + struct eeh_dev *edev; + + if (!dev || !eeh_subsystem_enabled) + return; + edev = pci_dev_to_eeh_dev(dev); + + /* Unregister the device with the EEH/PCI address search system */ + pr_debug("EEH: Removing device %s\n", pci_name(dev)); + + if (!edev || !edev->pdev) { + pr_debug("EEH: Not referenced !\n"); + return; + } + edev->pdev = NULL; + dev->dev.archdata.edev = NULL; + pci_dev_put(dev); + + eeh_rmv_from_parent_pe(edev, purge_pe); + eeh_addr_cache_rmv_dev(dev); + eeh_sysfs_remove_device(dev); +} + +/** + * eeh_remove_bus_device - Undo EEH setup for the indicated PCI device + * @dev: PCI device + * @purge_pe: remove the corresponding PE or not + * + * This routine must be called when a device is removed from the + * running system through hotplug or dlpar. The corresponding + * PCI address cache will be removed. 
+ */ +void eeh_remove_bus_device(struct pci_dev *dev, int purge_pe) +{ + struct pci_bus *bus = dev->subordinate; + struct pci_dev *child, *tmp; + + eeh_remove_device(dev, purge_pe); + + if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + list_for_each_entry_safe(child, tmp, &bus->devices, bus_list) + eeh_remove_bus_device(child, purge_pe); + } +} +EXPORT_SYMBOL_GPL(eeh_remove_bus_device); + +static int proc_eeh_show(struct seq_file *m, void *v) +{ + if (0 == eeh_subsystem_enabled) { + seq_printf(m, "EEH Subsystem is globally disabled\n"); + seq_printf(m, "eeh_total_mmio_ffs=%llu\n", eeh_stats.total_mmio_ffs); + } else { + seq_printf(m, "EEH Subsystem is enabled\n"); + seq_printf(m, + "no device=%llu\n" + "no device node=%llu\n" + "no config address=%llu\n" + "check not wanted=%llu\n" + "eeh_total_mmio_ffs=%llu\n" + "eeh_false_positives=%llu\n" + "eeh_slot_resets=%llu\n", + eeh_stats.no_device, + eeh_stats.no_dn, + eeh_stats.no_cfg_addr, + eeh_stats.ignored_check, + eeh_stats.total_mmio_ffs, + eeh_stats.false_positives, + eeh_stats.slot_resets); + } + + return 0; +} + +static int proc_eeh_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_eeh_show, NULL); +} + +static const struct file_operations proc_eeh_operations = { + .open = proc_eeh_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init eeh_init_proc(void) +{ + if (machine_is(pseries)) + proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations); + return 0; +} +__initcall(eeh_init_proc); diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c new file mode 100644 index 000000000000..f9ac1232a746 --- /dev/null +++ b/arch/powerpc/kernel/eeh_cache.c @@ -0,0 +1,318 @@ +/* + * PCI address cache; allows the lookup of PCI devices based on I/O address + * + * Copyright IBM Corporation 2004 + * Copyright Linas Vepstas <linas@austin.ibm.com> 2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/atomic.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + + +/** + * The pci address cache subsystem. This subsystem places + * PCI device address resources into a red-black tree, sorted + * according to the address range, so that given only an i/o + * address, the corresponding PCI device can be **quickly** + * found. It is safe to perform an address lookup in an interrupt + * context; this ability is an important feature. + * + * Currently, the only customer of this code is the EEH subsystem; + * thus, this code has been somewhat tailored to suit EEH better. + * In particular, the cache does *not* hold the addresses of devices + * for which EEH is not enabled. 
+ * + * (Implementation Note: The RB tree seems to be better/faster + * than any hash algo I could think of for this problem, even + * with the penalty of slow pointer chases for d-cache misses). + */ +struct pci_io_addr_range { + struct rb_node rb_node; + unsigned long addr_lo; + unsigned long addr_hi; + struct eeh_dev *edev; + struct pci_dev *pcidev; + unsigned int flags; +}; + +static struct pci_io_addr_cache { + struct rb_root rb_root; + spinlock_t piar_lock; +} pci_io_addr_cache_root; + +static inline struct eeh_dev *__eeh_addr_cache_get_device(unsigned long addr) +{ + struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; + + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (addr < piar->addr_lo) { + n = n->rb_left; + } else { + if (addr > piar->addr_hi) { + n = n->rb_right; + } else { + pci_dev_get(piar->pcidev); + return piar->edev; + } + } + } + + return NULL; +} + +/** + * eeh_addr_cache_get_dev - Get device, given only address + * @addr: mmio (PIO) phys address or i/o port number + * + * Given an mmio phys address, or a port number, find a pci device + * that implements this address. Be sure to pci_dev_put the device + * when finished. I/O port numbers are assumed to be offset + * from zero (that is, they do *not* have pci_io_addr added in). + * It is safe to call this function within an interrupt. + */ +struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr) +{ + struct eeh_dev *edev; + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + edev = __eeh_addr_cache_get_device(addr); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); + return edev; +} + +#ifdef DEBUG +/* + * Handy-dandy debug print routine, does nothing more + * than print out the contents of our addr cache. + */ +static void eeh_addr_cache_print(struct pci_io_addr_cache *cache) +{ + struct rb_node *n; + int cnt = 0; + + n = rb_first(&cache->rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n", + (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, + piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); + cnt++; + n = rb_next(n); + } +} +#endif + +/* Insert address range into the rb tree. 
*/ +static struct pci_io_addr_range * +eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo, + unsigned long ahi, unsigned int flags) +{ + struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct pci_io_addr_range *piar; + + /* Walk tree, find a place to insert into tree */ + while (*p) { + parent = *p; + piar = rb_entry(parent, struct pci_io_addr_range, rb_node); + if (ahi < piar->addr_lo) { + p = &parent->rb_left; + } else if (alo > piar->addr_hi) { + p = &parent->rb_right; + } else { + if (dev != piar->pcidev || + alo != piar->addr_lo || ahi != piar->addr_hi) { + pr_warning("PIAR: overlapping address range\n"); + } + return piar; + } + } + piar = kzalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); + if (!piar) + return NULL; + + pci_dev_get(dev); + piar->addr_lo = alo; + piar->addr_hi = ahi; + piar->edev = pci_dev_to_eeh_dev(dev); + piar->pcidev = dev; + piar->flags = flags; + +#ifdef DEBUG + pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n", + alo, ahi, pci_name(dev)); +#endif + + rb_link_node(&piar->rb_node, parent, p); + rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); + + return piar; +} + +static void __eeh_addr_cache_insert_dev(struct pci_dev *dev) +{ + struct device_node *dn; + struct eeh_dev *edev; + int i; + + dn = pci_device_to_OF_node(dev); + if (!dn) { + pr_warning("PCI: no pci dn found for dev=%s\n", pci_name(dev)); + return; + } + + edev = of_node_to_eeh_dev(dn); + if (!edev) { + pr_warning("PCI: no EEH dev found for dn=%s\n", + dn->full_name); + return; + } + + /* Skip any devices for which EEH is not enabled. */ + if (!eeh_probe_mode_dev() && !edev->pe) { +#ifdef DEBUG + pr_info("PCI: skip building address cache for=%s - %s\n", + pci_name(dev), dn->full_name); +#endif + return; + } + + /* Walk resources on this device, poke them into the tree */ + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + unsigned long start = pci_resource_start(dev,i); + unsigned long end = pci_resource_end(dev,i); + unsigned int flags = pci_resource_flags(dev,i); + + /* We are interested only bus addresses, not dma or other stuff */ + if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM))) + continue; + if (start == 0 || ~start == 0 || end == 0 || ~end == 0) + continue; + eeh_addr_cache_insert(dev, start, end, flags); + } +} + +/** + * eeh_addr_cache_insert_dev - Add a device to the address cache + * @dev: PCI device whose I/O addresses we are interested in. + * + * In order to support the fast lookup of devices based on addresses, + * we maintain a cache of devices that can be quickly searched. + * This routine adds a device to that cache. 
+ */
+void eeh_addr_cache_insert_dev(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	/* Ignore PCI bridges */
+	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
+		return;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__eeh_addr_cache_insert_dev(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+static inline void __eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+	struct rb_node *n;
+
+restart:
+	n = rb_first(&pci_io_addr_cache_root.rb_root);
+	while (n) {
+		struct pci_io_addr_range *piar;
+		piar = rb_entry(n, struct pci_io_addr_range, rb_node);
+
+		if (piar->pcidev == dev) {
+			rb_erase(n, &pci_io_addr_cache_root.rb_root);
+			pci_dev_put(piar->pcidev);
+			kfree(piar);
+			goto restart;
+		}
+		n = rb_next(n);
+	}
+}
+
+/**
+ * eeh_addr_cache_rmv_dev - remove pci device from addr cache
+ * @dev: device to remove
+ *
+ * Remove a device from the addr-cache tree.
+ * This is potentially expensive, since it will walk
+ * the tree multiple times (once per resource).
+ * But so what; device removal doesn't need to be that fast.
+ */
+void eeh_addr_cache_rmv_dev(struct pci_dev *dev)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
+	__eeh_addr_cache_rmv_dev(dev);
+	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
+}
+
+/**
+ * eeh_addr_cache_build - Build a cache of I/O addresses
+ *
+ * Build a cache of pci i/o addresses. This cache will be used to
+ * find the pci device that corresponds to a given address.
+ * This routine scans all pci busses to build the cache.
+ * Must be run late in boot process, after the pci controllers
+ * have been scanned for devices (after all device resources are known).
+ */
+void eeh_addr_cache_build(void)
+{
+	struct device_node *dn;
+	struct eeh_dev *edev;
+	struct pci_dev *dev = NULL;
+
+	spin_lock_init(&pci_io_addr_cache_root.piar_lock);
+
+	for_each_pci_dev(dev) {
+		dn = pci_device_to_OF_node(dev);
+		if (!dn)
+			continue;
+
+		edev = of_node_to_eeh_dev(dn);
+		if (!edev)
+			continue;
+
+		pci_dev_get(dev);  /* matching put is in eeh_remove_device() */
+		dev->dev.archdata.edev = edev;
+		edev->pdev = dev;
+
+		eeh_addr_cache_insert_dev(dev);
+
+		eeh_sysfs_add_device(dev);
+	}
+
+#ifdef DEBUG
+	/* Verify tree built up above, echo back the list of addrs. */
+	eeh_addr_cache_print(&pci_io_addr_cache_root);
+#endif
+}
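[Editor's note: one usage detail of the cache API above deserves emphasis before moving on to eeh_dev.c. A successful lookup in __eeh_addr_cache_get_device() takes a reference on the underlying pci_dev, so every caller must drop it, as eeh_check_failure() does. A minimal editorial sketch, with a hypothetical caller name:]

        /* Hypothetical caller: check the device owning an MMIO address. */
        static void demo_check_addr(unsigned long mmio_phys)
        {
                struct eeh_dev *edev = eeh_addr_cache_get_dev(mmio_phys);

                if (!edev)
                        return;         /* address not owned by any cached device */

                eeh_dev_check_failure(edev);

                /* The lookup took a pci_dev reference; release it when done. */
                pci_dev_put(eeh_dev_to_pci_dev(edev));
        }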
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
new file mode 100644
index 000000000000..1efa28f5fc54
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -0,0 +1,112 @@
+/*
+ * The file intends to implement dynamic creation of EEH devices, which will
+ * be bound with OF nodes and PCI devices simultaneously. The EEH devices are
+ * fundamental information for the EEH core components to work properly.
+ * Besides, we have to support multiple situations where dynamic creation of
+ * EEH devices is required:
+ *
+ * 1) Before PCI enumeration starts, we need to create EEH devices according
+ *    to the PCI-sensitive OF nodes.
+ * 2) When PCI enumeration is done, we need to do the binding between PCI
+ *    devices and the associated EEH devices.
+ * 3) DR (Dynamic Reconfiguration) can create PCI-sensitive OF nodes. An EEH
+ *    device will be created when a PCI-sensitive OF node is detected from DR.
+ * 4) PCI hotplug needs to redo the binding between PCI devices and EEH
+ *    devices. If a PHB is newly inserted, we also need to create EEH devices
+ *    accordingly.
+ *
+ * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/string.h>
+
+#include <asm/pci-bridge.h>
+#include <asm/ppc-pci.h>
+
+/**
+ * eeh_dev_init - Create EEH device according to OF node
+ * @dn: device node
+ * @data: PHB
+ *
+ * It will create an EEH device for the given OF node. The function
+ * might be called during PCI enumeration, DR, or PHB hotplug.
+ */
+void *eeh_dev_init(struct device_node *dn, void *data)
+{
+	struct pci_controller *phb = data;
+	struct eeh_dev *edev;
+
+	/* Allocate EEH device */
+	edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+	if (!edev) {
+		pr_warning("%s: out of memory\n", __func__);
+		return NULL;
+	}
+
+	/* Associate EEH device with OF node */
+	PCI_DN(dn)->edev = edev;
+	edev->dn = dn;
+	edev->phb = phb;
+	INIT_LIST_HEAD(&edev->list);
+
+	return NULL;
+}
+
+/**
+ * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
+ * @phb: PHB
+ *
+ * Scan the PHB OF node and its children, then create the
+ * EEH devices accordingly
+ */
+void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
+{
+	struct device_node *dn = phb->dn;
+
+	/* EEH PE for PHB */
+	eeh_phb_pe_create(phb);
+
+	/* EEH device for PHB */
+	eeh_dev_init(dn, phb);
+
+	/* EEH devices for children OF nodes */
+	traverse_pci_devices(dn, eeh_dev_init, phb);
+}
+
+/**
+ * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
+ *
+ * Scan all the existing PHBs and create EEH devices for their OF
+ * nodes and their child OF nodes
+ */
+static int __init eeh_dev_phb_init(void)
+{
+	struct pci_controller *phb, *tmp;
+
+	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
+		eeh_dev_phb_init_dynamic(phb);
+
+	pr_info("EEH: devices created\n");
+
+	return 0;
+}
+
+core_initcall(eeh_dev_phb_init);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
new file mode 100644
index 000000000000..2b1ce17cae50
--- /dev/null
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -0,0 +1,661 @@
+/*
+ * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
+ * Copyright IBM Corp. 2004 2005
+ * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
+ *
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/ppc-pci.h> +#include <asm/pci-bridge.h> +#include <asm/prom.h> +#include <asm/rtas.h> + +/** + * eeh_pcid_name - Retrieve name of PCI device driver + * @pdev: PCI device + * + * This routine is used to retrieve the name of PCI device driver + * if that's valid. + */ +static inline const char *eeh_pcid_name(struct pci_dev *pdev) +{ + if (pdev && pdev->dev.driver) + return pdev->dev.driver->name; + return ""; +} + +/** + * eeh_pcid_get - Get the PCI device driver + * @pdev: PCI device + * + * The function is used to retrieve the PCI device driver for + * the indicated PCI device. Besides, we will increase the reference + * of the PCI device driver to prevent that being unloaded on + * the fly. Otherwise, kernel crash would be seen. + */ +static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return NULL; + + if (!try_module_get(pdev->driver->driver.owner)) + return NULL; + + return pdev->driver; +} + +/** + * eeh_pcid_put - Dereference on the PCI device driver + * @pdev: PCI device + * + * The function is called to do dereference on the PCI device + * driver of the indicated PCI device. + */ +static inline void eeh_pcid_put(struct pci_dev *pdev) +{ + if (!pdev || !pdev->driver) + return; + + module_put(pdev->driver->driver.owner); +} + +#if 0 +static void print_device_node_tree(struct pci_dn *pdn, int dent) +{ + int i; + struct device_node *pc; + + if (!pdn) + return; + for (i = 0; i < dent; i++) + printk(" "); + printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n", + pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr, + pdn->eeh_pe_config_addr, pdn->node->full_name); + dent += 3; + pc = pdn->node->child; + while (pc) { + print_device_node_tree(PCI_DN(pc), dent); + pc = pc->sibling; + } +} +#endif + +/** + * eeh_disable_irq - Disable interrupt for the recovering device + * @dev: PCI device + * + * This routine must be called when reporting temporary or permanent + * error to the particular PCI device to disable interrupt of that + * device. If the device has enabled MSI or MSI-X interrupt, we needn't + * do real work because EEH should freeze DMA transfers for those PCI + * devices encountering EEH errors, which includes MSI or MSI-X. + */ +static void eeh_disable_irq(struct pci_dev *dev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + + /* Don't disable MSI and MSI-X interrupts. They are + * effectively disabled by the DMA Stopped state + * when an EEH error occurs. + */ + if (dev->msi_enabled || dev->msix_enabled) + return; + + if (!irq_has_action(dev->irq)) + return; + + edev->mode |= EEH_DEV_IRQ_DISABLED; + disable_irq_nosync(dev->irq); +} + +/** + * eeh_enable_irq - Enable interrupt for the recovering device + * @dev: PCI device + * + * This routine must be called to enable interrupt while failed + * device could be resumed. 
+ */ +static void eeh_enable_irq(struct pci_dev *dev) +{ + struct eeh_dev *edev = pci_dev_to_eeh_dev(dev); + + if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { + edev->mode &= ~EEH_DEV_IRQ_DISABLED; + enable_irq(dev->irq); + } +} + +/** + * eeh_report_error - Report pci error to each device driver + * @data: eeh device + * @userdata: return value + * + * Report an EEH error to each device driver, collect up and + * merge the device driver responses. Cumulative response + * passed back in "userdata". + */ +static void *eeh_report_error(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + /* We might not have the associated PCI device, + * then we should continue for next one. + */ + if (!dev) return NULL; + dev->error_state = pci_channel_io_frozen; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_disable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); + + /* A driver that needs a reset trumps all others */ + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled + * @data: eeh device + * @userdata: return value + * + * Tells each device driver that IO ports, MMIO and config space I/O + * are now enabled. Collects up and merges the device driver responses. + * Cumulative response passed back in "userdata". + */ +static void *eeh_report_mmio_enabled(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + if (!driver->err_handler || + !driver->err_handler->mmio_enabled) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->mmio_enabled(dev); + + /* A driver that needs a reset trumps all others */ + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_reset - Tell device that slot has been reset + * @data: eeh device + * @userdata: return value + * + * This routine must be called while EEH tries to reset particular + * PCI device so that the associated PCI device driver could take + * some actions, usually to save data the driver needs so that the + * driver can work again while the device is recovered. 
+ */ +static void *eeh_report_reset(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver; + + if (!dev) return NULL; + dev->error_state = pci_channel_io_normal; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_enable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->slot_reset) { + eeh_pcid_put(dev); + return NULL; + } + + rc = driver->err_handler->slot_reset(dev); + if ((*res == PCI_ERS_RESULT_NONE) || + (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc; + if (*res == PCI_ERS_RESULT_DISCONNECT && + rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_resume - Tell device to resume normal operations + * @data: eeh device + * @userdata: return value + * + * This routine must be called to notify the device driver that it + * could resume so that the device driver can do some initialization + * to make the recovered device work again. + */ +static void *eeh_report_resume(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + + if (!dev) return NULL; + dev->error_state = pci_channel_io_normal; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_enable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->resume) { + eeh_pcid_put(dev); + return NULL; + } + + driver->err_handler->resume(dev); + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_report_failure - Tell device driver that device is dead. + * @data: eeh device + * @userdata: return value + * + * This informs the device driver that the device is permanently + * dead, and that no further recovery attempts will be made on it. + */ +static void *eeh_report_failure(void *data, void *userdata) +{ + struct eeh_dev *edev = (struct eeh_dev *)data; + struct pci_dev *dev = eeh_dev_to_pci_dev(edev); + struct pci_driver *driver; + + if (!dev) return NULL; + dev->error_state = pci_channel_io_perm_failure; + + driver = eeh_pcid_get(dev); + if (!driver) return NULL; + + eeh_disable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->error_detected) { + eeh_pcid_put(dev); + return NULL; + } + + driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); + + eeh_pcid_put(dev); + return NULL; +} + +/** + * eeh_reset_device - Perform actual reset of a pci slot + * @pe: EEH PE + * @bus: PCI bus corresponding to the isolated slot + * + * This routine must be called to do reset on the indicated PE. + * During the reset, udev events might be generated because the + * affected PCI devices will be removed and then re-added. + */ +static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) +{ + struct timeval tstamp; + int cnt, rc; + + /* pcibios will clear the counter; save the value */ + cnt = pe->freeze_count; + tstamp = pe->tstamp; + + /* + * We don't remove the corresponding PE instances because + * we need the information afterwards. The attached EEH + * devices are expected to be re-attached when calling + * into pcibios_add_pci_devices(). + */ + if (bus) + __pcibios_remove_pci_devices(bus, 0); + + /* Reset the pci controller. (Asserts RST#; resets config space). + * Reconfigure bridges and devices. Don't try to bring the system + * up if the reset failed for some reason.
+ */ + rc = eeh_reset_pe(pe); + if (rc) + return rc; + + /* Restore PE */ + eeh_ops->configure_bridge(pe); + eeh_pe_restore_bars(pe); + + /* Give the system 5 seconds to finish running the user-space + * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, + * this is a hack, but if we don't do this, and try to bring + * the device up before the scripts have taken it down, + * potentially weird things happen. + */ + if (bus) { + ssleep(5); + pcibios_add_pci_devices(bus); + } + + pe->tstamp = tstamp; + pe->freeze_count = cnt; + + return 0; +} + +/* The longest amount of time to wait for a pci device + * to come back on line, in seconds. + */ +#define MAX_WAIT_FOR_RECOVERY 150 + +static void eeh_handle_normal_event(struct eeh_pe *pe) +{ + struct pci_bus *frozen_bus; + int rc = 0; + enum pci_ers_result result = PCI_ERS_RESULT_NONE; + + frozen_bus = eeh_pe_bus_get(pe); + if (!frozen_bus) { + pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n", + __func__, pe->phb->global_number, pe->addr); + return; + } + + eeh_pe_update_time_stamp(pe); + pe->freeze_count++; + if (pe->freeze_count > EEH_MAX_ALLOWED_FREEZES) + goto excess_failures; + pr_warning("EEH: This PCI device has failed %d times in the last hour\n", + pe->freeze_count); + + /* Walk the various device drivers attached to this slot through + * a reset sequence, giving each an opportunity to do what it needs + * to accomplish the reset. Each child gets a report of the + * status ... if any child can't handle the reset, then the entire + * slot is dlpar removed and added. + */ + pr_info("EEH: Notify device drivers to shutdown\n"); + eeh_pe_dev_traverse(pe, eeh_report_error, &result); + + /* Get the current PCI slot state. This can take a long time, + * sometimes over 3 seconds for certain systems. + */ + rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { + pr_warning("EEH: Permanent failure\n"); + goto hard_fail; + } + + /* Since rtas may enable MMIO when posting the error log, + * don't post the error log until after all dev drivers + * have been informed. + */ + pr_info("EEH: Collect temporary log\n"); + eeh_slot_error_detail(pe, EEH_LOG_TEMP); + + /* If all device drivers were EEH-unaware, then shut + * down all of the device drivers, and hope they + * go down willingly, without panicing the system. + */ + if (result == PCI_ERS_RESULT_NONE) { + pr_info("EEH: Reset with hotplug activity\n"); + rc = eeh_reset_device(pe, frozen_bus); + if (rc) { + pr_warning("%s: Unable to reset, err=%d\n", + __func__, rc); + goto hard_fail; + } + } + + /* If all devices reported they can proceed, then re-enable MMIO */ + if (result == PCI_ERS_RESULT_CAN_RECOVER) { + pr_info("EEH: Enable I/O for affected devices\n"); + rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO); + + if (rc < 0) + goto hard_fail; + if (rc) { + result = PCI_ERS_RESULT_NEED_RESET; + } else { + pr_info("EEH: Notify device drivers to resume I/O\n"); + result = PCI_ERS_RESULT_NONE; + eeh_pe_dev_traverse(pe, eeh_report_mmio_enabled, &result); + } + } + + /* If all devices reported they can proceed, then re-enable DMA */ + if (result == PCI_ERS_RESULT_CAN_RECOVER) { + pr_info("EEH: Enabled DMA for affected devices\n"); + rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA); + + if (rc < 0) + goto hard_fail; + if (rc) + result = PCI_ERS_RESULT_NEED_RESET; + else + result = PCI_ERS_RESULT_RECOVERED; + } + + /* If any device has a hard failure, then shut off everything. 
*/ + if (result == PCI_ERS_RESULT_DISCONNECT) { + pr_warning("EEH: Device driver gave up\n"); + goto hard_fail; + } + + /* If any device called out for a reset, then reset the slot */ + if (result == PCI_ERS_RESULT_NEED_RESET) { + pr_info("EEH: Reset without hotplug activity\n"); + rc = eeh_reset_device(pe, NULL); + if (rc) { + pr_warning("%s: Cannot reset, err=%d\n", + __func__, rc); + goto hard_fail; + } + + pr_info("EEH: Notify device drivers " + "the completion of reset\n"); + result = PCI_ERS_RESULT_NONE; + eeh_pe_dev_traverse(pe, eeh_report_reset, &result); + } + + /* All devices should claim they have recovered by now. */ + if ((result != PCI_ERS_RESULT_RECOVERED) && + (result != PCI_ERS_RESULT_NONE)) { + pr_warning("EEH: Not recovered\n"); + goto hard_fail; + } + + /* Tell all device drivers that they can resume operations */ + pr_info("EEH: Notify device driver to resume\n"); + eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); + + return; + +excess_failures: + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n" + "last hour and has been permanently disabled.\n" + "Please try reseating or replacing it.\n", + pe->phb->global_number, pe->addr, + pe->freeze_count); + goto perm_error; + +hard_fail: + pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n" + "Please try reseating or replacing it\n", + pe->phb->global_number, pe->addr); + +perm_error: + eeh_slot_error_detail(pe, EEH_LOG_PERM); + + /* Notify all devices that they're about to go down. */ + eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); + + /* Shut down the device drivers for good. */ + if (frozen_bus) + pcibios_remove_pci_devices(frozen_bus); +} + +static void eeh_handle_special_event(void) +{ + struct eeh_pe *pe, *phb_pe; + struct pci_bus *bus; + struct pci_controller *hose, *tmp; + unsigned long flags; + int rc = 0; + + /* + * The return value from next_error() has been classified as follows. + * It might be good to enumerate them. However, next_error() is only + * supported by PowerNV platform for now. So it would be fine to use + * integer directly: + * + * 4 - Dead IOC 3 - Dead PHB + * 2 - Fenced PHB 1 - Frozen PE + * 0 - No error found + * + */ + rc = eeh_ops->next_error(&pe); + if (rc <= 0) + return; + + switch (rc) { + case 4: + /* Mark all PHBs in dead state */ + eeh_serialize_lock(&flags); + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) continue; + + eeh_pe_state_mark(phb_pe, + EEH_PE_ISOLATED | EEH_PE_PHB_DEAD); + } + eeh_serialize_unlock(flags); + + /* Purge all events */ + eeh_remove_event(NULL); + break; + case 3: + case 2: + case 1: + /* Mark the PE in fenced state */ + eeh_serialize_lock(&flags); + if (rc == 3) + eeh_pe_state_mark(pe, + EEH_PE_ISOLATED | EEH_PE_PHB_DEAD); + else + eeh_pe_state_mark(pe, + EEH_PE_ISOLATED | EEH_PE_RECOVERING); + eeh_serialize_unlock(flags); + + /* Purge all events of the PHB */ + eeh_remove_event(pe); + break; + default: + pr_err("%s: Invalid value %d from next_error()\n", + __func__, rc); + return; + } + + /* + * For fenced PHB and frozen PE, it's handled as normal + * event. 
We have to remove the affected PHBs for dead + * PHB and IOC + */ + if (rc == 2 || rc == 1) + eeh_handle_normal_event(pe); + else { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD)) + continue; + + bus = eeh_pe_bus_get(phb_pe); + /* Notify all devices that they're about to go down. */ + eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); + pcibios_remove_pci_devices(bus); + } + } +} + +/** + * eeh_handle_event - Reset a PCI device after hard lockup. + * @pe: EEH PE + * + * When the PHB detects address or data parity errors on a particular + * PCI slot, the associated PE will be frozen. Besides, DMAs occurring + * to wild addresses (which usually happen due to bugs in device + * drivers or in PCI adapter firmware) can cause EEH errors. #SERR, + * #PERR or other misc PCI-related errors also can trigger EEH errors. + * + * The recovery process consists of unplugging the device driver (which + * generates hotplug events to userspace), then issuing a PCI #RST to + * the device, then reconfiguring the PCI config space for all bridges + * & devices under this slot, and then finally restarting the device + * drivers (which cause a second set of hotplug events to go out to + * userspace). + */ +void eeh_handle_event(struct eeh_pe *pe) +{ + if (pe) + eeh_handle_normal_event(pe); + else + eeh_handle_special_event(); +} diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c new file mode 100644 index 000000000000..d27c5afc90ae --- /dev/null +++ b/arch/powerpc/kernel/eeh_event.c @@ -0,0 +1,182 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2005 Linas Vepstas <linas@linas.org> + */ + +#include <linux/delay.h> +#include <linux/list.h> +#include <linux/sched.h> +#include <linux/semaphore.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <asm/eeh_event.h> +#include <asm/ppc-pci.h> + +/** Overview: + * EEH error states may be detected within exception handlers; + * however, the recovery processing needs to occur asynchronously + * in a normal kernel context and not an interrupt context. + * This pair of routines creates an event and queues it onto an + * event list, where a worker thread (eehd) can drive recovery. + */ + +static DEFINE_SPINLOCK(eeh_eventlist_lock); +static struct semaphore eeh_eventlist_sem; +LIST_HEAD(eeh_eventlist); + +/** + * eeh_event_handler - Dispatch EEH events. + * @dummy: unused + * + * The detection of a frozen slot can occur inside an interrupt, + * where it can be hard to do anything about it. The goal of this + * routine is to pull these detection events out of the context + * of the interrupt handler, and re-dispatch them for processing + * at a later time in a normal context.
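A sketch of the hand-off this comment describes, seen from the producer side (hypothetical code, not in the patch; how a PE gets bound to an error interrupt is platform-specific):

#include <linux/interrupt.h>
#include <asm/eeh_event.h>

static irqreturn_t example_eeh_irq_handler(int irq, void *data)
{
	struct eeh_pe *pe = data;	/* hypothetical PE binding */

	/*
	 * Safe in hard-IRQ context: eeh_send_failure_event() (below)
	 * allocates with GFP_ATOMIC, queues under a spinlock, and does
	 * up() on the semaphore to wake the "eehd" kthread.
	 */
	eeh_send_failure_event(pe);

	return IRQ_HANDLED;
}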
+ */ +static int eeh_event_handler(void *dummy) +{ + unsigned long flags; + struct eeh_event *event; + struct eeh_pe *pe; + + while (!kthread_should_stop()) { + if (down_interruptible(&eeh_eventlist_sem)) + break; + + /* Fetch EEH event from the queue */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, + struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + if (!event) + continue; + + /* We might have an event without a bound PE */ + pe = event->pe; + if (pe) { + eeh_pe_state_mark(pe, EEH_PE_RECOVERING); + pr_info("EEH: Detected PCI bus error on PHB#%d-PE#%x\n", + pe->phb->global_number, pe->addr); + eeh_handle_event(pe); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); + } else { + eeh_handle_event(NULL); + } + + kfree(event); + } + + return 0; +} + +/** + * eeh_event_init - Start kernel thread to handle EEH events + * + * This routine is called to start the kernel thread for processing + * EEH events. + */ +int eeh_event_init(void) +{ + struct task_struct *t; + int ret = 0; + + /* Initialize semaphore */ + sema_init(&eeh_eventlist_sem, 0); + + t = kthread_run(eeh_event_handler, NULL, "eehd"); + if (IS_ERR(t)) { + ret = PTR_ERR(t); + pr_err("%s: Failed to start EEH daemon (%d)\n", + __func__, ret); + return ret; + } + + return 0; +} + +/** + * eeh_send_failure_event - Generate a PCI error event + * @pe: EEH PE + * + * This routine can be called within an interrupt context; + * the actual event will be delivered in a normal context + * (by the eehd kernel thread). + */ +int eeh_send_failure_event(struct eeh_pe *pe) +{ + unsigned long flags; + struct eeh_event *event; + + event = kzalloc(sizeof(*event), GFP_ATOMIC); + if (!event) { + pr_err("EEH: out of memory, event not handled\n"); + return -ENOMEM; + } + event->pe = pe; + + /* We may or may not be called in an interrupt context */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_add(&event->list, &eeh_eventlist); + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + + /* Let the EEH daemon kick in */ + up(&eeh_eventlist_sem); + + return 0; +} + +/** + * eeh_remove_event - Remove EEH event from the queue + * @pe: PE the stale events are bound to + * + * On the PowerNV platform, subsequent events might be part of an + * earlier one. In that case the later events are duplicates and + * unnecessary, so they should be removed. + */ +void eeh_remove_event(struct eeh_pe *pe) +{ + unsigned long flags; + struct eeh_event *event, *tmp; + + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_for_each_entry_safe(event, tmp, &eeh_eventlist, list) { + /* + * If no valid PE is passed in, we have an event + * corresponding to a dead IOC and all events + * should be purged. + */ + if (!pe) { + list_del(&event->list); + kfree(event); + } else if (pe->type & EEH_PE_PHB) { + if (event->pe && event->pe->phb == pe->phb) { + list_del(&event->list); + kfree(event); + } + } else if (event->pe == pe) { + list_del(&event->list); + kfree(event); + } + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); +} diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c new file mode 100644 index 000000000000..016588a6f5ed --- /dev/null +++ b/arch/powerpc/kernel/eeh_pe.c @@ -0,0 +1,800 @@ +/* + * This file implements PEs based on the information from the + * platform. Basically, there are 3 types of PEs: PHB/Bus/Device.
+ * All the PEs should be organized as hierarchy tree. The first level + * of the tree will be associated to existing PHBs since the particular + * PE is only meaningful in one PHB domain. + * + * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/export.h> +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + +static LIST_HEAD(eeh_phb_pe); + +/** + * eeh_pe_alloc - Allocate PE + * @phb: PCI controller + * @type: PE type + * + * Allocate PE instance dynamically. + */ +static struct eeh_pe *eeh_pe_alloc(struct pci_controller *phb, int type) +{ + struct eeh_pe *pe; + + /* Allocate PHB PE */ + pe = kzalloc(sizeof(struct eeh_pe), GFP_KERNEL); + if (!pe) return NULL; + + /* Initialize PHB PE */ + pe->type = type; + pe->phb = phb; + INIT_LIST_HEAD(&pe->child_list); + INIT_LIST_HEAD(&pe->child); + INIT_LIST_HEAD(&pe->edevs); + + return pe; +} + +/** + * eeh_phb_pe_create - Create PHB PE + * @phb: PCI controller + * + * The function should be called while the PHB is detected during + * system boot or PCI hotplug in order to create PHB PE. + */ +int eeh_phb_pe_create(struct pci_controller *phb) +{ + struct eeh_pe *pe; + + /* Allocate PHB PE */ + pe = eeh_pe_alloc(phb, EEH_PE_PHB); + if (!pe) { + pr_err("%s: out of memory!\n", __func__); + return -ENOMEM; + } + + /* Put it into the list */ + list_add_tail(&pe->child, &eeh_phb_pe); + + pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number); + + return 0; +} + +/** + * eeh_phb_pe_get - Retrieve PHB PE based on the given PHB + * @phb: PCI controller + * + * The overall PEs form hierarchy tree. The first layer of the + * hierarchy tree is composed of PHB PEs. The function is used + * to retrieve the corresponding PHB PE according to the given PHB. + */ +struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb) +{ + struct eeh_pe *pe; + + list_for_each_entry(pe, &eeh_phb_pe, child) { + /* + * Actually, we needn't check the type since + * the PE for PHB has been determined when that + * was created. + */ + if ((pe->type & EEH_PE_PHB) && pe->phb == phb) + return pe; + } + + return NULL; +} + +/** + * eeh_pe_next - Retrieve the next PE in the tree + * @pe: current PE + * @root: root PE + * + * The function is used to retrieve the next PE in the + * hierarchy PE tree. 
+ */ +static struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, + struct eeh_pe *root) +{ + struct list_head *next = pe->child_list.next; + + if (next == &pe->child_list) { + while (1) { + if (pe == root) + return NULL; + next = pe->child.next; + if (next != &pe->parent->child_list) + break; + pe = pe->parent; + } + } + + return list_entry(next, struct eeh_pe, child); +} + +/** + * eeh_pe_traverse - Traverse PEs in the specified PHB + * @root: root PE + * @fn: callback + * @flag: extra parameter to callback + * + * The function is used to traverse the specified PE and its + * child PEs. The traversal terminates once the callback + * returns something other than NULL, or when there are no + * more PEs to visit. + */ +static void *eeh_pe_traverse(struct eeh_pe *root, + eeh_traverse_func fn, void *flag) +{ + struct eeh_pe *pe; + void *ret; + + for (pe = root; pe; pe = eeh_pe_next(pe, root)) { + ret = fn(pe, flag); + if (ret) return ret; + } + + return NULL; +} + +/** + * eeh_pe_dev_traverse - Traverse the devices from the PE + * @root: EEH PE + * @fn: function callback + * @flag: extra parameter to callback + * + * The function is used to traverse the devices of the specified + * PE and its child PEs. + */ +void *eeh_pe_dev_traverse(struct eeh_pe *root, + eeh_traverse_func fn, void *flag) +{ + struct eeh_pe *pe; + struct eeh_dev *edev; + void *ret; + + if (!root) { + pr_warning("%s: Invalid PE %p\n", __func__, root); + return NULL; + } + + /* Traverse root PE */ + for (pe = root; pe; pe = eeh_pe_next(pe, root)) { + eeh_pe_for_each_dev(pe, edev) { + ret = fn(edev, flag); + if (ret) + return ret; + } + } + + return NULL; +} + +/** + * __eeh_pe_get - Check the PE address + * @data: EEH PE + * @flag: EEH device + * + * One particular PE can be identified by its PE address or by + * its traditional BDF address, which is composed of the + * Bus/Device/Function number. The extra data referred to by flag + * indicates which type of address should be used. + */ +static void *__eeh_pe_get(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + struct eeh_dev *edev = (struct eeh_dev *)flag; + + /* Unexpected PHB PE */ + if (pe->type & EEH_PE_PHB) + return NULL; + + /* We prefer PE address */ + if (edev->pe_config_addr && + (edev->pe_config_addr == pe->addr)) + return pe; + + /* Try BDF address */ + if (edev->config_addr && + (edev->config_addr == pe->config_addr)) + return pe; + + return NULL; +} + +/** + * eeh_pe_get - Search PE based on the given address + * @edev: EEH device + * + * Search the corresponding PE based on the specified address which + * is included in the eeh device. The function is used to check if + * the associated PE has been created against the PE address. Note + * that the PE address has 2 formats: the traditional PE address, + * which is composed of PCI bus/device/function numbers, or the + * unified PE address. + */ +struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) +{ + struct eeh_pe *root = eeh_phb_pe_get(edev->phb); + struct eeh_pe *pe; + + pe = eeh_pe_traverse(root, __eeh_pe_get, edev); + + return pe; +} + +/** + * eeh_pe_get_parent - Retrieve the parent PE + * @edev: EEH device + * + * All the PEs in the system are organized as a hierarchy + * tree. The function is used to retrieve the parent PE according + * to the parent EEH device.
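A typical use of eeh_pe_dev_traverse() defined above, shown as a sketch (not patch code): the callback is invoked once per EEH device under the PE, and returning non-NULL stops the walk early.

static void *example_count_dev(void *data, void *flag)
{
	(*(int *)flag)++;	/* data is the struct eeh_dev *, unused here */
	return NULL;		/* keep walking */
}

static int example_count_pe_devs(struct eeh_pe *root)
{
	int cnt = 0;

	eeh_pe_dev_traverse(root, example_count_dev, &cnt);
	return cnt;
}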
+ */ +static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) +{ + struct device_node *dn; + struct eeh_dev *parent; + + /* + * It might have the case for the indirect parent + * EEH device already having associated PE, but + * the direct parent EEH device doesn't have yet. + */ + dn = edev->dn->parent; + while (dn) { + /* We're poking out of PCI territory */ + if (!PCI_DN(dn)) return NULL; + + parent = of_node_to_eeh_dev(dn); + /* We're poking out of PCI territory */ + if (!parent) return NULL; + + if (parent->pe) + return parent->pe; + + dn = dn->parent; + } + + return NULL; +} + +/** + * eeh_add_to_parent_pe - Add EEH device to parent PE + * @edev: EEH device + * + * Add EEH device to the parent PE. If the parent PE already + * exists, the PE type will be changed to EEH_PE_BUS. Otherwise, + * we have to create new PE to hold the EEH device and the new + * PE will be linked to its parent PE as well. + */ +int eeh_add_to_parent_pe(struct eeh_dev *edev) +{ + struct eeh_pe *pe, *parent; + + /* + * Search the PE has been existing or not according + * to the PE address. If that has been existing, the + * PE should be composed of PCI bus and its subordinate + * components. + */ + pe = eeh_pe_get(edev); + if (pe && !(pe->type & EEH_PE_INVALID)) { + if (!edev->pe_config_addr) { + pr_err("%s: PE with addr 0x%x already exists\n", + __func__, edev->config_addr); + return -EEXIST; + } + + /* Mark the PE as type of PCI bus */ + pe->type = EEH_PE_BUS; + edev->pe = pe; + + /* Put the edev to PE */ + list_add_tail(&edev->list, &pe->edevs); + pr_debug("EEH: Add %s to Bus PE#%x\n", + edev->dn->full_name, pe->addr); + + return 0; + } else if (pe && (pe->type & EEH_PE_INVALID)) { + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + /* + * We're running to here because of PCI hotplug caused by + * EEH recovery. We need clear EEH_PE_INVALID until the top. + */ + parent = pe; + while (parent) { + if (!(parent->type & EEH_PE_INVALID)) + break; + parent->type &= ~EEH_PE_INVALID; + parent = parent->parent; + } + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; + } + + /* Create a new EEH PE */ + pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); + if (!pe) { + pr_err("%s: out of memory!\n", __func__); + return -ENOMEM; + } + pe->addr = edev->pe_config_addr; + pe->config_addr = edev->config_addr; + + /* + * While doing PE reset, we probably hot-reset the + * upstream bridge. However, the PCI devices including + * the associated EEH devices might be removed when EEH + * core is doing recovery. So that won't safe to retrieve + * the bridge through downstream EEH device. We have to + * trace the parent PCI bus, then the upstream bridge. + */ + if (eeh_probe_mode_dev()) + pe->bus = eeh_dev_to_pci_dev(edev)->bus; + + /* + * Put the new EEH PE into hierarchy tree. If the parent + * can't be found, the newly created PE will be attached + * to PHB directly. Otherwise, we have to associate the + * PE with its parent. + */ + parent = eeh_pe_get_parent(edev); + if (!parent) { + parent = eeh_phb_pe_get(edev->phb); + if (!parent) { + pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", + __func__, edev->phb->global_number); + edev->pe = NULL; + kfree(pe); + return -EEXIST; + } + } + pe->parent = parent; + + /* + * Put the newly created PE into the child list and + * link the EEH device accordingly. 
+ */ + list_add_tail(&pe->child, &parent->child_list); + list_add_tail(&edev->list, &pe->edevs); + edev->pe = pe; + pr_debug("EEH: Add %s to Device PE#%x, Parent PE#%x\n", + edev->dn->full_name, pe->addr, pe->parent->addr); + + return 0; +} + +/** + * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE + * @edev: EEH device + * @purge_pe: remove PE or not + * + * The PE hierarchy tree might be changed when doing PCI hotplug. + * Also, the PCI devices or buses could be removed from the system + * during EEH recovery. So we have to call this function to remove + * the corresponding PE if necessary. + */ +int eeh_rmv_from_parent_pe(struct eeh_dev *edev, int purge_pe) +{ + struct eeh_pe *pe, *parent, *child; + int cnt; + + if (!edev->pe) { + pr_warning("%s: No PE found for EEH device %s\n", + __func__, edev->dn->full_name); + return -EEXIST; + } + + /* Remove the EEH device */ + pe = edev->pe; + edev->pe = NULL; + list_del(&edev->list); + + /* + * Check if the parent PE includes any EEH devices. + * If not, we should delete that. Also, we should + * delete the parent PE if it doesn't have associated + * child PEs and EEH devices. + */ + while (1) { + parent = pe->parent; + if (pe->type & EEH_PE_PHB) + break; + + if (purge_pe) { + if (list_empty(&pe->edevs) && + list_empty(&pe->child_list)) { + list_del(&pe->child); + kfree(pe); + } else { + break; + } + } else { + if (list_empty(&pe->edevs)) { + cnt = 0; + list_for_each_entry(child, &pe->child_list, child) { + if (!(child->type & EEH_PE_INVALID)) { + cnt++; + break; + } + } + + if (!cnt) + pe->type |= EEH_PE_INVALID; + else + break; + } + } + + pe = parent; + } + + return 0; +} + +/** + * eeh_pe_update_time_stamp - Update PE's frozen time stamp + * @pe: EEH PE + * + * We keep a time stamp for each PE to trace when it was first + * frozen within the last hour. The function should be called to + * update the time stamp on the first error of a specific PE. On + * the other hand, errors that happened more than an hour ago + * needn't be accounted for. + */ +void eeh_pe_update_time_stamp(struct eeh_pe *pe) +{ + struct timeval tstamp; + + if (!pe) return; + + if (pe->freeze_count <= 0) { + pe->freeze_count = 0; + do_gettimeofday(&pe->tstamp); + } else { + do_gettimeofday(&tstamp); + if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + pe->tstamp = tstamp; + pe->freeze_count = 0; + } + } +} + +/** + * __eeh_pe_state_mark - Mark the state for the PE + * @data: EEH PE + * @flag: state + * + * The function is used to mark the indicated state for the given + * PE. Also, the associated PCI devices will be put into IO frozen + * state as well. + */ +static void *__eeh_pe_state_mark(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int state = *((int *)flag); + struct eeh_dev *tmp; + struct pci_dev *pdev; + + /* + * Mark the PE with the indicated state. Also, + * the associated PCI device will be put into + * I/O frozen state to avoid I/O accesses from + * the PCI device driver. + */ + pe->state |= state; + eeh_pe_for_each_dev(pe, tmp) { + pdev = eeh_dev_to_pci_dev(tmp); + if (pdev) + pdev->error_state = pci_channel_io_frozen; + } + + return NULL; +} + +/** + * eeh_pe_state_mark - Mark specified state for PE and its associated device + * @pe: EEH PE + * @state: state bits to be set + * + * EEH error affects the current PE and its child PEs. The function + * is used to mark the appropriate state for the affected PEs and + * the associated devices.
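The mark/clear helpers documented here pair up around recovery; this is the pattern the eehd thread in eeh_event.c follows (sketch, not patch code):

static void example_recover(struct eeh_pe *pe)
{
	/* Freeze the PE and its devices before touching them */
	eeh_pe_state_mark(pe, EEH_PE_RECOVERING);

	eeh_handle_event(pe);	/* drive the recovery state machine */

	/* Recovery finished (or failed); drop the flag */
	eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
}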
+ */ +void eeh_pe_state_mark(struct eeh_pe *pe, int state) +{ + eeh_pe_traverse(pe, __eeh_pe_state_mark, &state); +} + +/** + * __eeh_pe_state_clear - Clear state for the PE + * @data: EEH PE + * @flag: state + * + * The function is used to clear the indicated state from the + * given PE. Besides, we also clear the check count of the PE + * as well. + */ +static void *__eeh_pe_state_clear(void *data, void *flag) +{ + struct eeh_pe *pe = (struct eeh_pe *)data; + int state = *((int *)flag); + + pe->state &= ~state; + pe->check_count = 0; + + return NULL; +} + +/** + * eeh_pe_state_clear - Clear state for the PE and its children + * @pe: PE + * @state: state to be cleared + * + * When the PE and its children has been recovered from error, + * we need clear the error state for that. The function is used + * for the purpose. + */ +void eeh_pe_state_clear(struct eeh_pe *pe, int state) +{ + eeh_pe_traverse(pe, __eeh_pe_state_clear, &state); +} + +/* + * Some PCI bridges (e.g. PLX bridges) have primary/secondary + * buses assigned explicitly by firmware, and we probably have + * lost that after reset. So we have to delay the check until + * the PCI-CFG registers have been restored for the parent + * bridge. + * + * Don't use normal PCI-CFG accessors, which probably has been + * blocked on normal path during the stage. So we need utilize + * eeh operations, which is always permitted. + */ +static void eeh_bridge_check_link(struct pci_dev *pdev, + struct device_node *dn) +{ + int cap; + uint32_t val; + int timeout = 0; + + /* + * We only check root port and downstream ports of + * PCIe switches + */ + if (!pci_is_pcie(pdev) || + (pci_pcie_type(pdev) != PCI_EXP_TYPE_ROOT_PORT && + pci_pcie_type(pdev) != PCI_EXP_TYPE_DOWNSTREAM)) + return; + + pr_debug("%s: Check PCIe link for %s ...\n", + __func__, pci_name(pdev)); + + /* Check slot status */ + cap = pdev->pcie_cap; + eeh_ops->read_config(dn, cap + PCI_EXP_SLTSTA, 2, &val); + if (!(val & PCI_EXP_SLTSTA_PDS)) { + pr_debug(" No card in the slot (0x%04x) !\n", val); + return; + } + + /* Check power status if we have the capability */ + eeh_ops->read_config(dn, cap + PCI_EXP_SLTCAP, 2, &val); + if (val & PCI_EXP_SLTCAP_PCP) { + eeh_ops->read_config(dn, cap + PCI_EXP_SLTCTL, 2, &val); + if (val & PCI_EXP_SLTCTL_PCC) { + pr_debug(" In power-off state, power it on ...\n"); + val &= ~(PCI_EXP_SLTCTL_PCC | PCI_EXP_SLTCTL_PIC); + val |= (0x0100 & PCI_EXP_SLTCTL_PIC); + eeh_ops->write_config(dn, cap + PCI_EXP_SLTCTL, 2, val); + msleep(2 * 1000); + } + } + + /* Enable link */ + eeh_ops->read_config(dn, cap + PCI_EXP_LNKCTL, 2, &val); + val &= ~PCI_EXP_LNKCTL_LD; + eeh_ops->write_config(dn, cap + PCI_EXP_LNKCTL, 2, val); + + /* Check link */ + eeh_ops->read_config(dn, cap + PCI_EXP_LNKCAP, 4, &val); + if (!(val & PCI_EXP_LNKCAP_DLLLARC)) { + pr_debug(" No link reporting capability (0x%08x) \n", val); + msleep(1000); + return; + } + + /* Wait the link is up until timeout (5s) */ + timeout = 0; + while (timeout < 5000) { + msleep(20); + timeout += 20; + + eeh_ops->read_config(dn, cap + PCI_EXP_LNKSTA, 2, &val); + if (val & PCI_EXP_LNKSTA_DLLLA) + break; + } + + if (val & PCI_EXP_LNKSTA_DLLLA) + pr_debug(" Link up (%s)\n", + (val & PCI_EXP_LNKSTA_CLS_2_5GB) ? 
"2.5GB" : "5GB"); + else + pr_debug(" Link not ready (0x%04x)\n", val); +} + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(edev->config_space))[BYTE_SWAP(OFF)]) + +static void eeh_restore_bridge_bars(struct pci_dev *pdev, + struct eeh_dev *edev, + struct device_node *dn) +{ + int i; + + /* + * Device BARs: 0x10 - 0x18 + * Bus numbers and windows: 0x18 - 0x30 + */ + for (i = 4; i < 13; i++) + eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); + /* Rom: 0x38 */ + eeh_ops->write_config(dn, 14*4, 4, edev->config_space[14]); + + /* Cache line & Latency timer: 0xC 0xD */ + eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + /* Max latency, min grant, interrupt ping and line: 0x3C */ + eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); + + /* PCI Command: 0x4 */ + eeh_ops->write_config(dn, PCI_COMMAND, 4, edev->config_space[1]); + + /* Check the PCIe link is ready */ + eeh_bridge_check_link(pdev, dn); +} + +static void eeh_restore_device_bars(struct eeh_dev *edev, + struct device_node *dn) +{ + int i; + u32 cmd; + + for (i = 4; i < 10; i++) + eeh_ops->write_config(dn, i*4, 4, edev->config_space[i]); + /* 12 == Expansion ROM Address */ + eeh_ops->write_config(dn, 12*4, 4, edev->config_space[12]); + + eeh_ops->write_config(dn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + eeh_ops->write_config(dn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + + /* max latency, min grant, interrupt pin and line */ + eeh_ops->write_config(dn, 15*4, 4, edev->config_space[15]); + + /* + * Restore PERR & SERR bits, some devices require it, + * don't touch the other command bits + */ + eeh_ops->read_config(dn, PCI_COMMAND, 4, &cmd); + if (edev->config_space[1] & PCI_COMMAND_PARITY) + cmd |= PCI_COMMAND_PARITY; + else + cmd &= ~PCI_COMMAND_PARITY; + if (edev->config_space[1] & PCI_COMMAND_SERR) + cmd |= PCI_COMMAND_SERR; + else + cmd &= ~PCI_COMMAND_SERR; + eeh_ops->write_config(dn, PCI_COMMAND, 4, cmd); +} + +/** + * eeh_restore_one_device_bars - Restore the Base Address Registers for one device + * @data: EEH device + * @flag: Unused + * + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static void *eeh_restore_one_device_bars(void *data, void *flag) +{ + struct pci_dev *pdev = NULL; + struct eeh_dev *edev = (struct eeh_dev *)data; + struct device_node *dn = eeh_dev_to_of_node(edev); + + /* Trace the PCI bridge */ + if (eeh_probe_mode_dev()) { + pdev = eeh_dev_to_pci_dev(edev); + if (pdev->hdr_type != PCI_HEADER_TYPE_BRIDGE) + pdev = NULL; + } + + if (pdev) + eeh_restore_bridge_bars(pdev, edev, dn); + else + eeh_restore_device_bars(edev, dn); + + return NULL; +} + +/** + * eeh_pe_restore_bars - Restore the PCI config space info + * @pe: EEH PE + * + * This routine performs a recursive walk to the children + * of this device as well. + */ +void eeh_pe_restore_bars(struct eeh_pe *pe) +{ + /* + * We needn't take the EEH lock since eeh_pe_dev_traverse() + * will take that. + */ + eeh_pe_dev_traverse(pe, eeh_restore_one_device_bars, NULL); +} + +/** + * eeh_pe_bus_get - Retrieve PCI bus according to the given PE + * @pe: EEH PE + * + * Retrieve the PCI bus according to the given PE. Basically, + * there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the + * primary PCI bus will be retrieved. 
The parent bus will be + * returned for BUS PE. However, we don't have associated PCI + * bus for DEVICE PE. + */ +struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe) +{ + struct pci_bus *bus = NULL; + struct eeh_dev *edev; + struct pci_dev *pdev; + + if (pe->type & EEH_PE_PHB) { + bus = pe->phb->bus; + } else if (pe->type & EEH_PE_BUS || + pe->type & EEH_PE_DEVICE) { + if (pe->bus) { + bus = pe->bus; + goto out; + } + + edev = list_first_entry(&pe->edevs, struct eeh_dev, list); + pdev = eeh_dev_to_pci_dev(edev); + if (pdev) + bus = pdev->bus; + } + +out: + return bus; +} diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c new file mode 100644 index 000000000000..e7ae3484918c --- /dev/null +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -0,0 +1,74 @@ +/* + * Sysfs entries for PCI Error Recovery for PAPR-compliant platform. + * Copyright IBM Corporation 2007 + * Copyright Linas Vepstas <linas@austin.ibm.com> 2007 + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ +#include <linux/pci.h> +#include <linux/stat.h> +#include <asm/ppc-pci.h> +#include <asm/pci-bridge.h> + +/** + * EEH_SHOW_ATTR -- Create sysfs entry for eeh statistic + * @_name: name of file in sysfs directory + * @_memb: name of member in struct pci_dn to access + * @_format: printf format for display + * + * All of the attributes look very similar, so just + * auto-gen a cut-n-paste routine to display them. 
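To make the macro below easier to review: for EEH_SHOW_ATTR(eeh_mode, mode, "0x%x") it expands to roughly the following, written out here for illustration only:

static ssize_t eeh_show_eeh_mode(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);

	if (!edev)
		return 0;

	return sprintf(buf, "0x%x\n", edev->mode);	/* "0x%x" "\n" pasted */
}
static DEVICE_ATTR(eeh_mode, S_IRUGO, eeh_show_eeh_mode, NULL);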
+ */ +#define EEH_SHOW_ATTR(_name,_memb,_format) \ +static ssize_t eeh_show_##_name(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct pci_dev *pdev = to_pci_dev(dev); \ + struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); \ + \ + if (!edev) \ + return 0; \ + \ + return sprintf(buf, _format "\n", edev->_memb); \ +} \ +static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); + +EEH_SHOW_ATTR(eeh_mode, mode, "0x%x"); +EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x"); +EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x"); + +void eeh_sysfs_add_device(struct pci_dev *pdev) +{ + int rc=0; + + rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); + + if (rc) + printk(KERN_WARNING "EEH: Unable to create sysfs entries\n"); +} + +void eeh_sysfs_remove_device(struct pci_dev *pdev) +{ + device_remove_file(&pdev->dev, &dev_attr_eeh_mode); + device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); + device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); +} diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 8741c854e03d..ab15b8d057ad 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -629,21 +629,43 @@ _GLOBAL(ret_from_except_lite) CURRENT_THREAD_INFO(r9, r1) ld r3,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3E + ld r10,PACACURRENT(r13) +#endif /* CONFIG_PPC_BOOK3E */ ld r4,TI_FLAGS(r9) andi. r3,r3,MSR_PR beq resume_kernel +#ifdef CONFIG_PPC_BOOK3E + lwz r3,(THREAD+THREAD_DBCR0)(r10) +#endif /* CONFIG_PPC_BOOK3E */ /* Check current_thread_info()->flags */ andi. r0,r4,_TIF_USER_WORK_MASK +#ifdef CONFIG_PPC_BOOK3E + bne 1f + /* + * Check to see if the dbcr0 register is set up to debug. + * Use the internal debug mode bit to do this. + */ + andis. r0,r3,DBCR0_IDM@h beq restore - - andi. r0,r4,_TIF_NEED_RESCHED - beq 1f + mfmsr r0 + rlwinm r0,r0,0,~MSR_DE /* Clear MSR.DE */ + mtmsr r0 + mtspr SPRN_DBCR0,r3 + li r10, -1 + mtspr SPRN_DBSR,r10 + b restore +#else + beq restore +#endif +1: andi. r0,r4,_TIF_NEED_RESCHED + beq 2f bl .restore_interrupts SCHEDULE_USER b .ret_from_except_lite -1: bl .save_nvgprs +2: bl .save_nvgprs bl .restore_interrupts addi r3,r1,STACK_FRAME_OVERHEAD bl .do_notify_resume diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 40e4a17c8ba0..4e00d223b2e3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -341,10 +341,17 @@ vsx_unavailable_pSeries_1: EXCEPTION_PROLOG_0(PACA_EXGEN) b vsx_unavailable_pSeries +facility_unavailable_trampoline: . = 0xf60 SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXGEN) - b tm_unavailable_pSeries + b facility_unavailable_pSeries + +hv_facility_unavailable_trampoline: + . = 0xf80 + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_hv #ifdef CONFIG_CBE_RAS STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error) @@ -522,8 +529,10 @@ denorm_done: KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf20) STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf40) - STD_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable) + STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0xf60) + STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable) + KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82) /* * An interrupt came in while soft-disabled. 
We set paca->irq_happened, then: @@ -793,14 +802,10 @@ system_call_relon_pSeries: STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step) . = 0x4e00 - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXGEN) - b h_data_storage_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ . = 0x4e20 - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXGEN) - b h_instr_storage_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ . = 0x4e40 SET_SCRATCH0(r13) @@ -808,9 +813,7 @@ system_call_relon_pSeries: b emulation_assist_relon_hv . = 0x4e60 - SET_SCRATCH0(r13) - EXCEPTION_PROLOG_0(PACA_EXGEN) - b hmi_exception_relon_hv + b . /* Can't happen, see v2.07 Book III-S section 6.5 */ . = 0x4e80 SET_SCRATCH0(r13) @@ -835,11 +838,17 @@ vsx_unavailable_relon_pSeries_1: EXCEPTION_PROLOG_0(PACA_EXGEN) b vsx_unavailable_relon_pSeries -tm_unavailable_relon_pSeries_1: +facility_unavailable_relon_trampoline: . = 0x4f60 SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXGEN) - b tm_unavailable_relon_pSeries + b facility_unavailable_relon_pSeries + +hv_facility_unavailable_relon_trampoline: + . = 0x4f80 + SET_SCRATCH0(r13) + EXCEPTION_PROLOG_0(PACA_EXGEN) + b facility_unavailable_relon_hv STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint) #ifdef CONFIG_PPC_DENORMALISATION @@ -1165,36 +1174,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) bl .vsx_unavailable_exception b .ret_from_except - .align 7 - .globl tm_unavailable_common -tm_unavailable_common: - EXCEPTION_PROLOG_COMMON(0xf60, PACA_EXGEN) - bl .save_nvgprs - DISABLE_INTS - addi r3,r1,STACK_FRAME_OVERHEAD - bl .tm_unavailable_exception - b .ret_from_except + STD_EXCEPTION_COMMON(0xf60, facility_unavailable, .facility_unavailable_exception) .align 7 .globl __end_handlers __end_handlers: /* Equivalents to the above handlers for relocation-on interrupt vectors */ - STD_RELON_EXCEPTION_HV_OOL(0xe00, h_data_storage) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe00) - STD_RELON_EXCEPTION_HV_OOL(0xe20, h_instr_storage) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe20) STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe40) - STD_RELON_EXCEPTION_HV_OOL(0xe60, hmi_exception) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe60) MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe80) STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor) STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable) STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) - STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, tm_unavailable) + STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) + STD_RELON_EXCEPTION_HV_OOL(0xf80, facility_unavailable) #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) /* diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index a949bdfc9623..f0b47d1a6b0e 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -176,7 +176,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) length_max = 512 ; /* 64 doublewords */ /* DAWR region can't cross 512 boundary */ if ((bp->attr.bp_addr >> 10) != - ((bp->attr.bp_addr + bp->attr.bp_len) >> 10)) + ((bp->attr.bp_addr + bp->attr.bp_len - 1) >> 10)) return -EINVAL; } if (info->len > @@ -250,6 +250,7 @@ int __kprobes hw_breakpoint_handler(struct die_args *args) * we still need to single-step the instruction, but we don't * generate an event. 
*/ + info->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ; if (!((bp->attr.bp_addr <= dar) && (dar - bp->attr.bp_addr < bp->attr.bp_len))) info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ; diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 939ea7ef0dc8..d7216c9abda1 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -85,7 +85,7 @@ int powersave_nap; /* * Register the sysctl to set/clear powersave_nap. */ -static ctl_table powersave_nap_ctl_table[]={ +static struct ctl_table powersave_nap_ctl_table[] = { { .procname = "powersave-nap", .data = &powersave_nap, @@ -95,7 +95,7 @@ static ctl_table powersave_nap_ctl_table[]={ }, {} }; -static ctl_table powersave_nap_sysctl_root[] = { +static struct ctl_table powersave_nap_sysctl_root[] = { { .procname = "kernel", .mode = 0555, diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index 50e90b7e7139..fa0b54b2a362 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -55,6 +55,7 @@ static struct iowa_bus *iowa_pci_find(unsigned long vaddr, unsigned long paddr) struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) { + unsigned hugepage_shift; struct iowa_bus *bus; int token; @@ -70,11 +71,17 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END) return NULL; - ptep = find_linux_pte(init_mm.pgd, vaddr); + ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr, + &hugepage_shift); if (ptep == NULL) paddr = 0; - else + else { + /* + * we don't have hugepages backing iomem + */ + WARN_ON(hugepage_shift); paddr = pte_pfn(*ptep) << PAGE_SHIFT; + } bus = iowa_pci_find(vaddr, paddr); if (bus == NULL) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index c0d0dbddfba1..b20ff173a671 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -36,6 +36,8 @@ #include <linux/hash.h> #include <linux/fault-inject.h> #include <linux/pci.h> +#include <linux/iommu.h> +#include <linux/sched.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/iommu.h> @@ -44,6 +46,7 @@ #include <asm/kdump.h> #include <asm/fadump.h> #include <asm/vio.h> +#include <asm/tce.h> #define DBG(...) 
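The iommu.c additions that follow expose a TCE (translation control entry) manipulation API for IOMMU-API consumers. A sketch of the expected calling sequence for mapping one page (not patch code; the ioba/tce values come from the caller):

static int example_map_one_page(struct iommu_table *tbl,
				unsigned long ioba, unsigned long tce)
{
	unsigned long entry = ioba >> IOMMU_PAGE_SHIFT;
	int ret;

	/* Validate the window and the R/W permission bits first */
	ret = iommu_tce_put_param_check(tbl, ioba, tce);
	if (ret)
		return ret;

	/* Pins the user page and programs the TCE if the slot is free */
	ret = iommu_put_tce_user_mode(tbl, entry, tce);
	if (ret)
		return ret;

	iommu_flush_tce(tbl);	/* make the update visible to hardware */
	return 0;
}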
@@ -724,6 +727,13 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name) if (tbl->it_offset == 0) clear_bit(0, tbl->it_map); +#ifdef CONFIG_IOMMU_API + if (tbl->it_group) { + iommu_group_put(tbl->it_group); + BUG_ON(tbl->it_group); + } +#endif + /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); @@ -860,3 +870,316 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size, free_pages((unsigned long)vaddr, get_order(size)); } } + +#ifdef CONFIG_IOMMU_API +/* + * SPAPR TCE API + */ +static void group_release(void *iommu_data) +{ + struct iommu_table *tbl = iommu_data; + tbl->it_group = NULL; +} + +void iommu_register_group(struct iommu_table *tbl, + int pci_domain_number, unsigned long pe_num) +{ + struct iommu_group *grp; + char *name; + + grp = iommu_group_alloc(); + if (IS_ERR(grp)) { + pr_warn("powerpc iommu api: cannot create new group, err=%ld\n", + PTR_ERR(grp)); + return; + } + tbl->it_group = grp; + iommu_group_set_iommudata(grp, tbl, group_release); + name = kasprintf(GFP_KERNEL, "domain%d-pe%lx", + pci_domain_number, pe_num); + if (!name) + return; + iommu_group_set_name(grp, name); + kfree(name); +} + +enum dma_data_direction iommu_tce_direction(unsigned long tce) +{ + if ((tce & TCE_PCI_READ) && (tce & TCE_PCI_WRITE)) + return DMA_BIDIRECTIONAL; + else if (tce & TCE_PCI_READ) + return DMA_TO_DEVICE; + else if (tce & TCE_PCI_WRITE) + return DMA_FROM_DEVICE; + else + return DMA_NONE; +} +EXPORT_SYMBOL_GPL(iommu_tce_direction); + +void iommu_flush_tce(struct iommu_table *tbl) +{ + /* Flush/invalidate TLB caches if necessary */ + if (ppc_md.tce_flush) + ppc_md.tce_flush(tbl); + + /* Make sure updates are seen by hardware */ + mb(); +} +EXPORT_SYMBOL_GPL(iommu_flush_tce); + +int iommu_tce_clear_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce_value, + unsigned long npages) +{ + /* ppc_md.tce_free() does not support any value but 0 */ + if (tce_value) + return -EINVAL; + + if (ioba & ~IOMMU_PAGE_MASK) + return -EINVAL; + + ioba >>= IOMMU_PAGE_SHIFT; + if (ioba < tbl->it_offset) + return -EINVAL; + + if ((ioba + npages) > (tbl->it_offset + tbl->it_size)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); + +int iommu_tce_put_param_check(struct iommu_table *tbl, + unsigned long ioba, unsigned long tce) +{ + if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ))) + return -EINVAL; + + if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ)) + return -EINVAL; + + if (ioba & ~IOMMU_PAGE_MASK) + return -EINVAL; + + ioba >>= IOMMU_PAGE_SHIFT; + if (ioba < tbl->it_offset) + return -EINVAL; + + if ((ioba + 1) > (tbl->it_offset + tbl->it_size)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); + +unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long oldtce; + struct iommu_pool *pool = get_pool(tbl, entry); + + spin_lock(&(pool->lock)); + + oldtce = ppc_md.tce_get(tbl, entry); + if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)) + ppc_md.tce_free(tbl, entry, 1); + else + oldtce = 0; + + spin_unlock(&(pool->lock)); + + return oldtce; +} +EXPORT_SYMBOL_GPL(iommu_clear_tce); + +int iommu_clear_tces_and_put_pages(struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + unsigned long oldtce; + struct page *page; + + for ( ; pages; --pages, ++entry) { + oldtce = iommu_clear_tce(tbl, entry); + if (!oldtce) + continue; + + page = 
pfn_to_page(oldtce >> PAGE_SHIFT); + WARN_ON(!page); + if (page) { + if (oldtce & TCE_PCI_WRITE) + SetPageDirty(page); + put_page(page); + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages); + +/* + * hwaddr is a kernel virtual address here (0xc... bazillion), + * tce_build converts it to a physical address. + */ +int iommu_tce_build(struct iommu_table *tbl, unsigned long entry, + unsigned long hwaddr, enum dma_data_direction direction) +{ + int ret = -EBUSY; + unsigned long oldtce; + struct iommu_pool *pool = get_pool(tbl, entry); + + spin_lock(&(pool->lock)); + + oldtce = ppc_md.tce_get(tbl, entry); + /* Add new entry if it is not busy */ + if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))) + ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL); + + spin_unlock(&(pool->lock)); + + /* if (unlikely(ret)) + pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n", + __func__, hwaddr, entry << IOMMU_PAGE_SHIFT, + hwaddr, ret); */ + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_tce_build); + +int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry, + unsigned long tce) +{ + int ret; + struct page *page = NULL; + unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK; + enum dma_data_direction direction = iommu_tce_direction(tce); + + ret = get_user_pages_fast(tce & PAGE_MASK, 1, + direction != DMA_TO_DEVICE, &page); + if (unlikely(ret != 1)) { + /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n", + tce, entry << IOMMU_PAGE_SHIFT, ret); */ + return -EFAULT; + } + hwaddr = (unsigned long) page_address(page) + offset; + + ret = iommu_tce_build(tbl, entry, hwaddr, direction); + if (ret) + put_page(page); + + if (ret < 0) + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n", + __func__, entry << IOMMU_PAGE_SHIFT, tce, ret); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode); + +int iommu_take_ownership(struct iommu_table *tbl) +{ + unsigned long sz = (tbl->it_size + 7) >> 3; + + if (tbl->it_offset == 0) + clear_bit(0, tbl->it_map); + + if (!bitmap_empty(tbl->it_map, tbl->it_size)) { + pr_err("iommu_tce: it_map is not empty"); + return -EBUSY; + } + + memset(tbl->it_map, 0xff, sz); + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_take_ownership); + +void iommu_release_ownership(struct iommu_table *tbl) +{ + unsigned long sz = (tbl->it_size + 7) >> 3; + + iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size); + memset(tbl->it_map, 0, sz); + + /* Restore bit#0 set by iommu_init_table() */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); +} +EXPORT_SYMBOL_GPL(iommu_release_ownership); + +static int iommu_add_device(struct device *dev) +{ + struct iommu_table *tbl; + int ret = 0; + + if (WARN_ON(dev->iommu_group)) { + pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n", + dev_name(dev), + iommu_group_id(dev->iommu_group)); + return -EBUSY; + } + + tbl = get_iommu_table_base(dev); + if (!tbl || !tbl->it_group) { + pr_debug("iommu_tce: skipping device %s with no tbl\n", + dev_name(dev)); + return 0; + } + + pr_debug("iommu_tce: adding %s to iommu group %d\n", + dev_name(dev), iommu_group_id(tbl->it_group)); + + ret = iommu_group_add_device(tbl->it_group, dev); + if (ret < 0) + pr_err("iommu_tce: %s has not been added, ret=%d\n", + dev_name(dev), ret); + + return ret; +} + +static void iommu_del_device(struct device *dev) +{ + iommu_group_remove_device(dev); +} + +static int 
iommu_bus_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + return iommu_add_device(dev); + case BUS_NOTIFY_DEL_DEVICE: + iommu_del_device(dev); + return 0; + default: + return 0; + } +} + +static struct notifier_block tce_iommu_bus_nb = { + .notifier_call = iommu_bus_notifier, +}; + +static int __init tce_iommu_init(void) +{ + struct pci_dev *pdev = NULL; + + BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE); + + for_each_pci_dev(pdev) + iommu_add_device(&pdev->dev); + + bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb); + return 0; +} + +subsys_initcall_sync(tce_iommu_init); + +#else + +void iommu_register_group(struct iommu_table *tbl, + int pci_domain_number, unsigned long pe_num) +{ +} + +#endif /* CONFIG_IOMMU_API */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ea185e0b3cae..2e51cde616d2 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -116,8 +116,6 @@ static inline notrace int decrementer_check_overflow(void) u64 now = get_tb_or_rtc(); u64 *next_tb = &__get_cpu_var(decrementers_next_tb); - if (now >= *next_tb) - set_dec(1); return now >= *next_tb; } diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 11f5b03a0b06..2156ea90eb54 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -36,12 +36,6 @@ #include <asm/sstep.h> #include <asm/uaccess.h> -#ifdef CONFIG_PPC_ADV_DEBUG_REGS -#define MSR_SINGLESTEP (MSR_DE) -#else -#define MSR_SINGLESTEP (MSR_SE) -#endif - DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); @@ -104,19 +98,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { - /* We turn off async exceptions to ensure that the single step will - * be for the instruction we have the kprobe on, if we dont its - * possible we'd get the single step reported for an exception handler - * like Decrementer or External Interrupt */ - regs->msr &= ~MSR_EE; - regs->msr |= MSR_SINGLESTEP; -#ifdef CONFIG_PPC_ADV_DEBUG_REGS - regs->msr &= ~MSR_CE; - mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); -#ifdef CONFIG_PPC_47x - isync(); -#endif -#endif + enable_single_step(regs); /* * On powerpc we should single step on the original diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 48fbc2b97e95..8213ee1eb05a 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -84,22 +84,30 @@ static ssize_t dev_nvram_read(struct file *file, char __user *buf, char *tmp = NULL; ssize_t size; - ret = -ENODEV; - if (!ppc_md.nvram_size) + if (!ppc_md.nvram_size) { + ret = -ENODEV; goto out; + } - ret = 0; size = ppc_md.nvram_size(); - if (*ppos >= size || size < 0) + if (size < 0) { + ret = size; + goto out; + } + + if (*ppos >= size) { + ret = 0; goto out; + } count = min_t(size_t, count, size - *ppos); count = min(count, PAGE_SIZE); - ret = -ENOMEM; tmp = kmalloc(count, GFP_KERNEL); - if (!tmp) + if (!tmp) { + ret = -ENOMEM; goto out; + } ret = ppc_md.nvram_read(tmp, count, ppos); if (ret <= 0) diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c new file mode 100644 index 000000000000..3f608800c06b --- /dev/null +++ b/arch/powerpc/kernel/pci-hotplug.c @@ -0,0 +1,111 @@ +/* + * Derived from "arch/powerpc/platforms/pseries/pci_dlpar.c" + * + * Copyright (C) 2003 
Linda Xie <lxie@us.ibm.com> + * Copyright (C) 2005 International Business Machines + * + * Updates, 2005, John Rose <johnrose@austin.ibm.com> + * Updates, 2005, Linas Vepstas <linas@austin.ibm.com> + * Updates, 2013, Gavin Shan <shangw@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/pci.h> +#include <linux/export.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/firmware.h> +#include <asm/eeh.h> + +/** + * __pcibios_remove_pci_devices - remove all devices under this bus + * @bus: the indicated PCI bus + * @purge_pe: destroy the PE on removal of PCI devices + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. + * By default, the corresponding PE will be destroyed during the + * normal PCI hotplug path. For PCI hotplug during EEH recovery, + * the corresponding PE won't be destroyed and deallocated. + */ +void __pcibios_remove_pci_devices(struct pci_bus *bus, int purge_pe) +{ + struct pci_dev *dev, *tmp; + struct pci_bus *child_bus; + + /* First go down child busses */ + list_for_each_entry(child_bus, &bus->children, node) + __pcibios_remove_pci_devices(child_bus, purge_pe); + + pr_debug("PCI: Removing devices on bus %04x:%02x\n", + pci_domain_nr(bus), bus->number); + list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { + pr_debug(" * Removing %s...\n", pci_name(dev)); + eeh_remove_bus_device(dev, purge_pe); + pci_stop_and_remove_bus_device(dev); + } +} + +/** + * pcibios_remove_pci_devices - remove all devices under this bus + * @bus: the indicated PCI bus + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. + */ +void pcibios_remove_pci_devices(struct pci_bus *bus) +{ + __pcibios_remove_pci_devices(bus, 1); +} +EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); +
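/*
 * Hypothetical usage sketch (illustrative, not part of this patch): a
 * hotplug or recovery path would pair the two entry points roughly like
 * this; "example_replug_bus" is an invented name:
 *
 *	static void example_replug_bus(struct pci_bus *bus)
 *	{
 *		pcibios_remove_pci_devices(bus);   // tear down devices, drop the PEs
 *		// ... reset / reconfigure the slot here ...
 *		pcibios_add_pci_devices(bus);      // rescan, adding only new devices
 *	}
 */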
+/** + * pcibios_add_pci_devices - adds new pci devices to bus + * @bus: the indicated PCI bus + * + * This routine will find and fixup new pci devices under + * the indicated bus. This routine presumes that there + * might already be some devices under this bridge, so + * it carefully tries to add only new devices. (And that + * is how this routine differs from other, similar pcibios + * routines.) + */ +void pcibios_add_pci_devices(struct pci_bus *bus) +{ + int slotno, num, mode, pass, max; + struct pci_dev *dev; + struct device_node *dn = pci_bus_to_OF_node(bus); + + eeh_add_device_tree_early(dn); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + + if (mode == PCI_PROBE_DEVTREE) { + /* use ofdt-based probe */ + of_rescan_bus(dn, bus); + } else if (mode == PCI_PROBE_NORMAL) { + /* use legacy probe */ + slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); + num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); + if (!num) + return; + pcibios_setup_bus_devices(bus); + max = bus->busn_res.start; + for (pass = 0; pass < 2; pass++) { + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + max = pci_scan_bridge(bus, dev, + max, pass); + } + } + } + pcibios_finish_adding_to_bus(bus); +} +EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 076d1242507a..c517dbe705fd 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -916,7 +916,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) flush_altivec_to_thread(src); flush_vsx_to_thread(src); flush_spe_to_thread(src); + *dst = *src; + + clear_task_ebb(dst); + return 0; } diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 8b6f7a99cce2..eb23ac92abb9 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -559,6 +559,35 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start, } #endif +static void __init early_reserve_mem_dt(void) +{ + unsigned long i, len, dt_root; + const __be32 *prop; + + dt_root = of_get_flat_dt_root(); + + prop = of_get_flat_dt_prop(dt_root, "reserved-ranges", &len); + + if (!prop) + return; + + DBG("Found new-style reserved-ranges\n"); + + /* Each reserved range is an (address,size) pair, 2 cells each, + * totalling 4 cells per range. + */
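/*
 * Worked example (hypothetical device tree fragment): with each address
 * and each size encoded as 2 cells, the property
 *
 *	reserved-ranges = <0x0 0x10000000 0x0 0x400000
 *			   0x0 0x30000000 0x0 0x100000>;
 *
 * describes two ranges, and the loop below reserves 4MB at 0x10000000
 * and 1MB at 0x30000000; of_read_number() folds each big-endian cell
 * pair into one u64.
 */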
+ for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, size; + + base = of_read_number(prop + (i * 4) + 0, 2); + size = of_read_number(prop + (i * 4) + 2, 2); + + if (size) { + DBG("reserving: %llx -> %llx\n", base, size); + memblock_reserve(base, size); + } + } +} + static void __init early_reserve_mem(void) { u64 base, size; @@ -574,12 +603,16 @@ static void __init early_reserve_mem(void) self_size = initial_boot_params->totalsize; memblock_reserve(self_base, self_size); + /* Look for the new "reserved-ranges" property in the DT */ + early_reserve_mem_dt(); + #ifdef CONFIG_BLK_DEV_INITRD - /* then reserve the initrd, if any */ - if (initrd_start && (initrd_end > initrd_start)) + /* Then reserve the initrd, if any */ + if (initrd_start && (initrd_end > initrd_start)) { memblock_reserve(_ALIGN_DOWN(__pa(initrd_start), PAGE_SIZE), _ALIGN_UP(initrd_end, PAGE_SIZE) - _ALIGN_DOWN(initrd_start, PAGE_SIZE)); + } #endif /* CONFIG_BLK_DEV_INITRD */ #ifdef CONFIG_PPC32 @@ -591,6 +624,8 @@ static void __init early_reserve_mem(void) u32 base_32, size_32; u32 *reserve_map_32 = (u32 *)reserve_map; + DBG("Found old 32-bit reserve map\n"); + while (1) { base_32 = *(reserve_map_32++); size_32 = *(reserve_map_32++); @@ -605,6 +640,9 @@ static void __init early_reserve_mem(void) return; } #endif + DBG("Processing reserve map\n"); + + /* Handle the reserve map in the fdt blob if it exists */ while (1) { base = *(reserve_map++); size = *(reserve_map++); diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 98c2fc198712..64f7bd5b1b0f 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -1449,7 +1449,9 @@ static long ppc_set_hwdebug(struct task_struct *child, */ if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE) { len = bp_info->addr2 - bp_info->addr; - } else if (bp_info->addr_mode != PPC_BREAKPOINT_MODE_EXACT) { + } else if (bp_info->addr_mode == PPC_BREAKPOINT_MODE_EXACT) + len = 1; + else { ptrace_put_breakpoints(child); return -EINVAL; } diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S index ef46ba6e094f..f366fedb0872 100644 --- a/arch/powerpc/kernel/reloc_32.S +++ b/arch/powerpc/kernel/reloc_32.S @@ -166,7 +166,7 @@ ha16: /* R_PPC_ADDR16_LO */ lo16: cmpwi r4, R_PPC_ADDR16_LO - bne nxtrela + bne unknown_type lwz r4, 0(r9) /* r_offset */ lwz r0, 8(r9) /* r_addend */ add r0, r0, r3 @@ -191,6 +191,7 @@ nxtrela: dcbst r4,r7 sync /* Ensure the data is flushed before icbi */ icbi r4,r7 +unknown_type: cmpwi r8, 0 /* relasz = 0 ? 
*/ ble done add r9, r9, r6 /* move to next entry in the .rela table */ diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 52add6f3e201..80b5ef403f68 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1172,7 +1172,7 @@ int __init early_init_dt_scan_rtas(unsigned long node, static arch_spinlock_t timebase_lock; static u64 timebase = 0; -void __cpuinit rtas_give_timebase(void) +void rtas_give_timebase(void) { unsigned long flags; @@ -1189,7 +1189,7 @@ void __cpuinit rtas_give_timebase(void) local_irq_restore(flags); } -void __cpuinit rtas_take_timebase(void) +void rtas_take_timebase(void) { while (!timebase) barrier(); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e379d3fd1694..389fb8077cc9 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -76,7 +76,7 @@ #endif int boot_cpuid = 0; -int __initdata spinning_secondaries; +int spinning_secondaries; u64 ppc64_pft_size; /* Pick defaults since we might want to patch instructions diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 201385c3a1ae..0f83122e6676 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -407,7 +407,8 @@ inline unsigned long copy_transact_fpr_from_user(struct task_struct *task, * altivec/spe instructions at some point. */ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, - int sigret, int ctx_has_vsx_region) + struct mcontext __user *tm_frame, int sigret, + int ctx_has_vsx_region) { unsigned long msr = regs->msr; @@ -475,6 +476,12 @@ static int save_user_regs(struct pt_regs *regs, struct mcontext __user *frame, if (__put_user(msr, &frame->mc_gregs[PT_MSR])) return 1; + /* We need to write 0 to the MSR top 32 bits in the tm frame so that we + * can check it on the restore to see if TM is active + */ + if (tm_frame && __put_user(0, &tm_frame->mc_gregs[PT_MSR])) + return 1; + if (sigret) { /* Set up the sigreturn trampoline: li r0,sigret; sc */ if (__put_user(0x38000000UL + sigret, &frame->tramp[0]) @@ -747,7 +754,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, struct mcontext __user *tm_sr) { long err; - unsigned long msr; + unsigned long msr, msr_hi; #ifdef CONFIG_VSX int i; #endif @@ -852,8 +859,11 @@ static long restore_tm_user_regs(struct pt_regs *regs, tm_enable(); /* This loads the checkpointed FP/VEC state, if used */ tm_recheckpoint(&current->thread, msr); - /* The task has moved into TM state S, so ensure MSR reflects this */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | MSR_TS_S; + /* Get the top half of the MSR */ + if (__get_user(msr_hi, &tm_sr->mc_gregs[PT_MSR])) + return 1; + /* Pull in MSR TM from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | ((msr_hi<<32) & MSR_TS_MASK); /* This loads the speculative FP/VEC state, if used */ if (msr & MSR_FP) { @@ -952,6 +962,7 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, { struct rt_sigframe __user *rt_sf; struct mcontext __user *frame; + struct mcontext __user *tm_frame = NULL; void __user *addr; unsigned long newsp = 0; int sigret; @@ -985,23 +996,24 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka, } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_frame = &rt_sf->uc_transact.uc_mcontext; if (MSR_TM_ACTIVE(regs->msr)) { - if (save_tm_user_regs(regs, &rt_sf->uc.uc_mcontext, - &rt_sf->uc_transact.uc_mcontext, sigret)) + if (save_tm_user_regs(regs, frame, tm_frame, sigret)) goto badframe; } else #endif - if 
(save_user_regs(regs, frame, sigret, 1)) + { + if (save_user_regs(regs, frame, tm_frame, sigret, 1)) goto badframe; + } regs->link = tramp; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (MSR_TM_ACTIVE(regs->msr)) { if (__put_user((unsigned long)&rt_sf->uc_transact, &rt_sf->uc.uc_link) - || __put_user(to_user_ptr(&rt_sf->uc_transact.uc_mcontext), - &rt_sf->uc_transact.uc_regs)) + || __put_user((unsigned long)tm_frame, &rt_sf->uc_transact.uc_regs)) goto badframe; } else @@ -1170,7 +1182,7 @@ long sys_swapcontext(struct ucontext __user *old_ctx, mctx = (struct mcontext __user *) ((unsigned long) &old_ctx->uc_mcontext & ~0xfUL); if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size) - || save_user_regs(regs, mctx, 0, ctx_has_vsx_region) + || save_user_regs(regs, mctx, NULL, 0, ctx_has_vsx_region) || put_sigset_t(&old_ctx->uc_sigmask, &current->blocked) || __put_user(to_user_ptr(mctx), &old_ctx->uc_regs)) return -EFAULT; @@ -1233,7 +1245,7 @@ long sys_rt_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, if (__get_user(msr_hi, &mcp->mc_gregs[PT_MSR])) goto bad; - if (MSR_TM_SUSPENDED(msr_hi<<32)) { + if (MSR_TM_ACTIVE(msr_hi<<32)) { /* We only recheckpoint on return if we're in a * transaction. */ @@ -1392,6 +1404,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, { struct sigcontext __user *sc; struct sigframe __user *frame; + struct mcontext __user *tm_mctx = NULL; unsigned long newsp = 0; int sigret; unsigned long tramp; @@ -1425,6 +1438,7 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM + tm_mctx = &frame->mctx_transact; if (MSR_TM_ACTIVE(regs->msr)) { if (save_tm_user_regs(regs, &frame->mctx, &frame->mctx_transact, sigret)) @@ -1432,8 +1446,10 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka, } else #endif - if (save_user_regs(regs, &frame->mctx, sigret, 1)) + { + if (save_user_regs(regs, &frame->mctx, tm_mctx, sigret, 1)) goto badframe; + } regs->link = tramp; @@ -1481,16 +1497,22 @@ badframe: long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, struct pt_regs *regs) { + struct sigframe __user *sf; struct sigcontext __user *sc; struct sigcontext sigctx; struct mcontext __user *sr; void __user *addr; sigset_t set; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + struct mcontext __user *mcp, *tm_mcp; + unsigned long msr_hi; +#endif /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; - sc = (struct sigcontext __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); + sc = &sf->sctx; addr = sc; if (copy_from_user(&sigctx, sc, sizeof(sigctx))) goto badframe; @@ -1507,11 +1529,25 @@ long sys_sigreturn(int r3, int r4, int r5, int r6, int r7, int r8, #endif set_current_blocked(&set); - sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); - addr = sr; - if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) - || restore_user_regs(regs, sr, 1)) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + mcp = (struct mcontext __user *)&sf->mctx; + tm_mcp = (struct mcontext __user *)&sf->mctx_transact; + if (__get_user(msr_hi, &tm_mcp->mc_gregs[PT_MSR])) goto badframe; + if (MSR_TM_ACTIVE(msr_hi<<32)) { + if (!cpu_has_feature(CPU_FTR_TM)) + goto badframe; + if (restore_tm_user_regs(regs, mcp, tm_mcp)) + goto badframe; + } else +#endif + { + sr = (struct mcontext __user *)from_user_ptr(sigctx.regs); + addr = sr; + if (!access_ok(VERIFY_READ, sr, sizeof(*sr)) + || restore_user_regs(regs, sr, 1)) + goto badframe; + } 
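/*
 * For reference, a sketch of the TS (transaction state) encoding that
 * the MSR_TM_ACTIVE() tests above rely on (bit positions as used by
 * this series, e.g. __MASK(33) for the suspended bit):
 *
 *	MSR_TS_S = 1UL << 33            // suspended
 *	MSR_TS_T = 1UL << 34            // transactional
 *	MSR_TM_ACTIVE(x): ((x) & MSR_TS_MASK) != 0
 *
 * msr_hi is the user-visible top word of the MSR, so msr_hi << 32 lines
 * the TS bits back up; a frame saved with msr_hi = 0x2, for instance,
 * decodes as "suspended" and takes the restore_tm_user_regs() path.
 */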
set_thread_flag(TIF_RESTOREALL); return 0; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 345947367ec0..887e99d85bc2 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -410,6 +410,10 @@ static long restore_tm_sigcontexts(struct pt_regs *regs, /* get MSR separately, transfer the LE bit if doing signal return */ err |= __get_user(msr, &sc->gp_regs[PT_MSR]); + /* pull in MSR TM from user context */ + regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + + /* pull in MSR LE from user context */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); /* The following non-GPR non-FPR non-VR state is also checkpointed: */ @@ -505,8 +509,6 @@ static long restore_tm_sigcontexts(struct pt_regs *regs, tm_enable(); /* This loads the checkpointed FP/VEC state, if used */ tm_recheckpoint(&current->thread, msr); - /* The task has moved into TM state S, so ensure MSR reflects this: */ - regs->msr = (regs->msr & ~MSR_TS_MASK) | __MASK(33); /* This loads the speculative FP/VEC state, if used */ if (msr & MSR_FP) { @@ -654,7 +656,7 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5, #ifdef CONFIG_PPC_TRANSACTIONAL_MEM if (__get_user(msr, &uc->uc_mcontext.gp_regs[PT_MSR])) goto badframe; - if (MSR_TM_SUSPENDED(msr)) { + if (MSR_TM_ACTIVE(msr)) { /* We recheckpoint on return. */ struct ucontext __user *uc_transact; if (__get_user(uc_transact, &uc->uc_link)) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index ee7ac5e6e28a..38b0ba65a735 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -480,7 +480,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) secondary_ti = current_set[cpu] = ti; } -int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) +int __cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc, c; @@ -610,7 +610,7 @@ static struct device_node *cpu_to_l2cache(int cpu) } /* Activate a secondary processor. */
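/*
 * The start_secondary() change below defers notify_cpu_starting() and
 * set_cpu_online() until after the sibling maps are updated, with an
 * smp_wmb() in between. Illustrative reader-side contract (a sketch,
 * not code from this patch):
 *
 *	if (cpu_online(cpu))                // a reader that sees the online bit ...
 *		use(cpu_sibling_mask(cpu)); // ... must also see the updated maps
 */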
-__cpuinit void start_secondary(void *unused) +void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); struct device_node *l2_cache; @@ -637,12 +637,10 @@ __cpuinit void start_secondary(void *unused) vdso_getcpu_init(); #endif - notify_cpu_starting(cpu); - set_cpu_online(cpu, true); /* Update sibling maps */ base = cpu_first_thread_sibling(cpu); for (i = 0; i < threads_per_core; i++) { - if (cpu_is_offline(base + i)) + if (cpu_is_offline(base + i) && (cpu != base + i)) continue; cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); @@ -667,6 +665,10 @@ __cpuinit void start_secondary(void *unused) } of_node_put(l2_cache); + smp_wmb(); + notify_cpu_starting(cpu); + set_cpu_online(cpu, true); + local_irq_enable(); cpu_startup_entry(CPUHP_ONLINE); diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e68a84568b8b..27a90b99ef67 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -341,7 +341,7 @@ static struct device_attribute pa6t_attrs[] = { #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ -static void __cpuinit register_cpu_online(unsigned int cpu) +static void register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); struct device *s = &c->dev; @@ -502,7 +502,7 @@ ssize_t arch_cpu_release(const char *buf, size_t count) #endif /* CONFIG_HOTPLUG_CPU */ -static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, +static int sysfs_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; @@ -522,7 +522,7 @@ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata sysfs_cpu_nb = { +static struct notifier_block sysfs_cpu_nb = { .notifier_call = sysfs_cpu_notify, }; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 5fc29ad7e26f..65ab9e909377 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -631,7 +631,6 @@ static int __init get_freq(char *name, int cells, unsigned long *val) return found; } -/* should become __cpuinit when secondary_cpu_time_init also is */ void start_cpu_decrementer(void) { #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 2da67e7a16d5..51be8fb24803 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -112,9 +112,18 @@ _GLOBAL(tm_reclaim) std r3, STACK_PARAM(0)(r1) SAVE_NVGPRS(r1) + /* We need to set up MSR for VSX register save instructions. Here we + * also clear the MSR RI since when we do the treclaim, we won't have a + * valid kernel pointer for a while. We clear RI here as it avoids + * adding another mtmsr closer to the treclaim. This makes the region + * marked as non-recoverable wider than it needs to be but it saves on + * inserting another mtmsrd later. + */ mfmsr r14 mr r15, r14 ori r15, r15, MSR_FP + li r16, MSR_RI + andc r15, r15, r16 oris r15, r15, MSR_VEC@h #ifdef CONFIG_VSX BEGIN_FTR_SECTION @@ -349,9 +358,10 @@ restore_gprs: mtcr r5 mtxer r6 - /* MSR and flags: We don't change CRs, and we don't need to alter - * MSR. + /* Clear the MSR RI since we are about to change R1. EE is already off */ + li r4, 0 + mtmsrd r4, 1 REST_4GPRS(0, r7) /* GPR0-3 */ REST_GPR(4, r7) /* GPR4-6 */ @@ -377,6 +387,10 @@ restore_gprs: GET_PACA(r13) GET_SCRATCH0(r1) + /* R1 is restored, so we are recoverable again. EE is still off */
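/*
 * Aside (illustration; semantics per the Power ISA): mtmsrd with L=1
 * updates only MSR[EE] and MSR[RI] from the source register, e.g.
 *
 *	li	r4, MSR_RI
 *	mtmsrd	r4, 1		// RI=1, EE=0, all other MSR bits kept
 *
 * which is why the "li r4, 0; mtmsrd r4, 1" above can drop RI (and keep
 * EE off) without disturbing the rest of the MSR.
 */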
+ li r4, MSR_RI + mtmsrd r4, 1 + REST_NVGPRS(r1) addi r1, r1, TM_FRAME_SIZE diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index c0e5caf8ccc7..bf33c22e38a4 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -866,6 +866,10 @@ static int emulate_string_inst(struct pt_regs *regs, u32 instword) u8 val; u32 shift = 8 * (3 - (pos & 0x3)); + /* if process is 32-bit, clear upper 32 bits of EA */ + if ((regs->msr & MSR_64BIT) == 0) + EA &= 0xFFFFFFFF; + switch ((instword & PPC_INST_STRING_MASK)) { case PPC_INST_LSWX: case PPC_INST_LSWI: @@ -1125,7 +1129,17 @@ void __kprobes program_check_exception(struct pt_regs *regs) * ESR_DST (!?) or 0. In the process of chasing this with the * hardware people - not sure if it can happen on any illegal * instruction or only on FP instructions, whether there is a - * pattern to occurrences etc. -dgibson 31/Mar/2003 */ + * pattern to occurrences etc. -dgibson 31/Mar/2003 + */ + + /* + * If we support a HW FPU, we need to ensure the FP state + * is flushed into the thread_struct before attempting + * emulation + */ +#ifdef CONFIG_PPC_FPU + flush_fp_to_thread(current); +#endif switch (do_mathemu(regs)) { case 0: emulate_single_step(regs); @@ -1282,25 +1296,50 @@ void vsx_unavailable_exception(struct pt_regs *regs) die("Unrecoverable VSX Unavailable Exception", regs, SIGABRT); } -void tm_unavailable_exception(struct pt_regs *regs) +void facility_unavailable_exception(struct pt_regs *regs) { + static char *facility_strings[] = { + "FPU", + "VMX/VSX", + "DSCR", + "PMU SPRs", + "BHRB", + "TM", + "AT", + "EBB", + "TAR", + }; + char *facility, *prefix; + u64 value; + + if (regs->trap == 0xf60) { + value = mfspr(SPRN_FSCR); + prefix = ""; + } else { + value = mfspr(SPRN_HFSCR); + prefix = "Hypervisor "; + } + + value = value >> 56; + + /* We restore the interrupt state now */ if (!arch_irq_disabled_regs(regs)) local_irq_enable(); - /* Currently we never expect a TMU exception. Catch - * this and kill the process! 
- */ - printk(KERN_EMERG "Unexpected TM unavailable exception at %lx " - "(msr %lx)\n", - regs->nip, regs->msr); + if (value < ARRAY_SIZE(facility_strings)) + facility = facility_strings[value]; + else + facility = "unknown"; + + pr_err("%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n", + prefix, facility, regs->nip, regs->msr); if (user_mode(regs)) { _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); return; } - die("Unexpected TM unavailable exception", regs, SIGABRT); + die("Unexpected facility unavailable exception", regs, SIGABRT); } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM @@ -1396,8 +1435,7 @@ void performance_monitor_exception(struct pt_regs *regs) void SoftwareEmulation(struct pt_regs *regs) { extern int do_mathemu(struct pt_regs *); - extern int Soft_emulate_8xx(struct pt_regs *); -#if defined(CONFIG_MATH_EMULATION) || defined(CONFIG_8XX_MINIMAL_FPEMU) +#if defined(CONFIG_MATH_EMULATION) int errcode; #endif @@ -1430,23 +1468,6 @@ void SoftwareEmulation(struct pt_regs *regs) _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); return; } - -#elif defined(CONFIG_8XX_MINIMAL_FPEMU) - errcode = Soft_emulate_8xx(regs); - if (errcode >= 0) - PPC_WARN_EMULATED(8xx, regs); - - switch (errcode) { - case 0: - emulate_single_step(regs); - return; - case 1: - _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); - return; - case -EFAULT: - _exception(SIGSEGV, regs, SEGV_MAPERR, regs->nip); - return; - } #else _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); #endif @@ -1796,8 +1817,6 @@ struct ppc_emulated ppc_emulated = { WARN_EMULATED_SETUP(unaligned), #ifdef CONFIG_MATH_EMULATION WARN_EMULATED_SETUP(math), -#elif defined(CONFIG_8XX_MINIMAL_FPEMU) - WARN_EMULATED_SETUP(8xx), #endif #ifdef CONFIG_VSX WARN_EMULATED_SETUP(vsx), diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 9d3fdcd66290..a15837519dca 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -50,7 +50,7 @@ void __init udbg_early_init(void) udbg_init_debug_beat(); #elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE) udbg_init_pas_realmode(); -#elif defined(CONFIG_BOOTX_TEXT) +#elif defined(CONFIG_PPC_EARLY_DEBUG_BOOTX) udbg_init_btext(); #elif defined(CONFIG_PPC_EARLY_DEBUG_44x) /* PPC44x debug */ diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index d4f463ac65b1..1d9c92621b36 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -711,7 +711,7 @@ static void __init vdso_setup_syscall_map(void) } #ifdef CONFIG_PPC64 -int __cpuinit vdso_getcpu_init(void) +int vdso_getcpu_init(void) { unsigned long cpu, node, val;
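For reference, the interruption cause value that facility_unavailable_exception() above decodes lives in the top byte of the FSCR/HFSCR; a standalone sketch of the same decode (hypothetical helper name, table as in facility_strings[], so e.g. 7 maps to "EBB"):

	static const char *example_fscr_facility(u64 fscr)
	{
		static const char * const names[] = {
			"FPU", "VMX/VSX", "DSCR", "PMU SPRs",
			"BHRB", "TM", "AT", "EBB", "TAR",
		};
		u64 ic = fscr >> 56;	/* IC field: top byte of (H)FSCR */

		return ic < ARRAY_SIZE(names) ? names[ic] : "unknown";
	}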