diff options
Diffstat (limited to 'arch')
270 files changed, 9713 insertions, 2588 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index ad89a033f17f..87b63fde06d7 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -165,12 +165,9 @@ config TRACE_IRQFLAGS_SUPPORT bool default y -config RWSEM_GENERIC_SPINLOCK - bool - default y - config RWSEM_XCHGADD_ALGORITHM bool + default y config ARCH_HAS_ILOG2_U32 bool @@ -1089,11 +1086,6 @@ source "arch/arm/firmware/Kconfig" source arch/arm/mm/Kconfig -config ARM_NR_BANKS - int - default 16 if ARCH_EP93XX - default 8 - config IWMMXT bool "Enable iWMMXt support" depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4 || CPU_PJ4B @@ -1214,19 +1206,6 @@ config ARM_ERRATA_742231 register of the Cortex-A9 which reduces the linefill issuing capabilities of the processor. -config PL310_ERRATA_588369 - bool "PL310 errata: Clean & Invalidate maintenance operations do not invalidate clean lines" - depends on CACHE_L2X0 - help - The PL310 L2 cache controller implements three types of Clean & - Invalidate maintenance operations: by Physical Address - (offset 0x7F0), by Index/Way (0x7F8) and by Way (0x7FC). - They are architecturally defined to behave as the execution of a - clean operation followed immediately by an invalidate operation, - both performing to the same memory location. This functionality - is not correctly implemented in PL310 as clean lines are not - invalidated as a result of these operations. - config ARM_ERRATA_643719 bool "ARM errata: LoUIS bit field in CLIDR register is incorrect" depends on CPU_V7 && SMP @@ -1249,17 +1228,6 @@ config ARM_ERRATA_720789 tables. The workaround changes the TLB flushing routines to invalidate entries regardless of the ASID. -config PL310_ERRATA_727915 - bool "PL310 errata: Background Clean & Invalidate by Way operation can cause data corruption" - depends on CACHE_L2X0 - help - PL310 implements the Clean & Invalidate by Way L2 cache maintenance - operation (offset 0x7FC). This operation runs in background so that - PL310 can handle normal accesses while it is in progress. Under very - rare circumstances, due to this erratum, write data can be lost when - PL310 treats a cacheable write transaction during a Clean & - Invalidate by Way operation. - config ARM_ERRATA_743622 bool "ARM errata: Faulty hazard checking in the Store Buffer may lead to data corruption" depends on CPU_V7 @@ -1285,21 +1253,6 @@ config ARM_ERRATA_751472 operation is received by a CPU before the ICIALLUIS has completed, potentially leading to corrupted entries in the cache or TLB. -config PL310_ERRATA_753970 - bool "PL310 errata: cache sync operation may be faulty" - depends on CACHE_PL310 - help - This option enables the workaround for the 753970 PL310 (r3p0) erratum. - - Under some condition the effect of cache sync operation on - the store buffer still remains when the operation completes. - This means that the store buffer is always asked to drain and - this prevents it from merging any further writes. The workaround - is to replace the normal offset of cache sync operation (0x730) - by another offset targeting an unmapped PL310 register 0x740. - This has the same effect as the cache sync operation: store buffer - drain and waiting for all buffers empty. - config ARM_ERRATA_754322 bool "ARM errata: possible faulty MMU translations following an ASID switch" depends on CPU_V7 @@ -1348,18 +1301,6 @@ config ARM_ERRATA_764369 relevant cache maintenance functions and sets a specific bit in the diagnostic control register of the SCU. -config PL310_ERRATA_769419 - bool "PL310 errata: no automatic Store Buffer drain" - depends on CACHE_L2X0 - help - On revisions of the PL310 prior to r3p2, the Store Buffer does - not automatically drain. This can cause normal, non-cacheable - writes to be retained when the memory system is idle, leading - to suboptimal I/O performance for drivers using coherent DMA. - This option adds a write barrier to the cpu_idle loop so that, - on systems with an outer cache, the store buffer is drained - explicitly. - config ARM_ERRATA_775420 bool "ARM errata: A data cache maintenance operation which aborts, might lead to deadlock" depends on CPU_V7 @@ -2279,6 +2220,11 @@ config ARCH_SUSPEND_POSSIBLE config ARM_CPU_SUSPEND def_bool PM_SLEEP +config ARCH_HIBERNATION_POSSIBLE + bool + depends on MMU + default y if ARCH_SUSPEND_POSSIBLE + endmenu source "net/Kconfig" diff --git a/arch/arm/boot/compressed/atags_to_fdt.c b/arch/arm/boot/compressed/atags_to_fdt.c index d1153c8a765a..9448aa0c6686 100644 --- a/arch/arm/boot/compressed/atags_to_fdt.c +++ b/arch/arm/boot/compressed/atags_to_fdt.c @@ -7,6 +7,8 @@ #define do_extend_cmdline 0 #endif +#define NR_BANKS 16 + static int node_offset(void *fdt, const char *node_path) { int offset = fdt_path_offset(fdt, node_path); diff --git a/arch/arm/boot/dts/bcm21664.dtsi b/arch/arm/boot/dts/bcm21664.dtsi index 08a44d41b672..8b366822bb43 100644 --- a/arch/arm/boot/dts/bcm21664.dtsi +++ b/arch/arm/boot/dts/bcm21664.dtsi @@ -14,6 +14,8 @@ #include <dt-bindings/interrupt-controller/arm-gic.h> #include <dt-bindings/interrupt-controller/irq.h> +#include "dt-bindings/clock/bcm21664.h" + #include "skeleton.dtsi" / { @@ -43,7 +45,7 @@ compatible = "brcm,bcm21664-dw-apb-uart", "snps,dw-apb-uart"; status = "disabled"; reg = <0x3e000000 0x118>; - clocks = <&uartb_clk>; + clocks = <&slave_ccu BCM21664_SLAVE_CCU_UARTB>; interrupts = <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; reg-shift = <2>; reg-io-width = <4>; @@ -53,7 +55,7 @@ compatible = "brcm,bcm21664-dw-apb-uart", "snps,dw-apb-uart"; status = "disabled"; reg = <0x3e001000 0x118>; - clocks = <&uartb2_clk>; + clocks = <&slave_ccu BCM21664_SLAVE_CCU_UARTB2>; interrupts = <GIC_SPI 66 IRQ_TYPE_LEVEL_HIGH>; reg-shift = <2>; reg-io-width = <4>; @@ -63,7 +65,7 @@ compatible = "brcm,bcm21664-dw-apb-uart", "snps,dw-apb-uart"; status = "disabled"; reg = <0x3e002000 0x118>; - clocks = <&uartb3_clk>; + clocks = <&slave_ccu BCM21664_SLAVE_CCU_UARTB3>; interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>; reg-shift = <2>; reg-io-width = <4>; @@ -85,7 +87,7 @@ compatible = "brcm,kona-timer"; reg = <0x35006000 0x1c>; interrupts = <GIC_SPI 7 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&hub_timer_clk>; + clocks = <&aon_ccu BCM21664_AON_CCU_HUB_TIMER>; }; gpio: gpio@35003000 { @@ -106,7 +108,7 @@ compatible = "brcm,kona-sdhci"; reg = <0x3f180000 0x801c>; interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&sdio1_clk>; + clocks = <&master_ccu BCM21664_MASTER_CCU_SDIO1>; status = "disabled"; }; @@ -114,7 +116,7 @@ compatible = "brcm,kona-sdhci"; reg = <0x3f190000 0x801c>; interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&sdio2_clk>; + clocks = <&master_ccu BCM21664_MASTER_CCU_SDIO2>; status = "disabled"; }; @@ -122,7 +124,7 @@ compatible = "brcm,kona-sdhci"; reg = <0x3f1a0000 0x801c>; interrupts = <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&sdio3_clk>; + clocks = <&master_ccu BCM21664_MASTER_CCU_SDIO3>; status = "disabled"; }; @@ -130,7 +132,7 @@ compatible = "brcm,kona-sdhci"; reg = <0x3f1b0000 0x801c>; interrupts = <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&sdio4_clk>; + clocks = <&master_ccu BCM21664_MASTER_CCU_SDIO4>; status = "disabled"; }; @@ -140,7 +142,7 @@ interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_HIGH>; #address-cells = <1>; #size-cells = <0>; - clocks = <&bsc1_clk>; + clocks = <&slave_ccu BCM21664_SLAVE_CCU_BSC1>; status = "disabled"; }; @@ -150,7 +152,7 @@ interrupts = <GIC_SPI 102 IRQ_TYPE_LEVEL_HIGH>; #address-cells = <1>; #size-cells = <0>; - clocks = <&bsc2_clk>; + clocks = <&slave_ccu BCM21664_SLAVE_CCU_BSC2>; status = "disabled"; }; @@ -160,7 +162,7 @@ interrupts = <GIC_SPI 169 IRQ_TYPE_LEVEL_HIGH>; #address-cells = <1>; #size-cells = <0>; - clocks = <&bsc3_clk>; + clocks = <&slave_ccu BCM21664_SLAVE_CCU_BSC3>; status = "disabled"; }; @@ -170,105 +172,149 @@ interrupts = <GIC_SPI 170 IRQ_TYPE_LEVEL_HIGH>; #address-cells = <1>; #size-cells = <0>; - clocks = <&bsc4_clk>; + clocks = <&slave_ccu BCM21664_SLAVE_CCU_BSC4>; status = "disabled"; }; clocks { - bsc1_clk: bsc1 { - compatible = "fixed-clock"; - clock-frequency = <13000000>; - #clock-cells = <0>; - }; + #address-cells = <1>; + #size-cells = <1>; + ranges; - bsc2_clk: bsc2 { - compatible = "fixed-clock"; - clock-frequency = <13000000>; + /* + * Fixed clocks are defined before CCUs whose + * clocks may depend on them. + */ + + ref_32k_clk: ref_32k { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <32768>; }; - bsc3_clk: bsc3 { - compatible = "fixed-clock"; - clock-frequency = <13000000>; + bbl_32k_clk: bbl_32k { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <32768>; }; - bsc4_clk: bsc4 { + ref_13m_clk: ref_13m { + #clock-cells = <0>; compatible = "fixed-clock"; clock-frequency = <13000000>; - #clock-cells = <0>; }; - pmu_bsc_clk: pmu_bsc { + var_13m_clk: var_13m { + #clock-cells = <0>; compatible = "fixed-clock"; clock-frequency = <13000000>; - #clock-cells = <0>; }; - hub_timer_clk: hub_timer { - compatible = "fixed-clock"; - clock-frequency = <32768>; + dft_19_5m_clk: dft_19_5m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <19500000>; }; - pwm_clk: pwm { + ref_crystal_clk: ref_crystal { + #clock-cells = <0>; compatible = "fixed-clock"; clock-frequency = <26000000>; - #clock-cells = <0>; }; - sdio1_clk: sdio1 { - compatible = "fixed-clock"; - clock-frequency = <48000000>; + ref_52m_clk: ref_52m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <52000000>; }; - sdio2_clk: sdio2 { - compatible = "fixed-clock"; - clock-frequency = <48000000>; + var_52m_clk: var_52m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <52000000>; }; - sdio3_clk: sdio3 { - compatible = "fixed-clock"; - clock-frequency = <48000000>; + usb_otg_ahb_clk: usb_otg_ahb { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <52000000>; }; - sdio4_clk: sdio4 { - compatible = "fixed-clock"; - clock-frequency = <48000000>; + ref_96m_clk: ref_96m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <96000000>; }; - tmon_1m_clk: tmon_1m { - compatible = "fixed-clock"; - clock-frequency = <1000000>; + var_96m_clk: var_96m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <96000000>; }; - uartb_clk: uartb { - compatible = "fixed-clock"; - clock-frequency = <13000000>; + ref_104m_clk: ref_104m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <104000000>; }; - uartb2_clk: uartb2 { - compatible = "fixed-clock"; - clock-frequency = <13000000>; + var_104m_clk: var_104m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <104000000>; }; - uartb3_clk: uartb3 { - compatible = "fixed-clock"; - clock-frequency = <13000000>; + ref_156m_clk: ref_156m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <156000000>; }; - usb_otg_ahb_clk: usb_otg_ahb { - compatible = "fixed-clock"; - clock-frequency = <52000000>; + var_156m_clk: var_156m { #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <156000000>; + }; + + root_ccu: root_ccu { + compatible = BCM21664_DT_ROOT_CCU_COMPAT; + reg = <0x35001000 0x0f00>; + #clock-cells = <1>; + clock-output-names = "frac_1m"; + }; + + aon_ccu: aon_ccu { + compatible = BCM21664_DT_AON_CCU_COMPAT; + reg = <0x35002000 0x0f00>; + #clock-cells = <1>; + clock-output-names = "hub_timer"; + }; + + master_ccu: master_ccu { + compatible = BCM21664_DT_MASTER_CCU_COMPAT; + reg = <0x3f001000 0x0f00>; + #clock-cells = <1>; + clock-output-names = "sdio1", + "sdio2", + "sdio3", + "sdio4", + "sdio1_sleep", + "sdio2_sleep", + "sdio3_sleep", + "sdio4_sleep"; + }; + + slave_ccu: slave_ccu { + compatible = BCM21664_DT_SLAVE_CCU_COMPAT; + reg = <0x3e011000 0x0f00>; + #clock-cells = <1>; + clock-output-names = "uartb", + "uartb2", + "uartb3", + "bsc1", + "bsc2", + "bsc3", + "bsc4"; }; }; diff --git a/arch/arm/boot/dts/marco.dtsi b/arch/arm/boot/dts/marco.dtsi index 0c9647d28765..fb354225740a 100644 --- a/arch/arm/boot/dts/marco.dtsi +++ b/arch/arm/boot/dts/marco.dtsi @@ -36,7 +36,7 @@ ranges = <0x40000000 0x40000000 0xa0000000>; l2-cache-controller@c0030000 { - compatible = "sirf,marco-pl310-cache", "arm,pl310-cache"; + compatible = "arm,pl310-cache"; reg = <0xc0030000 0x1000>; interrupts = <0 59 0>; arm,tag-latency = <1 1 1>; diff --git a/arch/arm/boot/dts/prima2.dtsi b/arch/arm/boot/dts/prima2.dtsi index 3df7ba860282..963b7e54ab15 100644 --- a/arch/arm/boot/dts/prima2.dtsi +++ b/arch/arm/boot/dts/prima2.dtsi @@ -48,7 +48,7 @@ ranges = <0x40000000 0x40000000 0x80000000>; l2-cache-controller@80040000 { - compatible = "arm,pl310-cache", "sirf,prima2-pl310-cache"; + compatible = "arm,pl310-cache"; reg = <0x80040000 0x1000>; interrupts = <59>; arm,tag-latency = <1 1 1>; diff --git a/arch/arm/boot/dts/sun4i-a10.dtsi b/arch/arm/boot/dts/sun4i-a10.dtsi index c7b794ed337d..d96e179490ce 100644 --- a/arch/arm/boot/dts/sun4i-a10.dtsi +++ b/arch/arm/boot/dts/sun4i-a10.dtsi @@ -713,7 +713,7 @@ }; i2c0: i2c@01c2ac00 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun4i-a10-i2c"; reg = <0x01c2ac00 0x400>; interrupts = <7>; clocks = <&apb1_gates 0>; @@ -724,7 +724,7 @@ }; i2c1: i2c@01c2b000 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun4i-a10-i2c"; reg = <0x01c2b000 0x400>; interrupts = <8>; clocks = <&apb1_gates 1>; @@ -735,7 +735,7 @@ }; i2c2: i2c@01c2b400 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun4i-a10-i2c"; reg = <0x01c2b400 0x400>; interrupts = <9>; clocks = <&apb1_gates 2>; diff --git a/arch/arm/boot/dts/sun5i-a10s.dtsi b/arch/arm/boot/dts/sun5i-a10s.dtsi index aa1dd59dc34a..b64f705d9008 100644 --- a/arch/arm/boot/dts/sun5i-a10s.dtsi +++ b/arch/arm/boot/dts/sun5i-a10s.dtsi @@ -560,7 +560,7 @@ i2c0: i2c@01c2ac00 { #address-cells = <1>; #size-cells = <0>; - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun5i-a10s-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2ac00 0x400>; interrupts = <7>; clocks = <&apb1_gates 0>; @@ -571,7 +571,7 @@ i2c1: i2c@01c2b000 { #address-cells = <1>; #size-cells = <0>; - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun5i-a10s-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2b000 0x400>; interrupts = <8>; clocks = <&apb1_gates 1>; @@ -582,7 +582,7 @@ i2c2: i2c@01c2b400 { #address-cells = <1>; #size-cells = <0>; - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun5i-a10s-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2b400 0x400>; interrupts = <9>; clocks = <&apb1_gates 2>; diff --git a/arch/arm/boot/dts/sun5i-a13.dtsi b/arch/arm/boot/dts/sun5i-a13.dtsi index c9fdb7b3ecd9..3b2a94c40f6e 100644 --- a/arch/arm/boot/dts/sun5i-a13.dtsi +++ b/arch/arm/boot/dts/sun5i-a13.dtsi @@ -486,7 +486,7 @@ }; i2c0: i2c@01c2ac00 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun5i-a13-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2ac00 0x400>; interrupts = <7>; clocks = <&apb1_gates 0>; @@ -497,7 +497,7 @@ }; i2c1: i2c@01c2b000 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun5i-a13-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2b000 0x400>; interrupts = <8>; clocks = <&apb1_gates 1>; @@ -508,7 +508,7 @@ }; i2c2: i2c@01c2b400 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun5i-a13-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2b400 0x400>; interrupts = <9>; clocks = <&apb1_gates 2>; diff --git a/arch/arm/boot/dts/sun7i-a20.dtsi b/arch/arm/boot/dts/sun7i-a20.dtsi index 385933bac114..01e94664232a 100644 --- a/arch/arm/boot/dts/sun7i-a20.dtsi +++ b/arch/arm/boot/dts/sun7i-a20.dtsi @@ -863,7 +863,7 @@ }; i2c0: i2c@01c2ac00 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun7i-a20-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2ac00 0x400>; interrupts = <0 7 4>; clocks = <&apb1_gates 0>; @@ -874,7 +874,7 @@ }; i2c1: i2c@01c2b000 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun7i-a20-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2b000 0x400>; interrupts = <0 8 4>; clocks = <&apb1_gates 1>; @@ -885,7 +885,7 @@ }; i2c2: i2c@01c2b400 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun7i-a20-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2b400 0x400>; interrupts = <0 9 4>; clocks = <&apb1_gates 2>; @@ -896,7 +896,7 @@ }; i2c3: i2c@01c2b800 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun7i-a20-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2b800 0x400>; interrupts = <0 88 4>; clocks = <&apb1_gates 3>; @@ -907,7 +907,7 @@ }; i2c4: i2c@01c2c000 { - compatible = "allwinner,sun4i-i2c"; + compatible = "allwinner,sun7i-a20-i2c", "allwinner,sun4i-a10-i2c"; reg = <0x01c2c000 0x400>; interrupts = <0 89 4>; clocks = <&apb1_gates 15>; diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c index 86fd60fefbc9..f91136ab447e 100644 --- a/arch/arm/common/mcpm_entry.c +++ b/arch/arm/common/mcpm_entry.c @@ -106,14 +106,14 @@ void mcpm_cpu_power_down(void) BUG(); } -int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster) +int mcpm_wait_for_cpu_powerdown(unsigned int cpu, unsigned int cluster) { int ret; - if (WARN_ON_ONCE(!platform_ops || !platform_ops->power_down_finish)) + if (WARN_ON_ONCE(!platform_ops || !platform_ops->wait_for_powerdown)) return -EUNATCH; - ret = platform_ops->power_down_finish(cpu, cluster); + ret = platform_ops->wait_for_powerdown(cpu, cluster); if (ret) pr_warn("%s: cpu %u, cluster %u failed to power down (%d)\n", __func__, cpu, cluster, ret); diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c index 177251a4dd9a..92e54d7c6f46 100644 --- a/arch/arm/common/mcpm_platsmp.c +++ b/arch/arm/common/mcpm_platsmp.c @@ -62,7 +62,7 @@ static int mcpm_cpu_kill(unsigned int cpu) cpu_to_pcpu(cpu, &pcpu, &pcluster); - return !mcpm_cpu_power_down_finish(pcpu, pcluster); + return !mcpm_wait_for_cpu_powerdown(pcpu, pcluster); } static int mcpm_cpu_disable(unsigned int cpu) diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index 23e728ecf8ab..f5a357601983 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -21,6 +21,7 @@ generic-y += parport.h generic-y += poll.h generic-y += preempt.h generic-y += resource.h +generic-y += rwsem.h generic-y += sections.h generic-y += segment.h generic-y += sembuf.h diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index b974184f9941..57f0584e8d97 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -312,7 +312,7 @@ * you cannot return to the original mode. */ .macro safe_svcmode_maskall reg:req -#if __LINUX_ARM_ARCH__ >= 6 +#if __LINUX_ARM_ARCH__ >= 6 && !defined(CONFIG_CPU_V7M) mrs \reg , cpsr eor \reg, \reg, #HYP_MODE tst \reg, #MODE_MASK diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 8b8b61685a34..fd43f7f55b70 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -212,7 +212,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, static inline void __flush_icache_all(void) { __flush_icache_preferred(); - dsb(); + dsb(ishst); } /* @@ -487,4 +487,6 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); +void flush_uprobe_xol_access(struct page *page, unsigned long uaddr, + void *kaddr, unsigned long len); #endif diff --git a/arch/arm/include/asm/cp15.h b/arch/arm/include/asm/cp15.h index 6493802f880a..c3f11524f10c 100644 --- a/arch/arm/include/asm/cp15.h +++ b/arch/arm/include/asm/cp15.h @@ -42,24 +42,23 @@ #ifndef __ASSEMBLY__ #if __LINUX_ARM_ARCH__ >= 4 -#define vectors_high() (cr_alignment & CR_V) +#define vectors_high() (get_cr() & CR_V) #else #define vectors_high() (0) #endif #ifdef CONFIG_CPU_CP15 -extern unsigned long cr_no_alignment; /* defined in entry-armv.S */ extern unsigned long cr_alignment; /* defined in entry-armv.S */ -static inline unsigned int get_cr(void) +static inline unsigned long get_cr(void) { - unsigned int val; + unsigned long val; asm("mrc p15, 0, %0, c1, c0, 0 @ get CR" : "=r" (val) : : "cc"); return val; } -static inline void set_cr(unsigned int val) +static inline void set_cr(unsigned long val) { asm volatile("mcr p15, 0, %0, c1, c0, 0 @ set CR" : : "r" (val) : "cc"); @@ -80,10 +79,6 @@ static inline void set_auxcr(unsigned int val) isb(); } -#ifndef CONFIG_SMP -extern void adjust_cr(unsigned long mask, unsigned long set); -#endif - #define CPACC_FULL(n) (3 << (n * 2)) #define CPACC_SVC(n) (1 << (n * 2)) #define CPACC_DISABLE(n) (0 << (n * 2)) @@ -106,13 +101,17 @@ static inline void set_copro_access(unsigned int val) #else /* ifdef CONFIG_CPU_CP15 */ /* - * cr_alignment and cr_no_alignment are tightly coupled to cp15 (at least in the - * minds of the developers). Yielding 0 for machines without a cp15 (and making - * it read-only) is fine for most cases and saves quite some #ifdeffery. + * cr_alignment is tightly coupled to cp15 (at least in the minds of the + * developers). Yielding 0 for machines without a cp15 (and making it + * read-only) is fine for most cases and saves quite some #ifdeffery. */ -#define cr_no_alignment UL(0) #define cr_alignment UL(0) +static inline unsigned long get_cr(void) +{ + return 0; +} + #endif /* ifdef CONFIG_CPU_CP15 / else */ #endif /* ifndef __ASSEMBLY__ */ diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h index 4764344367d4..8c2b7321a478 100644 --- a/arch/arm/include/asm/cputype.h +++ b/arch/arm/include/asm/cputype.h @@ -72,6 +72,7 @@ #define ARM_CPU_PART_CORTEX_A15 0xC0F0 #define ARM_CPU_PART_CORTEX_A7 0xC070 #define ARM_CPU_PART_CORTEX_A12 0xC0D0 +#define ARM_CPU_PART_CORTEX_A17 0xC0E0 #define ARM_CPU_XSCALE_ARCH_MASK 0xe000 #define ARM_CPU_XSCALE_ARCH_V1 0x2000 diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h index e701a4d9aa59..c45b61a4b4a5 100644 --- a/arch/arm/include/asm/dma-mapping.h +++ b/arch/arm/include/asm/dma-mapping.h @@ -58,21 +58,37 @@ static inline int dma_set_mask(struct device *dev, u64 mask) #ifndef __arch_pfn_to_dma static inline dma_addr_t pfn_to_dma(struct device *dev, unsigned long pfn) { + if (dev) + pfn -= dev->dma_pfn_offset; return (dma_addr_t)__pfn_to_bus(pfn); } static inline unsigned long dma_to_pfn(struct device *dev, dma_addr_t addr) { - return __bus_to_pfn(addr); + unsigned long pfn = __bus_to_pfn(addr); + + if (dev) + pfn += dev->dma_pfn_offset; + + return pfn; } static inline void *dma_to_virt(struct device *dev, dma_addr_t addr) { + if (dev) { + unsigned long pfn = dma_to_pfn(dev, addr); + + return phys_to_virt(__pfn_to_phys(pfn)); + } + return (void *)__bus_to_virt((unsigned long)addr); } static inline dma_addr_t virt_to_dma(struct device *dev, void *addr) { + if (dev) + return pfn_to_dma(dev, virt_to_pfn(addr)); + return (dma_addr_t)__virt_to_bus((unsigned long)(addr)); } @@ -105,6 +121,13 @@ static inline unsigned long dma_max_pfn(struct device *dev) } #define dma_max_pfn(dev) dma_max_pfn(dev) +static inline int set_arch_dma_coherent_ops(struct device *dev) +{ + set_dma_ops(dev, &arm_coherent_dma_ops); + return 0; +} +#define set_arch_dma_coherent_ops(dev) set_arch_dma_coherent_ops(dev) + static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) { unsigned int offset = paddr & ~PAGE_MASK; diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h index bbae919bceb4..74124b0d0d79 100644 --- a/arch/arm/include/asm/fixmap.h +++ b/arch/arm/include/asm/fixmap.h @@ -1,24 +1,11 @@ #ifndef _ASM_FIXMAP_H #define _ASM_FIXMAP_H -/* - * Nothing too fancy for now. - * - * On ARM we already have well known fixed virtual addresses imposed by - * the architecture such as the vector page which is located at 0xffff0000, - * therefore a second level page table is already allocated covering - * 0xfff00000 upwards. - * - * The cache flushing code in proc-xscale.S uses the virtual area between - * 0xfffe0000 and 0xfffeffff. - */ - -#define FIXADDR_START 0xfff00000UL -#define FIXADDR_TOP 0xfffe0000UL +#define FIXADDR_START 0xffc00000UL +#define FIXADDR_TOP 0xffe00000UL #define FIXADDR_SIZE (FIXADDR_TOP - FIXADDR_START) -#define FIX_KMAP_BEGIN 0 -#define FIX_KMAP_END (FIXADDR_SIZE >> PAGE_SHIFT) +#define FIX_KMAP_NR_PTES (FIXADDR_SIZE >> PAGE_SHIFT) #define __fix_to_virt(x) (FIXADDR_START + ((x) << PAGE_SHIFT)) #define __virt_to_fix(x) (((x) - FIXADDR_START) >> PAGE_SHIFT) @@ -27,7 +14,7 @@ extern void __this_fixmap_does_not_exist(void); static inline unsigned long fix_to_virt(const unsigned int idx) { - if (idx >= FIX_KMAP_END) + if (idx >= FIX_KMAP_NR_PTES) __this_fixmap_does_not_exist(); return __fix_to_virt(idx); } diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index f89515adac60..eb577f4f5f70 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -52,15 +52,7 @@ extern inline void *return_address(unsigned int level) #endif -#define HAVE_ARCH_CALLER_ADDR - -#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -#define CALLER_ADDR1 ((unsigned long)return_address(1)) -#define CALLER_ADDR2 ((unsigned long)return_address(2)) -#define CALLER_ADDR3 ((unsigned long)return_address(3)) -#define CALLER_ADDR4 ((unsigned long)return_address(4)) -#define CALLER_ADDR5 ((unsigned long)return_address(5)) -#define CALLER_ADDR6 ((unsigned long)return_address(6)) +#define ftrace_return_addr(n) return_address(n) #endif /* ifndef __ASSEMBLY__ */ diff --git a/arch/arm/include/asm/glue-df.h b/arch/arm/include/asm/glue-df.h index 6b70f1b46a6e..04e18b656659 100644 --- a/arch/arm/include/asm/glue-df.h +++ b/arch/arm/include/asm/glue-df.h @@ -31,14 +31,6 @@ #undef CPU_DABORT_HANDLER #undef MULTI_DABORT -#if defined(CONFIG_CPU_ARM710) -# ifdef CPU_DABORT_HANDLER -# define MULTI_DABORT 1 -# else -# define CPU_DABORT_HANDLER cpu_arm7_data_abort -# endif -#endif - #ifdef CONFIG_CPU_ABRT_EV4 # ifdef CPU_DABORT_HANDLER # define MULTI_DABORT 1 diff --git a/arch/arm/include/asm/hardware/cache-l2x0.h b/arch/arm/include/asm/hardware/cache-l2x0.h index 6795ff743b3d..3a5ec1c25659 100644 --- a/arch/arm/include/asm/hardware/cache-l2x0.h +++ b/arch/arm/include/asm/hardware/cache-l2x0.h @@ -26,8 +26,8 @@ #define L2X0_CACHE_TYPE 0x004 #define L2X0_CTRL 0x100 #define L2X0_AUX_CTRL 0x104 -#define L2X0_TAG_LATENCY_CTRL 0x108 -#define L2X0_DATA_LATENCY_CTRL 0x10C +#define L310_TAG_LATENCY_CTRL 0x108 +#define L310_DATA_LATENCY_CTRL 0x10C #define L2X0_EVENT_CNT_CTRL 0x200 #define L2X0_EVENT_CNT1_CFG 0x204 #define L2X0_EVENT_CNT0_CFG 0x208 @@ -54,53 +54,93 @@ #define L2X0_LOCKDOWN_WAY_D_BASE 0x900 #define L2X0_LOCKDOWN_WAY_I_BASE 0x904 #define L2X0_LOCKDOWN_STRIDE 0x08 -#define L2X0_ADDR_FILTER_START 0xC00 -#define L2X0_ADDR_FILTER_END 0xC04 +#define L310_ADDR_FILTER_START 0xC00 +#define L310_ADDR_FILTER_END 0xC04 #define L2X0_TEST_OPERATION 0xF00 #define L2X0_LINE_DATA 0xF10 #define L2X0_LINE_TAG 0xF30 #define L2X0_DEBUG_CTRL 0xF40 -#define L2X0_PREFETCH_CTRL 0xF60 -#define L2X0_POWER_CTRL 0xF80 -#define L2X0_DYNAMIC_CLK_GATING_EN (1 << 1) -#define L2X0_STNDBY_MODE_EN (1 << 0) +#define L310_PREFETCH_CTRL 0xF60 +#define L310_POWER_CTRL 0xF80 +#define L310_DYNAMIC_CLK_GATING_EN (1 << 1) +#define L310_STNDBY_MODE_EN (1 << 0) /* Registers shifts and masks */ #define L2X0_CACHE_ID_PART_MASK (0xf << 6) #define L2X0_CACHE_ID_PART_L210 (1 << 6) +#define L2X0_CACHE_ID_PART_L220 (2 << 6) #define L2X0_CACHE_ID_PART_L310 (3 << 6) #define L2X0_CACHE_ID_RTL_MASK 0x3f -#define L2X0_CACHE_ID_RTL_R0P0 0x0 -#define L2X0_CACHE_ID_RTL_R1P0 0x2 -#define L2X0_CACHE_ID_RTL_R2P0 0x4 -#define L2X0_CACHE_ID_RTL_R3P0 0x5 -#define L2X0_CACHE_ID_RTL_R3P1 0x6 -#define L2X0_CACHE_ID_RTL_R3P2 0x8 +#define L210_CACHE_ID_RTL_R0P2_02 0x00 +#define L210_CACHE_ID_RTL_R0P1 0x01 +#define L210_CACHE_ID_RTL_R0P2_01 0x02 +#define L210_CACHE_ID_RTL_R0P3 0x03 +#define L210_CACHE_ID_RTL_R0P4 0x0b +#define L210_CACHE_ID_RTL_R0P5 0x0f +#define L220_CACHE_ID_RTL_R1P7_01REL0 0x06 +#define L310_CACHE_ID_RTL_R0P0 0x00 +#define L310_CACHE_ID_RTL_R1P0 0x02 +#define L310_CACHE_ID_RTL_R2P0 0x04 +#define L310_CACHE_ID_RTL_R3P0 0x05 +#define L310_CACHE_ID_RTL_R3P1 0x06 +#define L310_CACHE_ID_RTL_R3P1_50REL0 0x07 +#define L310_CACHE_ID_RTL_R3P2 0x08 +#define L310_CACHE_ID_RTL_R3P3 0x09 -#define L2X0_AUX_CTRL_MASK 0xc0000fff +/* L2C auxiliary control register - bits common to L2C-210/220/310 */ +#define L2C_AUX_CTRL_WAY_SIZE_SHIFT 17 +#define L2C_AUX_CTRL_WAY_SIZE_MASK (7 << 17) +#define L2C_AUX_CTRL_WAY_SIZE(n) ((n) << 17) +#define L2C_AUX_CTRL_EVTMON_ENABLE BIT(20) +#define L2C_AUX_CTRL_PARITY_ENABLE BIT(21) +#define L2C_AUX_CTRL_SHARED_OVERRIDE BIT(22) +/* L2C-210/220 common bits */ #define L2X0_AUX_CTRL_DATA_RD_LATENCY_SHIFT 0 -#define L2X0_AUX_CTRL_DATA_RD_LATENCY_MASK 0x7 +#define L2X0_AUX_CTRL_DATA_RD_LATENCY_MASK (7 << 0) #define L2X0_AUX_CTRL_DATA_WR_LATENCY_SHIFT 3 -#define L2X0_AUX_CTRL_DATA_WR_LATENCY_MASK (0x7 << 3) +#define L2X0_AUX_CTRL_DATA_WR_LATENCY_MASK (7 << 3) #define L2X0_AUX_CTRL_TAG_LATENCY_SHIFT 6 -#define L2X0_AUX_CTRL_TAG_LATENCY_MASK (0x7 << 6) +#define L2X0_AUX_CTRL_TAG_LATENCY_MASK (7 << 6) #define L2X0_AUX_CTRL_DIRTY_LATENCY_SHIFT 9 -#define L2X0_AUX_CTRL_DIRTY_LATENCY_MASK (0x7 << 9) -#define L2X0_AUX_CTRL_ASSOCIATIVITY_SHIFT 16 -#define L2X0_AUX_CTRL_WAY_SIZE_SHIFT 17 -#define L2X0_AUX_CTRL_WAY_SIZE_MASK (0x7 << 17) -#define L2X0_AUX_CTRL_SHARE_OVERRIDE_SHIFT 22 -#define L2X0_AUX_CTRL_NS_LOCKDOWN_SHIFT 26 -#define L2X0_AUX_CTRL_NS_INT_CTRL_SHIFT 27 -#define L2X0_AUX_CTRL_DATA_PREFETCH_SHIFT 28 -#define L2X0_AUX_CTRL_INSTR_PREFETCH_SHIFT 29 -#define L2X0_AUX_CTRL_EARLY_BRESP_SHIFT 30 +#define L2X0_AUX_CTRL_DIRTY_LATENCY_MASK (7 << 9) +#define L2X0_AUX_CTRL_ASSOC_SHIFT 13 +#define L2X0_AUX_CTRL_ASSOC_MASK (15 << 13) +/* L2C-210 specific bits */ +#define L210_AUX_CTRL_WRAP_DISABLE BIT(12) +#define L210_AUX_CTRL_WA_OVERRIDE BIT(23) +#define L210_AUX_CTRL_EXCLUSIVE_ABORT BIT(24) +/* L2C-220 specific bits */ +#define L220_AUX_CTRL_EXCLUSIVE_CACHE BIT(12) +#define L220_AUX_CTRL_FWA_SHIFT 23 +#define L220_AUX_CTRL_FWA_MASK (3 << 23) +#define L220_AUX_CTRL_NS_LOCKDOWN BIT(26) +#define L220_AUX_CTRL_NS_INT_CTRL BIT(27) +/* L2C-310 specific bits */ +#define L310_AUX_CTRL_FULL_LINE_ZERO BIT(0) /* R2P0+ */ +#define L310_AUX_CTRL_HIGHPRIO_SO_DEV BIT(10) /* R2P0+ */ +#define L310_AUX_CTRL_STORE_LIMITATION BIT(11) /* R2P0+ */ +#define L310_AUX_CTRL_EXCLUSIVE_CACHE BIT(12) +#define L310_AUX_CTRL_ASSOCIATIVITY_16 BIT(16) +#define L310_AUX_CTRL_CACHE_REPLACE_RR BIT(25) /* R2P0+ */ +#define L310_AUX_CTRL_NS_LOCKDOWN BIT(26) +#define L310_AUX_CTRL_NS_INT_CTRL BIT(27) +#define L310_AUX_CTRL_DATA_PREFETCH BIT(28) +#define L310_AUX_CTRL_INSTR_PREFETCH BIT(29) +#define L310_AUX_CTRL_EARLY_BRESP BIT(30) /* R2P0+ */ -#define L2X0_LATENCY_CTRL_SETUP_SHIFT 0 -#define L2X0_LATENCY_CTRL_RD_SHIFT 4 -#define L2X0_LATENCY_CTRL_WR_SHIFT 8 +#define L310_LATENCY_CTRL_SETUP(n) ((n) << 0) +#define L310_LATENCY_CTRL_RD(n) ((n) << 4) +#define L310_LATENCY_CTRL_WR(n) ((n) << 8) -#define L2X0_ADDR_FILTER_EN 1 +#define L310_ADDR_FILTER_EN 1 + +#define L310_PREFETCH_CTRL_OFFSET_MASK 0x1f +#define L310_PREFETCH_CTRL_DBL_LINEFILL_INCR BIT(23) +#define L310_PREFETCH_CTRL_PREFETCH_DROP BIT(24) +#define L310_PREFETCH_CTRL_DBL_LINEFILL_WRAP BIT(27) +#define L310_PREFETCH_CTRL_DATA_PREFETCH BIT(28) +#define L310_PREFETCH_CTRL_INSTR_PREFETCH BIT(29) +#define L310_PREFETCH_CTRL_DBL_LINEFILL BIT(30) #define L2X0_CTRL_EN 1 diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 91b99abe7a95..535579511ed0 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -18,6 +18,7 @@ } while (0) extern pte_t *pkmap_page_table; +extern pte_t *fixmap_page_table; extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 8aa4cca74501..3d23418cbddd 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -179,6 +179,12 @@ static inline void __iomem *__typesafe_io(unsigned long addr) /* PCI fixed i/o mapping */ #define PCI_IO_VIRT_BASE 0xfee00000 +#if defined(CONFIG_PCI) +void pci_ioremap_set_mem_type(int mem_type); +#else +static inline void pci_ioremap_set_mem_type(int mem_type) {} +#endif + extern int pci_ioremap_io(unsigned int offset, phys_addr_t phys_addr); /* diff --git a/arch/arm/include/asm/mach/arch.h b/arch/arm/include/asm/mach/arch.h index 17a3fa2979e8..060a75e99263 100644 --- a/arch/arm/include/asm/mach/arch.h +++ b/arch/arm/include/asm/mach/arch.h @@ -14,7 +14,6 @@ #include <linux/reboot.h> struct tag; -struct meminfo; struct pt_regs; struct smp_operations; #ifdef CONFIG_SMP @@ -45,10 +44,12 @@ struct machine_desc { unsigned char reserve_lp1 :1; /* never has lp1 */ unsigned char reserve_lp2 :1; /* never has lp2 */ enum reboot_mode reboot_mode; /* default restart mode */ + unsigned l2c_aux_val; /* L2 cache aux value */ + unsigned l2c_aux_mask; /* L2 cache aux mask */ + void (*l2c_write_sec)(unsigned long, unsigned); struct smp_operations *smp; /* SMP operations */ bool (*smp_init)(void); - void (*fixup)(struct tag *, char **, - struct meminfo *); + void (*fixup)(struct tag *, char **); void (*init_meminfo)(void); void (*reserve)(void);/* reserve mem blocks */ void (*map_io)(void);/* IO mapping function */ diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h index a5ff410dcdb6..d9702eb0b02b 100644 --- a/arch/arm/include/asm/mcpm.h +++ b/arch/arm/include/asm/mcpm.h @@ -98,14 +98,14 @@ int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster); * previously in which case the caller should take appropriate action. * * On success, the CPU is not guaranteed to be truly halted until - * mcpm_cpu_power_down_finish() subsequently returns non-zero for the + * mcpm_wait_for_cpu_powerdown() subsequently returns non-zero for the * specified cpu. Until then, other CPUs should make sure they do not * trash memory the target CPU might be executing/accessing. */ void mcpm_cpu_power_down(void); /** - * mcpm_cpu_power_down_finish - wait for a specified CPU to halt, and + * mcpm_wait_for_cpu_powerdown - wait for a specified CPU to halt, and * make sure it is powered off * * @cpu: CPU number within given cluster @@ -127,7 +127,7 @@ void mcpm_cpu_power_down(void); * - zero if the CPU is in a safely parked state * - nonzero otherwise (e.g., timeout) */ -int mcpm_cpu_power_down_finish(unsigned int cpu, unsigned int cluster); +int mcpm_wait_for_cpu_powerdown(unsigned int cpu, unsigned int cluster); /** * mcpm_cpu_suspend - bring the calling CPU in a suspended state @@ -171,7 +171,7 @@ int mcpm_cpu_powered_up(void); struct mcpm_platform_ops { int (*power_up)(unsigned int cpu, unsigned int cluster); void (*power_down)(void); - int (*power_down_finish)(unsigned int cpu, unsigned int cluster); + int (*wait_for_powerdown)(unsigned int cpu, unsigned int cluster); void (*suspend)(u64); void (*powered_up)(void); }; diff --git a/arch/arm/include/asm/memblock.h b/arch/arm/include/asm/memblock.h index c2f5102ae659..bf47a6c110a2 100644 --- a/arch/arm/include/asm/memblock.h +++ b/arch/arm/include/asm/memblock.h @@ -1,10 +1,9 @@ #ifndef _ASM_ARM_MEMBLOCK_H #define _ASM_ARM_MEMBLOCK_H -struct meminfo; struct machine_desc; -void arm_memblock_init(struct meminfo *, const struct machine_desc *); +void arm_memblock_init(const struct machine_desc *); phys_addr_t arm_memblock_steal(phys_addr_t size, phys_addr_t align); #endif diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 02fa2558f662..2b751464d6ff 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -83,8 +83,6 @@ */ #define IOREMAP_MAX_ORDER 24 -#define CONSISTENT_END (0xffe00000UL) - #else /* CONFIG_MMU */ /* diff --git a/arch/arm/include/asm/outercache.h b/arch/arm/include/asm/outercache.h index f94784f0e3a6..891a56b35bcf 100644 --- a/arch/arm/include/asm/outercache.h +++ b/arch/arm/include/asm/outercache.h @@ -28,53 +28,84 @@ struct outer_cache_fns { void (*clean_range)(unsigned long, unsigned long); void (*flush_range)(unsigned long, unsigned long); void (*flush_all)(void); - void (*inv_all)(void); void (*disable)(void); #ifdef CONFIG_OUTER_CACHE_SYNC void (*sync)(void); #endif - void (*set_debug)(unsigned long); void (*resume)(void); + + /* This is an ARM L2C thing */ + void (*write_sec)(unsigned long, unsigned); }; extern struct outer_cache_fns outer_cache; #ifdef CONFIG_OUTER_CACHE - +/** + * outer_inv_range - invalidate range of outer cache lines + * @start: starting physical address, inclusive + * @end: end physical address, exclusive + */ static inline void outer_inv_range(phys_addr_t start, phys_addr_t end) { if (outer_cache.inv_range) outer_cache.inv_range(start, end); } + +/** + * outer_clean_range - clean dirty outer cache lines + * @start: starting physical address, inclusive + * @end: end physical address, exclusive + */ static inline void outer_clean_range(phys_addr_t start, phys_addr_t end) { if (outer_cache.clean_range) outer_cache.clean_range(start, end); } + +/** + * outer_flush_range - clean and invalidate outer cache lines + * @start: starting physical address, inclusive + * @end: end physical address, exclusive + */ static inline void outer_flush_range(phys_addr_t start, phys_addr_t end) { if (outer_cache.flush_range) outer_cache.flush_range(start, end); } +/** + * outer_flush_all - clean and invalidate all cache lines in the outer cache + * + * Note: depending on implementation, this may not be atomic - it must + * only be called with interrupts disabled and no other active outer + * cache masters. + * + * It is intended that this function is only used by implementations + * needing to override the outer_cache.disable() method due to security. + * (Some implementations perform this as a clean followed by an invalidate.) + */ static inline void outer_flush_all(void) { if (outer_cache.flush_all) outer_cache.flush_all(); } -static inline void outer_inv_all(void) -{ - if (outer_cache.inv_all) - outer_cache.inv_all(); -} - -static inline void outer_disable(void) -{ - if (outer_cache.disable) - outer_cache.disable(); -} +/** + * outer_disable - clean, invalidate and disable the outer cache + * + * Disable the outer cache, ensuring that any data contained in the outer + * cache is pushed out to lower levels of system memory. The note and + * conditions above concerning outer_flush_all() applies here. + */ +extern void outer_disable(void); +/** + * outer_resume - restore the cache configuration and re-enable outer cache + * + * Restore any configuration that the cache had when previously enabled, + * and re-enable the outer cache. + */ static inline void outer_resume(void) { if (outer_cache.resume) @@ -90,13 +121,18 @@ static inline void outer_clean_range(phys_addr_t start, phys_addr_t end) static inline void outer_flush_range(phys_addr_t start, phys_addr_t end) { } static inline void outer_flush_all(void) { } -static inline void outer_inv_all(void) { } static inline void outer_disable(void) { } static inline void outer_resume(void) { } #endif #ifdef CONFIG_OUTER_CACHE_SYNC +/** + * outer_sync - perform a sync point for outer cache + * + * Ensure that all outer cache operations are complete and any store + * buffers are drained. + */ static inline void outer_sync(void) { if (outer_cache.sync) diff --git a/arch/arm/include/asm/setup.h b/arch/arm/include/asm/setup.h index 8d6a089dfb76..e0adb9f1bf94 100644 --- a/arch/arm/include/asm/setup.h +++ b/arch/arm/include/asm/setup.h @@ -21,34 +21,6 @@ #define __tagtable(tag, fn) \ static const struct tagtable __tagtable_##fn __tag = { tag, fn } -/* - * Memory map description - */ -#define NR_BANKS CONFIG_ARM_NR_BANKS - -struct membank { - phys_addr_t start; - phys_addr_t size; - unsigned int highmem; -}; - -struct meminfo { - int nr_banks; - struct membank bank[NR_BANKS]; -}; - -extern struct meminfo meminfo; - -#define for_each_bank(iter,mi) \ - for (iter = 0; iter < (mi)->nr_banks; iter++) - -#define bank_pfn_start(bank) __phys_to_pfn((bank)->start) -#define bank_pfn_end(bank) __phys_to_pfn((bank)->start + (bank)->size) -#define bank_pfn_size(bank) ((bank)->size >> PAGE_SHIFT) -#define bank_phys_start(bank) (bank)->start -#define bank_phys_end(bank) ((bank)->start + (bank)->size) -#define bank_phys_size(bank) (bank)->size - extern int arm_add_memory(u64 start, u64 size); extern void early_print(const char *str, ...); extern void dump_machine_table(void); diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 040619c32d68..38ddd9f83d0e 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_ARTHUR) += arthur.o obj-$(CONFIG_ISA_DMA) += dma-isa.o obj-$(CONFIG_PCI) += bios32.o isa.o obj-$(CONFIG_ARM_CPU_SUSPEND) += sleep.o suspend.o +obj-$(CONFIG_HIBERNATION) += hibernate.o obj-$(CONFIG_SMP) += smp.o ifdef CONFIG_MMU obj-$(CONFIG_SMP) += smp_tlb.o diff --git a/arch/arm/kernel/atags_parse.c b/arch/arm/kernel/atags_parse.c index 8c14de8180c0..7807ef58a2ab 100644 --- a/arch/arm/kernel/atags_parse.c +++ b/arch/arm/kernel/atags_parse.c @@ -22,6 +22,7 @@ #include <linux/fs.h> #include <linux/root_dev.h> #include <linux/screen_info.h> +#include <linux/memblock.h> #include <asm/setup.h> #include <asm/system_info.h> @@ -222,10 +223,10 @@ setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr) } if (mdesc->fixup) - mdesc->fixup(tags, &from, &meminfo); + mdesc->fixup(tags, &from); if (tags->hdr.tag == ATAG_CORE) { - if (meminfo.nr_banks != 0) + if (memblock_phys_mem_size()) squash_mem_tags(tags); save_atags(tags); parse_tags(tags); diff --git a/arch/arm/kernel/devtree.c b/arch/arm/kernel/devtree.c index ea9ce92a4b52..e94a157ddff1 100644 --- a/arch/arm/kernel/devtree.c +++ b/arch/arm/kernel/devtree.c @@ -27,10 +27,6 @@ #include <asm/mach/arch.h> #include <asm/mach-types.h> -void __init early_init_dt_add_memory_arch(u64 base, u64 size) -{ - arm_add_memory(base, size); -} #ifdef CONFIG_SMP extern struct of_cpu_method __cpu_method_of_table[]; diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 1879e8dd2acc..52a949a8077d 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -344,7 +344,7 @@ ENDPROC(__pabt_svc) @ @ Enable the alignment trap while in kernel mode @ - alignment_trap r0 + alignment_trap r0, .LCcralign @ @ Clear FP to mark the first stack frame @@ -413,6 +413,11 @@ __und_usr: @ adr r9, BSYM(ret_from_exception) + @ IRQs must be enabled before attempting to read the instruction from + @ user space since that could cause a page/translation fault if the + @ page table was modified by another CPU. + enable_irq + tst r3, #PSR_T_BIT @ Thumb mode? bne __und_usr_thumb sub r4, r2, #4 @ ARM instr at LR - 4 @@ -484,7 +489,8 @@ ENDPROC(__und_usr) */ .pushsection .fixup, "ax" .align 2 -4: mov pc, r9 +4: str r4, [sp, #S_PC] @ retry current instruction + mov pc, r9 .popsection .pushsection __ex_table,"a" .long 1b, 4b @@ -517,7 +523,7 @@ ENDPROC(__und_usr) * r9 = normal "successful" return address * r10 = this threads thread_info structure * lr = unrecognised instruction return address - * IRQs disabled, FIQs enabled. + * IRQs enabled, FIQs enabled. */ @ @ Fall-through from Thumb-2 __und_usr @@ -624,7 +630,6 @@ call_fpe: #endif do_fpe: - enable_irq ldr r4, .LCfp add r10, r10, #TI_FPSTATE @ r10 = workspace ldr pc, [r4] @ Call FP module USR entry point @@ -652,8 +657,7 @@ __und_usr_fault_32: b 1f __und_usr_fault_16: mov r1, #2 -1: enable_irq - mov r0, sp +1: mov r0, sp adr lr, BSYM(ret_from_exception) b __und_fault ENDPROC(__und_usr_fault_32) @@ -1143,11 +1147,8 @@ __vectors_start: .data .globl cr_alignment - .globl cr_no_alignment cr_alignment: .space 4 -cr_no_alignment: - .space 4 #ifdef CONFIG_MULTI_IRQ_HANDLER .globl handle_arch_irq diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index a2dcafdf1bc8..7139d4a7dea7 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -365,13 +365,7 @@ ENTRY(vector_swi) str r0, [sp, #S_OLD_R0] @ Save OLD_R0 #endif zero_fp - -#ifdef CONFIG_ALIGNMENT_TRAP - ldr ip, __cr_alignment - ldr ip, [ip] - mcr p15, 0, ip, c1, c0 @ update control register -#endif - + alignment_trap ip, __cr_alignment enable_irq ct_user_exit get_thread_info tsk diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index efb208de75ec..5d702f8900b1 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -37,9 +37,9 @@ #endif .endm - .macro alignment_trap, rtemp + .macro alignment_trap, rtemp, label #ifdef CONFIG_ALIGNMENT_TRAP - ldr \rtemp, .LCcralign + ldr \rtemp, \label ldr \rtemp, [\rtemp] mcr p15, 0, \rtemp, c1, c0 #endif diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index c108ddcb9ba4..af9a8a927a4e 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -14,6 +14,7 @@ #include <linux/ftrace.h> #include <linux/uaccess.h> +#include <linux/module.h> #include <asm/cacheflush.h> #include <asm/opcodes.h> @@ -63,6 +64,18 @@ static unsigned long adjust_address(struct dyn_ftrace *rec, unsigned long addr) } #endif +int ftrace_arch_code_modify_prepare(void) +{ + set_all_modules_text_rw(); + return 0; +} + +int ftrace_arch_code_modify_post_process(void) +{ + set_all_modules_text_ro(); + return 0; +} + static unsigned long ftrace_call_replace(unsigned long pc, unsigned long addr) { return arm_gen_branch_link(pc, addr); diff --git a/arch/arm/kernel/head-common.S b/arch/arm/kernel/head-common.S index c96ecacb2021..572a38335c96 100644 --- a/arch/arm/kernel/head-common.S +++ b/arch/arm/kernel/head-common.S @@ -99,8 +99,7 @@ __mmap_switched: str r1, [r5] @ Save machine type str r2, [r6] @ Save atags pointer cmp r7, #0 - bicne r4, r0, #CR_A @ Clear 'A' bit - stmneia r7, {r0, r4} @ Save control register values + strne r0, [r7] @ Save control register values b start_kernel ENDPROC(__mmap_switched) diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 591d6e4a6492..2c35f0ff2fdc 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -475,7 +475,7 @@ ENDPROC(__turn_mmu_on) #ifdef CONFIG_SMP_ON_UP - __INIT + __HEAD __fixup_smp: and r3, r9, #0x000f0000 @ architecture version teq r3, #0x000f0000 @ CPU ID supported? diff --git a/arch/arm/kernel/hibernate.c b/arch/arm/kernel/hibernate.c new file mode 100644 index 000000000000..bb8b79648643 --- /dev/null +++ b/arch/arm/kernel/hibernate.c @@ -0,0 +1,107 @@ +/* + * Hibernation support specific for ARM + * + * Derived from work on ARM hibernation support by: + * + * Ubuntu project, hibernation support for mach-dove + * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu) + * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.) + * https://lkml.org/lkml/2010/6/18/4 + * https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html + * https://patchwork.kernel.org/patch/96442/ + * + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * License terms: GNU General Public License (GPL) version 2 + */ + +#include <linux/mm.h> +#include <linux/suspend.h> +#include <asm/system_misc.h> +#include <asm/idmap.h> +#include <asm/suspend.h> +#include <asm/memory.h> + +extern const void __nosave_begin, __nosave_end; + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = virt_to_pfn(&__nosave_begin); + unsigned long nosave_end_pfn = virt_to_pfn(&__nosave_end - 1); + + return (pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn); +} + +void notrace save_processor_state(void) +{ + WARN_ON(num_online_cpus() != 1); + local_fiq_disable(); +} + +void notrace restore_processor_state(void) +{ + local_fiq_enable(); +} + +/* + * Snapshot kernel memory and reset the system. + * + * swsusp_save() is executed in the suspend finisher so that the CPU + * context pointer and memory are part of the saved image, which is + * required by the resume kernel image to restart execution from + * swsusp_arch_suspend(). + * + * soft_restart is not technically needed, but is used to get success + * returned from cpu_suspend. + * + * When soft reboot completes, the hibernation snapshot is written out. + */ +static int notrace arch_save_image(unsigned long unused) +{ + int ret; + + ret = swsusp_save(); + if (ret == 0) + soft_restart(virt_to_phys(cpu_resume)); + return ret; +} + +/* + * Save the current CPU state before suspend / poweroff. + */ +int notrace swsusp_arch_suspend(void) +{ + return cpu_suspend(0, arch_save_image); +} + +/* + * Restore page contents for physical pages that were in use during loading + * hibernation image. Switch to idmap_pgd so the physical page tables + * are overwritten with the same contents. + */ +static void notrace arch_restore_image(void *unused) +{ + struct pbe *pbe; + + cpu_switch_mm(idmap_pgd, &init_mm); + for (pbe = restore_pblist; pbe; pbe = pbe->next) + copy_page(pbe->orig_address, pbe->address); + + soft_restart(virt_to_phys(cpu_resume)); +} + +static u64 resume_stack[PAGE_SIZE/2/sizeof(u64)] __nosavedata; + +/* + * Resume from the hibernation image. + * Due to the kernel heap / data restore, stack contents change underneath + * and that would make function calls impossible; switch to a temporary + * stack within the nosave region to avoid that problem. + */ +int swsusp_arch_resume(void) +{ + extern void call_with_stack(void (*fn)(void *), void *arg, void *sp); + call_with_stack(arch_restore_image, 0, + resume_stack + ARRAY_SIZE(resume_stack)); + return 0; +} diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index 9723d17b8f38..2c4257604513 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -37,6 +37,7 @@ #include <linux/proc_fs.h> #include <linux/export.h> +#include <asm/hardware/cache-l2x0.h> #include <asm/exception.h> #include <asm/mach/arch.h> #include <asm/mach/irq.h> @@ -115,10 +116,21 @@ EXPORT_SYMBOL_GPL(set_irq_flags); void __init init_IRQ(void) { + int ret; + if (IS_ENABLED(CONFIG_OF) && !machine_desc->init_irq) irqchip_init(); else machine_desc->init_irq(); + + if (IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_CACHE_L2X0) && + (machine_desc->l2c_aux_mask || machine_desc->l2c_aux_val)) { + outer_cache.write_sec = machine_desc->l2c_write_sec; + ret = l2x0_of_init(machine_desc->l2c_aux_val, + machine_desc->l2c_aux_mask); + if (ret) + pr_err("L2C: failed to init: %d\n", ret); + } } #ifdef CONFIG_MULTI_IRQ_HANDLER diff --git a/arch/arm/kernel/isa.c b/arch/arm/kernel/isa.c index 346485910732..9d1cf7156895 100644 --- a/arch/arm/kernel/isa.c +++ b/arch/arm/kernel/isa.c @@ -20,7 +20,7 @@ static unsigned int isa_membase, isa_portbase, isa_portshift; -static ctl_table ctl_isa_vars[4] = { +static struct ctl_table ctl_isa_vars[4] = { { .procname = "membase", .data = &isa_membase, @@ -44,7 +44,7 @@ static ctl_table ctl_isa_vars[4] = { static struct ctl_table_header *isa_sysctl_header; -static ctl_table ctl_isa[2] = { +static struct ctl_table ctl_isa[2] = { { .procname = "isa", .mode = 0555, @@ -52,7 +52,7 @@ static ctl_table ctl_isa[2] = { }, {} }; -static ctl_table ctl_bus[2] = { +static struct ctl_table ctl_bus[2] = { { .procname = "bus", .mode = 0555, diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S index 2452dd1bef53..a5599cfc43cb 100644 --- a/arch/arm/kernel/iwmmxt.S +++ b/arch/arm/kernel/iwmmxt.S @@ -18,6 +18,7 @@ #include <asm/ptrace.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> +#include <asm/assembler.h> #if defined(CONFIG_CPU_PJ4) || defined(CONFIG_CPU_PJ4B) #define PJ4(code...) code @@ -65,17 +66,18 @@ * r9 = ret_from_exception * lr = undefined instr exit * - * called from prefetch exception handler with interrupts disabled + * called from prefetch exception handler with interrupts enabled */ ENTRY(iwmmxt_task_enable) + inc_preempt_count r10, r3 XSC(mrc p15, 0, r2, c15, c1, 0) PJ4(mrc p15, 0, r2, c1, c0, 2) @ CP0 and CP1 accessible? XSC(tst r2, #0x3) PJ4(tst r2, #0xf) - movne pc, lr @ if so no business here + bne 4f @ if so no business here @ enable access to CP0 and CP1 XSC(orr r2, r2, #0x3) XSC(mcr p15, 0, r2, c15, c1, 0) @@ -136,7 +138,7 @@ concan_dump: wstrd wR15, [r1, #MMX_WR15] 2: teq r0, #0 @ anything to load? - moveq pc, lr + beq 3f concan_load: @@ -169,8 +171,14 @@ concan_load: @ clear CUP/MUP (only if r1 != 0) teq r1, #0 mov r2, #0 - moveq pc, lr + beq 3f tmcr wCon, r2 + +3: +#ifdef CONFIG_PREEMPT_COUNT + get_thread_info r10 +#endif +4: dec_preempt_count r10, r3 mov pc, lr /* diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c index 51798d7854ac..a71ae1523620 100644 --- a/arch/arm/kernel/perf_event_cpu.c +++ b/arch/arm/kernel/perf_event_cpu.c @@ -221,6 +221,7 @@ static struct notifier_block cpu_pmu_hotplug_notifier = { * PMU platform driver and devicetree bindings. */ static struct of_device_id cpu_pmu_of_device_ids[] = { + {.compatible = "arm,cortex-a17-pmu", .data = armv7_a17_pmu_init}, {.compatible = "arm,cortex-a15-pmu", .data = armv7_a15_pmu_init}, {.compatible = "arm,cortex-a12-pmu", .data = armv7_a12_pmu_init}, {.compatible = "arm,cortex-a9-pmu", .data = armv7_a9_pmu_init}, diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c index f4ef3981ed02..2037f7205987 100644 --- a/arch/arm/kernel/perf_event_v7.c +++ b/arch/arm/kernel/perf_event_v7.c @@ -1599,6 +1599,13 @@ static int armv7_a12_pmu_init(struct arm_pmu *cpu_pmu) return 0; } +static int armv7_a17_pmu_init(struct arm_pmu *cpu_pmu) +{ + armv7_a12_pmu_init(cpu_pmu); + cpu_pmu->name = "ARMv7 Cortex-A17"; + return 0; +} + /* * Krait Performance Monitor Region Event Selection Register (PMRESRn) * @@ -2021,6 +2028,11 @@ static inline int armv7_a12_pmu_init(struct arm_pmu *cpu_pmu) return -ENODEV; } +static inline int armv7_a17_pmu_init(struct arm_pmu *cpu_pmu) +{ + return -ENODEV; +} + static inline int krait_pmu_init(struct arm_pmu *cpu_pmu) { return -ENODEV; diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 50e198c1e9c8..8a16ee5d8a95 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -72,6 +72,7 @@ static int __init fpe_setup(char *line) __setup("fpe=", fpe_setup); #endif +extern void init_default_cache_policy(unsigned long); extern void paging_init(const struct machine_desc *desc); extern void early_paging_init(const struct machine_desc *, struct proc_info_list *); @@ -590,7 +591,7 @@ static void __init setup_processor(void) pr_info("CPU: %s [%08x] revision %d (ARMv%s), cr=%08lx\n", cpu_name, read_cpuid_id(), read_cpuid_id() & 15, - proc_arch[cpu_architecture()], cr_alignment); + proc_arch[cpu_architecture()], get_cr()); snprintf(init_utsname()->machine, __NEW_UTS_LEN + 1, "%s%c", list->arch_name, ENDIANNESS); @@ -603,7 +604,9 @@ static void __init setup_processor(void) #ifndef CONFIG_ARM_THUMB elf_hwcap &= ~(HWCAP_THUMB | HWCAP_IDIVT); #endif - +#ifdef CONFIG_MMU + init_default_cache_policy(list->__cpu_mm_mmu_flags); +#endif erratum_a15_798181_init(); feat_v6_fixup(); @@ -628,15 +631,8 @@ void __init dump_machine_table(void) int __init arm_add_memory(u64 start, u64 size) { - struct membank *bank = &meminfo.bank[meminfo.nr_banks]; u64 aligned_start; - if (meminfo.nr_banks >= NR_BANKS) { - pr_crit("NR_BANKS too low, ignoring memory at 0x%08llx\n", - (long long)start); - return -EINVAL; - } - /* * Ensure that start/size are aligned to a page boundary. * Size is appropriately rounded down, start is rounded up. @@ -677,17 +673,17 @@ int __init arm_add_memory(u64 start, u64 size) aligned_start = PHYS_OFFSET; } - bank->start = aligned_start; - bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1); + start = aligned_start; + size = size & ~(phys_addr_t)(PAGE_SIZE - 1); /* * Check whether this memory region has non-zero size or * invalid node number. */ - if (bank->size == 0) + if (size == 0) return -EINVAL; - meminfo.nr_banks++; + memblock_add(start, size); return 0; } @@ -695,6 +691,7 @@ int __init arm_add_memory(u64 start, u64 size) * Pick out the memory size. We look for mem=size@start, * where start and size are "size[KkMm]" */ + static int __init early_mem(char *p) { static int usermem __initdata = 0; @@ -709,7 +706,8 @@ static int __init early_mem(char *p) */ if (usermem == 0) { usermem = 1; - meminfo.nr_banks = 0; + memblock_remove(memblock_start_of_DRAM(), + memblock_end_of_DRAM() - memblock_start_of_DRAM()); } start = PHYS_OFFSET; @@ -854,13 +852,6 @@ static void __init reserve_crashkernel(void) static inline void reserve_crashkernel(void) {} #endif /* CONFIG_KEXEC */ -static int __init meminfo_cmp(const void *_a, const void *_b) -{ - const struct membank *a = _a, *b = _b; - long cmp = bank_pfn_start(a) - bank_pfn_start(b); - return cmp < 0 ? -1 : cmp > 0 ? 1 : 0; -} - void __init hyp_mode_check(void) { #ifdef CONFIG_ARM_VIRT_EXT @@ -903,12 +894,10 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); - sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); - early_paging_init(mdesc, lookup_processor_type(read_cpuid_id())); setup_dma_zone(mdesc); sanity_check_meminfo(); - arm_memblock_init(&meminfo, mdesc); + arm_memblock_init(mdesc); paging_init(mdesc); request_standard_resources(mdesc); diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S index b907d9b790ab..1b880db2a033 100644 --- a/arch/arm/kernel/sleep.S +++ b/arch/arm/kernel/sleep.S @@ -127,6 +127,10 @@ ENDPROC(cpu_resume_after_mmu) .align ENTRY(cpu_resume) ARM_BE8(setend be) @ ensure we are in BE mode +#ifdef CONFIG_ARM_VIRT_EXT + bl __hyp_stub_install_secondary +#endif + safe_svcmode_maskall r1 mov r1, #0 ALT_SMP(mrc p15, 0, r0, c0, c0, 5) ALT_UP_B(1f) @@ -144,7 +148,6 @@ ARM_BE8(setend be) @ ensure we are in BE mode ldr r0, [r0, #SLEEP_SAVE_SP_PHYS] ldr r0, [r0, r1, lsl #2] - setmode PSR_I_BIT | PSR_F_BIT | SVC_MODE, r1 @ set SVC, irqs off @ load phys pgd, stack, resume fn ARM( ldmia r0!, {r1, sp, pc} ) THUMB( ldmia r0!, {r1, r2, r3} ) diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index af4e8c8a5422..f065eb05d254 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -3,6 +3,7 @@ #include <linux/stacktrace.h> #include <asm/stacktrace.h> +#include <asm/traps.h> #if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) /* @@ -61,6 +62,7 @@ EXPORT_SYMBOL(walk_stackframe); #ifdef CONFIG_STACKTRACE struct stack_trace_data { struct stack_trace *trace; + unsigned long last_pc; unsigned int no_sched_functions; unsigned int skip; }; @@ -69,6 +71,7 @@ static int save_trace(struct stackframe *frame, void *d) { struct stack_trace_data *data = d; struct stack_trace *trace = data->trace; + struct pt_regs *regs; unsigned long addr = frame->pc; if (data->no_sched_functions && in_sched_functions(addr)) @@ -80,16 +83,39 @@ static int save_trace(struct stackframe *frame, void *d) trace->entries[trace->nr_entries++] = addr; + if (trace->nr_entries >= trace->max_entries) + return 1; + + /* + * in_exception_text() is designed to test if the PC is one of + * the functions which has an exception stack above it, but + * unfortunately what is in frame->pc is the return LR value, + * not the saved PC value. So, we need to track the previous + * frame PC value when doing this. + */ + addr = data->last_pc; + data->last_pc = frame->pc; + if (!in_exception_text(addr)) + return 0; + + regs = (struct pt_regs *)frame->sp; + + trace->entries[trace->nr_entries++] = regs->ARM_pc; + return trace->nr_entries >= trace->max_entries; } -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +/* This must be noinline to so that our skip calculation works correctly */ +static noinline void __save_stack_trace(struct task_struct *tsk, + struct stack_trace *trace, unsigned int nosched) { struct stack_trace_data data; struct stackframe frame; data.trace = trace; + data.last_pc = ULONG_MAX; data.skip = trace->skip; + data.no_sched_functions = nosched; if (tsk != current) { #ifdef CONFIG_SMP @@ -102,7 +128,6 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; return; #else - data.no_sched_functions = 1; frame.fp = thread_saved_fp(tsk); frame.sp = thread_saved_sp(tsk); frame.lr = 0; /* recovered from the stack */ @@ -111,11 +136,12 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } else { register unsigned long current_sp asm ("sp"); - data.no_sched_functions = 0; + /* We don't want this function nor the caller */ + data.skip += 2; frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_sp; frame.lr = (unsigned long)__builtin_return_address(0); - frame.pc = (unsigned long)save_stack_trace_tsk; + frame.pc = (unsigned long)__save_stack_trace; } walk_stackframe(&frame, save_trace, &data); @@ -123,9 +149,33 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) trace->entries[trace->nr_entries++] = ULONG_MAX; } +void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) +{ + struct stack_trace_data data; + struct stackframe frame; + + data.trace = trace; + data.skip = trace->skip; + data.no_sched_functions = 0; + + frame.fp = regs->ARM_fp; + frame.sp = regs->ARM_sp; + frame.lr = regs->ARM_lr; + frame.pc = regs->ARM_pc; + + walk_stackframe(&frame, save_trace, &data); + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + +void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + __save_stack_trace(tsk, trace, 1); +} + void save_stack_trace(struct stack_trace *trace) { - save_stack_trace_tsk(current, trace); + __save_stack_trace(current, trace, 0); } EXPORT_SYMBOL_GPL(save_stack_trace); #endif diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 71e1fec6d31a..3997c411c140 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -91,13 +91,13 @@ static void __init parse_dt_topology(void) { const struct cpu_efficiency *cpu_eff; struct device_node *cn = NULL; - unsigned long min_capacity = (unsigned long)(-1); + unsigned long min_capacity = ULONG_MAX; unsigned long max_capacity = 0; unsigned long capacity = 0; - int alloc_size, cpu = 0; + int cpu = 0; - alloc_size = nr_cpu_ids * sizeof(*__cpu_capacity); - __cpu_capacity = kzalloc(alloc_size, GFP_NOWAIT); + __cpu_capacity = kcalloc(nr_cpu_ids, sizeof(*__cpu_capacity), + GFP_NOWAIT); for_each_possible_cpu(cpu) { const u32 *rate; diff --git a/arch/arm/kernel/uprobes.c b/arch/arm/kernel/uprobes.c index f9bacee973bf..56adf9c1fde0 100644 --- a/arch/arm/kernel/uprobes.c +++ b/arch/arm/kernel/uprobes.c @@ -113,6 +113,26 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, return 0; } +void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + void *xol_page_kaddr = kmap_atomic(page); + void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + + preempt_disable(); + + /* Initialize the slot */ + memcpy(dst, src, len); + + /* flush caches (dcache/icache) */ + flush_uprobe_xol_access(page, vaddr, dst, len); + + preempt_enable(); + + kunmap_atomic(xol_page_kaddr); +} + + int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) { struct uprobe_task *utask = current->utask; diff --git a/arch/arm/mach-at91/sysirq_mask.c b/arch/arm/mach-at91/sysirq_mask.c index 2ba694f9626b..f8bc3511a8c8 100644 --- a/arch/arm/mach-at91/sysirq_mask.c +++ b/arch/arm/mach-at91/sysirq_mask.c @@ -25,24 +25,28 @@ #include "generic.h" -#define AT91_RTC_IDR 0x24 /* Interrupt Disable Register */ -#define AT91_RTC_IMR 0x28 /* Interrupt Mask Register */ +#define AT91_RTC_IDR 0x24 /* Interrupt Disable Register */ +#define AT91_RTC_IMR 0x28 /* Interrupt Mask Register */ +#define AT91_RTC_IRQ_MASK 0x1f /* Available IRQs mask */ void __init at91_sysirq_mask_rtc(u32 rtc_base) { void __iomem *base; - u32 mask; base = ioremap(rtc_base, 64); if (!base) return; - mask = readl_relaxed(base + AT91_RTC_IMR); - if (mask) { - pr_info("AT91: Disabling rtc irq\n"); - writel_relaxed(mask, base + AT91_RTC_IDR); - (void)readl_relaxed(base + AT91_RTC_IMR); /* flush */ - } + /* + * sam9x5 SoCs have the following errata: + * "RTC: Interrupt Mask Register cannot be used + * Interrupt Mask Register read always returns 0." + * + * Hence we're not relying on IMR values to disable + * interrupts. + */ + writel_relaxed(AT91_RTC_IRQ_MASK, base + AT91_RTC_IDR); + (void)readl_relaxed(base + AT91_RTC_IMR); /* flush */ iounmap(base); } diff --git a/arch/arm/mach-bcm/bcm_5301x.c b/arch/arm/mach-bcm/bcm_5301x.c index edff69761e04..e9bcbdbce555 100644 --- a/arch/arm/mach-bcm/bcm_5301x.c +++ b/arch/arm/mach-bcm/bcm_5301x.c @@ -43,19 +43,14 @@ static void __init bcm5301x_init_early(void) "imprecise external abort"); } -static void __init bcm5301x_dt_init(void) -{ - l2x0_of_init(0, ~0UL); - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - static const char __initconst *bcm5301x_dt_compat[] = { "brcm,bcm4708", NULL, }; DT_MACHINE_START(BCM5301X, "BCM5301X") + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .init_early = bcm5301x_init_early, - .init_machine = bcm5301x_dt_init, .dt_compat = bcm5301x_dt_compat, MACHINE_END diff --git a/arch/arm/mach-berlin/berlin.c b/arch/arm/mach-berlin/berlin.c index 025bcb5473eb..ac181c6797ee 100644 --- a/arch/arm/mach-berlin/berlin.c +++ b/arch/arm/mach-berlin/berlin.c @@ -18,16 +18,6 @@ #include <asm/hardware/cache-l2x0.h> #include <asm/mach/arch.h> -static void __init berlin_init_machine(void) -{ - /* - * with DT probing for L2CCs, berlin_init_machine can be removed. - * Note: 88DE3005 (Armada 1500-mini) uses pl310 l2cc - */ - l2x0_of_init(0x70c00000, 0xfeffffff); - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - static const char * const berlin_dt_compat[] = { "marvell,berlin", NULL, @@ -35,5 +25,10 @@ static const char * const berlin_dt_compat[] = { DT_MACHINE_START(BERLIN_DT, "Marvell Berlin") .dt_compat = berlin_dt_compat, - .init_machine = berlin_init_machine, + /* + * with DT probing for L2CCs, berlin_init_machine can be removed. + * Note: 88DE3005 (Armada 1500-mini) uses pl310 l2cc + */ + .l2c_aux_val = 0x30c00000, + .l2c_aux_mask = 0xfeffffff, MACHINE_END diff --git a/arch/arm/mach-clps711x/board-clep7312.c b/arch/arm/mach-clps711x/board-clep7312.c index 221b9de32dd6..94a7add88a3f 100644 --- a/arch/arm/mach-clps711x/board-clep7312.c +++ b/arch/arm/mach-clps711x/board-clep7312.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/types.h> #include <linux/string.h> +#include <linux/memblock.h> #include <asm/setup.h> #include <asm/mach-types.h> @@ -26,11 +27,9 @@ #include "common.h" static void __init -fixup_clep7312(struct tag *tags, char **cmdline, struct meminfo *mi) +fixup_clep7312(struct tag *tags, char **cmdline) { - mi->nr_banks=1; - mi->bank[0].start = 0xc0000000; - mi->bank[0].size = 0x01000000; + memblock_add(0xc0000000, 0x01000000); } MACHINE_START(CLEP7212, "Cirrus Logic 7212/7312") diff --git a/arch/arm/mach-clps711x/board-edb7211.c b/arch/arm/mach-clps711x/board-edb7211.c index 077609841f14..f9828f89972a 100644 --- a/arch/arm/mach-clps711x/board-edb7211.c +++ b/arch/arm/mach-clps711x/board-edb7211.c @@ -16,6 +16,7 @@ #include <linux/interrupt.h> #include <linux/backlight.h> #include <linux/platform_device.h> +#include <linux/memblock.h> #include <linux/mtd/physmap.h> #include <linux/mtd/partitions.h> @@ -133,7 +134,7 @@ static void __init edb7211_reserve(void) } static void __init -fixup_edb7211(struct tag *tags, char **cmdline, struct meminfo *mi) +fixup_edb7211(struct tag *tags, char **cmdline) { /* * Bank start addresses are not present in the information @@ -143,11 +144,8 @@ fixup_edb7211(struct tag *tags, char **cmdline, struct meminfo *mi) * Banks sizes _are_ present in the param block, but we're * not using that information yet. */ - mi->bank[0].start = 0xc0000000; - mi->bank[0].size = SZ_8M; - mi->bank[1].start = 0xc1000000; - mi->bank[1].size = SZ_8M; - mi->nr_banks = 2; + memblock_add(0xc0000000, SZ_8M); + memblock_add(0xc1000000, SZ_8M); } static void __init edb7211_init(void) diff --git a/arch/arm/mach-clps711x/board-p720t.c b/arch/arm/mach-clps711x/board-p720t.c index 67b733744ed7..0cf0e51e6546 100644 --- a/arch/arm/mach-clps711x/board-p720t.c +++ b/arch/arm/mach-clps711x/board-p720t.c @@ -295,7 +295,7 @@ static struct generic_bl_info p720t_lcd_backlight_pdata = { }; static void __init -fixup_p720t(struct tag *tag, char **cmdline, struct meminfo *mi) +fixup_p720t(struct tag *tag, char **cmdline) { /* * Our bootloader doesn't setup any tags (yet). diff --git a/arch/arm/mach-cns3xxx/core.c b/arch/arm/mach-cns3xxx/core.c index 2ae28a69e3e5..f85449a6accd 100644 --- a/arch/arm/mach-cns3xxx/core.c +++ b/arch/arm/mach-cns3xxx/core.c @@ -272,9 +272,9 @@ void __init cns3xxx_l2x0_init(void) * * 1 cycle of latency for setup, read and write accesses */ - val = readl(base + L2X0_TAG_LATENCY_CTRL); + val = readl(base + L310_TAG_LATENCY_CTRL); val &= 0xfffff888; - writel(val, base + L2X0_TAG_LATENCY_CTRL); + writel(val, base + L310_TAG_LATENCY_CTRL); /* * Data RAM Control register @@ -285,12 +285,12 @@ void __init cns3xxx_l2x0_init(void) * * 1 cycle of latency for setup, read and write accesses */ - val = readl(base + L2X0_DATA_LATENCY_CTRL); + val = readl(base + L310_DATA_LATENCY_CTRL); val &= 0xfffff888; - writel(val, base + L2X0_DATA_LATENCY_CTRL); + writel(val, base + L310_DATA_LATENCY_CTRL); /* 32 KiB, 8-way, parity disable */ - l2x0_init(base, 0x00540000, 0xfe000fff); + l2x0_init(base, 0x00500000, 0xfe0f0fff); } #endif /* CONFIG_CACHE_L2X0 */ diff --git a/arch/arm/mach-ep93xx/crunch-bits.S b/arch/arm/mach-ep93xx/crunch-bits.S index 0ec9bb48fab9..e96923a3017b 100644 --- a/arch/arm/mach-ep93xx/crunch-bits.S +++ b/arch/arm/mach-ep93xx/crunch-bits.S @@ -16,6 +16,7 @@ #include <asm/ptrace.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> +#include <asm/assembler.h> #include <mach/ep93xx-regs.h> /* @@ -62,14 +63,16 @@ * r9 = ret_from_exception * lr = undefined instr exit * - * called from prefetch exception handler with interrupts disabled + * called from prefetch exception handler with interrupts enabled */ ENTRY(crunch_task_enable) + inc_preempt_count r10, r3 + ldr r8, =(EP93XX_APB_VIRT_BASE + 0x00130000) @ syscon addr ldr r1, [r8, #0x80] tst r1, #0x00800000 @ access to crunch enabled? - movne pc, lr @ if so no business here + bne 2f @ if so no business here mov r3, #0xaa @ unlock syscon swlock str r3, [r8, #0xc0] orr r1, r1, #0x00800000 @ enable access to crunch @@ -142,7 +145,7 @@ crunch_save: teq r0, #0 @ anything to load? cfldr64eq mvdx0, [r1, #CRUNCH_MVDX0] @ mvdx0 was clobbered - moveq pc, lr + beq 1f crunch_load: cfldr64 mvdx0, [r0, #CRUNCH_DSPSC] @ load status word @@ -190,6 +193,11 @@ crunch_load: cfldr64 mvdx14, [r0, #CRUNCH_MVDX14] cfldr64 mvdx15, [r0, #CRUNCH_MVDX15] +1: +#ifdef CONFIG_PREEMPT_COUNT + get_thread_info r10 +#endif +2: dec_preempt_count r10, r3 mov pc, lr /* diff --git a/arch/arm/mach-exynos/common.h b/arch/arm/mach-exynos/common.h index 80b90e346ca0..16617bdb37a9 100644 --- a/arch/arm/mach-exynos/common.h +++ b/arch/arm/mach-exynos/common.h @@ -153,7 +153,6 @@ enum sys_powerdown { NUM_SYS_POWERDOWN, }; -extern unsigned long l2x0_regs_phys; struct exynos_pmu_conf { void __iomem *reg; unsigned int val[NUM_SYS_POWERDOWN]; diff --git a/arch/arm/mach-exynos/exynos.c b/arch/arm/mach-exynos/exynos.c index a56ce45a3f90..90aab4d75d08 100644 --- a/arch/arm/mach-exynos/exynos.c +++ b/arch/arm/mach-exynos/exynos.c @@ -30,9 +30,6 @@ #include "mfc.h" #include "regs-pmu.h" -#define L2_AUX_VAL 0x7C470001 -#define L2_AUX_MASK 0xC200ffff - static struct map_desc exynos4_iodesc[] __initdata = { { .virtual = (unsigned long)S3C_VA_SYS, @@ -246,25 +243,6 @@ void __init exynos_init_io(void) exynos_map_io(); } -static int __init exynos4_l2x0_cache_init(void) -{ - int ret; - - if (!soc_is_exynos4()) - return 0; - - ret = l2x0_of_init(L2_AUX_VAL, L2_AUX_MASK); - if (ret) - return ret; - - if (IS_ENABLED(CONFIG_S5P_SLEEP)) { - l2x0_regs_phys = virt_to_phys(&l2x0_saved_regs); - clean_dcache_area(&l2x0_regs_phys, sizeof(unsigned long)); - } - return 0; -} -early_initcall(exynos4_l2x0_cache_init); - static void __init exynos_dt_machine_init(void) { struct device_node *i2c_np; @@ -333,6 +311,8 @@ static void __init exynos_reserve(void) DT_MACHINE_START(EXYNOS_DT, "SAMSUNG EXYNOS (Flattened Device Tree)") /* Maintainer: Thomas Abraham <thomas.abraham@linaro.org> */ /* Maintainer: Kukjin Kim <kgene.kim@samsung.com> */ + .l2c_aux_val = 0x3c400001, + .l2c_aux_mask = 0xc20fffff, .smp = smp_ops(exynos_smp_ops), .map_io = exynos_init_io, .init_early = exynos_firmware_init, diff --git a/arch/arm/mach-exynos/sleep.S b/arch/arm/mach-exynos/sleep.S index a2613e944e10..108a45f4bb62 100644 --- a/arch/arm/mach-exynos/sleep.S +++ b/arch/arm/mach-exynos/sleep.S @@ -16,8 +16,6 @@ */ #include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/hardware/cache-l2x0.h> #define CPU_MASK 0xff0ffff0 #define CPU_CORTEX_A9 0x410fc090 @@ -53,33 +51,7 @@ ENTRY(exynos_cpu_resume) and r0, r0, r1 ldr r1, =CPU_CORTEX_A9 cmp r0, r1 - bne skip_l2_resume - adr r0, l2x0_regs_phys - ldr r0, [r0] - cmp r0, #0 - beq skip_l2_resume - ldr r1, [r0, #L2X0_R_PHY_BASE] - ldr r2, [r1, #L2X0_CTRL] - tst r2, #0x1 - bne skip_l2_resume - ldr r2, [r0, #L2X0_R_AUX_CTRL] - str r2, [r1, #L2X0_AUX_CTRL] - ldr r2, [r0, #L2X0_R_TAG_LATENCY] - str r2, [r1, #L2X0_TAG_LATENCY_CTRL] - ldr r2, [r0, #L2X0_R_DATA_LATENCY] - str r2, [r1, #L2X0_DATA_LATENCY_CTRL] - ldr r2, [r0, #L2X0_R_PREFETCH_CTRL] - str r2, [r1, #L2X0_PREFETCH_CTRL] - ldr r2, [r0, #L2X0_R_PWR_CTRL] - str r2, [r1, #L2X0_POWER_CTRL] - mov r2, #1 - str r2, [r1, #L2X0_CTRL] -skip_l2_resume: + bleq l2c310_early_resume #endif b cpu_resume ENDPROC(exynos_cpu_resume) -#ifdef CONFIG_CACHE_L2X0 - .globl l2x0_regs_phys -l2x0_regs_phys: - .long 0 -#endif diff --git a/arch/arm/mach-footbridge/cats-hw.c b/arch/arm/mach-footbridge/cats-hw.c index da0415094856..8f05489671b7 100644 --- a/arch/arm/mach-footbridge/cats-hw.c +++ b/arch/arm/mach-footbridge/cats-hw.c @@ -76,7 +76,7 @@ __initcall(cats_hw_init); * hard reboots fail on early boards. */ static void __init -fixup_cats(struct tag *tags, char **cmdline, struct meminfo *mi) +fixup_cats(struct tag *tags, char **cmdline) { #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE) screen_info.orig_video_lines = 25; diff --git a/arch/arm/mach-footbridge/netwinder-hw.c b/arch/arm/mach-footbridge/netwinder-hw.c index eb1fa5c84723..cdee08c6d239 100644 --- a/arch/arm/mach-footbridge/netwinder-hw.c +++ b/arch/arm/mach-footbridge/netwinder-hw.c @@ -620,7 +620,7 @@ __initcall(nw_hw_init); * the parameter page. */ static void __init -fixup_netwinder(struct tag *tags, char **cmdline, struct meminfo *mi) +fixup_netwinder(struct tag *tags, char **cmdline) { #ifdef CONFIG_ISAPNP extern int isapnp_disable; diff --git a/arch/arm/mach-highbank/highbank.c b/arch/arm/mach-highbank/highbank.c index c7de89b263dd..8c35ae4ff176 100644 --- a/arch/arm/mach-highbank/highbank.c +++ b/arch/arm/mach-highbank/highbank.c @@ -51,11 +51,13 @@ static void __init highbank_scu_map_io(void) } -static void highbank_l2x0_disable(void) +static void highbank_l2c310_write_sec(unsigned long val, unsigned reg) { - outer_flush_all(); - /* Disable PL310 L2 Cache controller */ - highbank_smc1(0x102, 0x0); + if (reg == L2X0_CTRL) + highbank_smc1(0x102, val); + else + WARN_ONCE(1, "Highbank L2C310: ignoring write to reg 0x%x\n", + reg); } static void __init highbank_init_irq(void) @@ -64,14 +66,6 @@ static void __init highbank_init_irq(void) if (of_find_compatible_node(NULL, NULL, "arm,cortex-a9")) highbank_scu_map_io(); - - /* Enable PL310 L2 Cache controller */ - if (IS_ENABLED(CONFIG_CACHE_L2X0) && - of_find_compatible_node(NULL, NULL, "arm,pl310-cache")) { - highbank_smc1(0x102, 0x1); - l2x0_of_init(0, ~0UL); - outer_cache.disable = highbank_l2x0_disable; - } } static void highbank_power_off(void) @@ -185,6 +179,9 @@ DT_MACHINE_START(HIGHBANK, "Highbank") #if defined(CONFIG_ZONE_DMA) && defined(CONFIG_ARM_LPAE) .dma_zone_size = (4ULL * SZ_1G), #endif + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, + .l2c_write_sec = highbank_l2c310_write_sec, .init_irq = highbank_init_irq, .init_machine = highbank_init, .dt_compat = highbank_match, diff --git a/arch/arm/mach-imx/mach-vf610.c b/arch/arm/mach-imx/mach-vf610.c index 2d8aef5a6efa..c44602758120 100644 --- a/arch/arm/mach-imx/mach-vf610.c +++ b/arch/arm/mach-imx/mach-vf610.c @@ -20,19 +20,14 @@ static void __init vf610_init_machine(void) of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); } -static void __init vf610_init_irq(void) -{ - l2x0_of_init(0, ~0UL); - irqchip_init(); -} - static const char *vf610_dt_compat[] __initconst = { "fsl,vf610", NULL, }; DT_MACHINE_START(VYBRID_VF610, "Freescale Vybrid VF610 (Device Tree)") - .init_irq = vf610_init_irq, + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .init_machine = vf610_init_machine, .dt_compat = vf610_dt_compat, .restart = mxc_restart, diff --git a/arch/arm/mach-imx/suspend-imx6.S b/arch/arm/mach-imx/suspend-imx6.S index 20048ff05739..fe123b079c05 100644 --- a/arch/arm/mach-imx/suspend-imx6.S +++ b/arch/arm/mach-imx/suspend-imx6.S @@ -334,28 +334,10 @@ ENDPROC(imx6_suspend) * turned into relative ones. */ -#ifdef CONFIG_CACHE_L2X0 - .macro pl310_resume - adr r0, l2x0_saved_regs_offset - ldr r2, [r0] - add r2, r2, r0 - ldr r0, [r2, #L2X0_R_PHY_BASE] @ get physical base of l2x0 - ldr r1, [r2, #L2X0_R_AUX_CTRL] @ get aux_ctrl value - str r1, [r0, #L2X0_AUX_CTRL] @ restore aux_ctrl - mov r1, #0x1 - str r1, [r0, #L2X0_CTRL] @ re-enable L2 - .endm - -l2x0_saved_regs_offset: - .word l2x0_saved_regs - . - -#else - .macro pl310_resume - .endm -#endif - ENTRY(v7_cpu_resume) bl v7_invalidate_l1 - pl310_resume +#ifdef CONFIG_CACHE_L2X0 + bl l2c310_early_resume +#endif b cpu_resume ENDPROC(v7_cpu_resume) diff --git a/arch/arm/mach-imx/system.c b/arch/arm/mach-imx/system.c index 5e3027d3692f..3b0733edb68c 100644 --- a/arch/arm/mach-imx/system.c +++ b/arch/arm/mach-imx/system.c @@ -124,7 +124,7 @@ void __init imx_init_l2cache(void) } /* Configure the L2 PREFETCH and POWER registers */ - val = readl_relaxed(l2x0_base + L2X0_PREFETCH_CTRL); + val = readl_relaxed(l2x0_base + L310_PREFETCH_CTRL); val |= 0x70800000; /* * The L2 cache controller(PL310) version on the i.MX6D/Q is r3p1-50rel0 @@ -137,14 +137,12 @@ void __init imx_init_l2cache(void) */ if (cpu_is_imx6q()) val &= ~(1 << 30 | 1 << 23); - writel_relaxed(val, l2x0_base + L2X0_PREFETCH_CTRL); - val = L2X0_DYNAMIC_CLK_GATING_EN | L2X0_STNDBY_MODE_EN; - writel_relaxed(val, l2x0_base + L2X0_POWER_CTRL); + writel_relaxed(val, l2x0_base + L310_PREFETCH_CTRL); iounmap(l2x0_base); of_node_put(np); out: - l2x0_of_init(0, ~0UL); + l2x0_of_init(0, ~0); } #endif diff --git a/arch/arm/mach-msm/board-halibut.c b/arch/arm/mach-msm/board-halibut.c index a77529887cbc..61bfe584a9d7 100644 --- a/arch/arm/mach-msm/board-halibut.c +++ b/arch/arm/mach-msm/board-halibut.c @@ -83,11 +83,6 @@ static void __init halibut_init(void) platform_add_devices(devices, ARRAY_SIZE(devices)); } -static void __init halibut_fixup(struct tag *tags, char **cmdline, - struct meminfo *mi) -{ -} - static void __init halibut_map_io(void) { msm_map_common_io(); @@ -100,7 +95,6 @@ static void __init halibut_init_late(void) MACHINE_START(HALIBUT, "Halibut Board (QCT SURF7200A)") .atag_offset = 0x100, - .fixup = halibut_fixup, .map_io = halibut_map_io, .init_early = halibut_init_early, .init_irq = halibut_init_irq, diff --git a/arch/arm/mach-msm/board-mahimahi.c b/arch/arm/mach-msm/board-mahimahi.c index 7d9981cb400e..873c3ca3cd7e 100644 --- a/arch/arm/mach-msm/board-mahimahi.c +++ b/arch/arm/mach-msm/board-mahimahi.c @@ -22,6 +22,7 @@ #include <linux/io.h> #include <linux/kernel.h> #include <linux/platform_device.h> +#include <linux/memblock.h> #include <asm/mach-types.h> #include <asm/mach/arch.h> @@ -52,16 +53,10 @@ static void __init mahimahi_init(void) platform_add_devices(devices, ARRAY_SIZE(devices)); } -static void __init mahimahi_fixup(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init mahimahi_fixup(struct tag *tags, char **cmdline) { - mi->nr_banks = 2; - mi->bank[0].start = PHYS_OFFSET; - mi->bank[0].node = PHYS_TO_NID(PHYS_OFFSET); - mi->bank[0].size = (219*1024*1024); - mi->bank[1].start = MSM_HIGHMEM_BASE; - mi->bank[1].node = PHYS_TO_NID(MSM_HIGHMEM_BASE); - mi->bank[1].size = MSM_HIGHMEM_SIZE; + memblock_add(PHYS_OFFSET, 219*SZ_1M); + memblock_add(MSM_HIGHMEM_BASE, MSM_HIGHMEM_SIZE); } static void __init mahimahi_map_io(void) diff --git a/arch/arm/mach-msm/board-msm7x30.c b/arch/arm/mach-msm/board-msm7x30.c index 0c4c200e1221..245884319d2e 100644 --- a/arch/arm/mach-msm/board-msm7x30.c +++ b/arch/arm/mach-msm/board-msm7x30.c @@ -40,8 +40,7 @@ #include "proc_comm.h" #include "common.h" -static void __init msm7x30_fixup(struct tag *tag, char **cmdline, - struct meminfo *mi) +static void __init msm7x30_fixup(struct tag *tag, char **cmdline) { for (; tag->hdr.size; tag = tag_next(tag)) if (tag->hdr.tag == ATAG_MEM && tag->u.mem.start == 0x200000) { diff --git a/arch/arm/mach-msm/board-sapphire.c b/arch/arm/mach-msm/board-sapphire.c index 327605174d63..e50967926dcd 100644 --- a/arch/arm/mach-msm/board-sapphire.c +++ b/arch/arm/mach-msm/board-sapphire.c @@ -35,6 +35,7 @@ #include <linux/mtd/nand.h> #include <linux/mtd/partitions.h> +#include <linux/memblock.h> #include "gpio_chip.h" #include "board-sapphire.h" @@ -74,22 +75,18 @@ static struct map_desc sapphire_io_desc[] __initdata = { } }; -static void __init sapphire_fixup(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init sapphire_fixup(struct tag *tags, char **cmdline) { int smi_sz = parse_tag_smi((const struct tag *)tags); - mi->nr_banks = 1; - mi->bank[0].start = PHYS_OFFSET; - mi->bank[0].node = PHYS_TO_NID(PHYS_OFFSET); if (smi_sz == 32) { - mi->bank[0].size = (84*1024*1024); + memblock_add(PHYS_OFFSET, 84*SZ_1M); } else if (smi_sz == 64) { - mi->bank[0].size = (101*1024*1024); + memblock_add(PHYS_OFFSET, 101*SZ_1M); } else { + memblock_add(PHYS_OFFSET, 101*SZ_1M); /* Give a default value when not get smi size */ smi_sz = 64; - mi->bank[0].size = (101*1024*1024); } } diff --git a/arch/arm/mach-msm/board-trout.c b/arch/arm/mach-msm/board-trout.c index 5edfbd904d06..f72b07de2152 100644 --- a/arch/arm/mach-msm/board-trout.c +++ b/arch/arm/mach-msm/board-trout.c @@ -19,6 +19,7 @@ #include <linux/init.h> #include <linux/platform_device.h> #include <linux/clkdev.h> +#include <linux/memblock.h> #include <asm/system_info.h> #include <asm/mach-types.h> @@ -55,12 +56,9 @@ static void __init trout_init_irq(void) msm_init_irq(); } -static void __init trout_fixup(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init trout_fixup(struct tag *tags, char **cmdline) { - mi->nr_banks = 1; - mi->bank[0].start = PHYS_OFFSET; - mi->bank[0].size = (101*1024*1024); + memblock_add(PHYS_OFFSET, 101*SZ_1M); } static void __init trout_init(void) diff --git a/arch/arm/mach-mvebu/board-v7.c b/arch/arm/mach-mvebu/board-v7.c index 01cfce6ac20b..8bb742fdf5ca 100644 --- a/arch/arm/mach-mvebu/board-v7.c +++ b/arch/arm/mach-mvebu/board-v7.c @@ -78,7 +78,6 @@ static void __init mvebu_timer_and_clk_init(void) mvebu_scu_enable(); coherency_init(); BUG_ON(mvebu_mbus_dt_init(coherency_available())); - l2x0_of_init(0, ~0UL); if (of_machine_is_compatible("marvell,armada375")) hook_fault_code(16 + 6, armada_375_external_abort_wa, SIGBUS, 0, @@ -182,6 +181,8 @@ static const char * const armada_370_xp_dt_compat[] = { }; DT_MACHINE_START(ARMADA_370_XP_DT, "Marvell Armada 370/XP (Device Tree)") + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .smp = smp_ops(armada_xp_smp_ops), .init_machine = mvebu_dt_init, .init_time = mvebu_timer_and_clk_init, @@ -195,6 +196,8 @@ static const char * const armada_375_dt_compat[] = { }; DT_MACHINE_START(ARMADA_375_DT, "Marvell Armada 375 (Device Tree)") + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .init_time = mvebu_timer_and_clk_init, .init_machine = mvebu_dt_init, .restart = mvebu_restart, @@ -208,6 +211,8 @@ static const char * const armada_38x_dt_compat[] = { }; DT_MACHINE_START(ARMADA_38X_DT, "Marvell Armada 380/385 (Device Tree)") + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .init_time = mvebu_timer_and_clk_init, .restart = mvebu_restart, .dt_compat = armada_38x_dt_compat, diff --git a/arch/arm/mach-nomadik/cpu-8815.c b/arch/arm/mach-nomadik/cpu-8815.c index 4a1065e41e9c..9116ca476d7c 100644 --- a/arch/arm/mach-nomadik/cpu-8815.c +++ b/arch/arm/mach-nomadik/cpu-8815.c @@ -143,23 +143,16 @@ static int __init cpu8815_mmcsd_init(void) } device_initcall(cpu8815_mmcsd_init); -static void __init cpu8815_init_of(void) -{ -#ifdef CONFIG_CACHE_L2X0 - /* At full speed latency must be >=2, so 0x249 in low bits */ - l2x0_of_init(0x00730249, 0xfe000fff); -#endif - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - static const char * cpu8815_board_compat[] = { "calaosystems,usb-s8815", NULL, }; DT_MACHINE_START(NOMADIK_DT, "Nomadik STn8815") + /* At full speed latency must be >=2, so 0x249 in low bits */ + .l2c_aux_val = 0x00700249, + .l2c_aux_mask = 0xfe0fefff, .map_io = cpu8815_map_io, - .init_machine = cpu8815_init_of, .restart = cpu8815_restart, .dt_compat = cpu8815_board_compat, MACHINE_END diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index cb31d4390d52..0ba482638ebf 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -65,6 +65,7 @@ config SOC_AM43XX select ARCH_HAS_OPP select ARM_GIC select MACH_OMAP_GENERIC + select MIGHT_HAVE_CACHE_L2X0 config SOC_DRA7XX bool "TI DRA7XX" diff --git a/arch/arm/mach-omap2/common.h b/arch/arm/mach-omap2/common.h index d88aff7baff8..ff029737c8f0 100644 --- a/arch/arm/mach-omap2/common.h +++ b/arch/arm/mach-omap2/common.h @@ -91,6 +91,7 @@ extern void omap3_sync32k_timer_init(void); extern void omap3_secure_sync32k_timer_init(void); extern void omap3_gptimer_timer_init(void); extern void omap4_local_timer_init(void); +int omap_l2_cache_init(void); extern void omap5_realtime_timer_init(void); void omap2420_init_early(void); diff --git a/arch/arm/mach-omap2/io.c b/arch/arm/mach-omap2/io.c index 4ec3b4a93843..8f559450c876 100644 --- a/arch/arm/mach-omap2/io.c +++ b/arch/arm/mach-omap2/io.c @@ -609,6 +609,7 @@ void __init am43xx_init_early(void) am43xx_clockdomains_init(); am43xx_hwmod_init(); omap_hwmod_init_postsetup(); + omap_l2_cache_init(); omap_clk_soc_init = am43xx_dt_clk_init; } @@ -640,6 +641,7 @@ void __init omap4430_init_early(void) omap44xx_clockdomains_init(); omap44xx_hwmod_init(); omap_hwmod_init_postsetup(); + omap_l2_cache_init(); omap_clk_soc_init = omap4xxx_dt_clk_init; } diff --git a/arch/arm/mach-omap2/omap-mpuss-lowpower.c b/arch/arm/mach-omap2/omap-mpuss-lowpower.c index eb76e47091ad..4001325f90fb 100644 --- a/arch/arm/mach-omap2/omap-mpuss-lowpower.c +++ b/arch/arm/mach-omap2/omap-mpuss-lowpower.c @@ -187,19 +187,15 @@ static void l2x0_pwrst_prepare(unsigned int cpu_id, unsigned int save_state) * in every restore MPUSS OFF path. */ #ifdef CONFIG_CACHE_L2X0 -static void save_l2x0_context(void) +static void __init save_l2x0_context(void) { - u32 val; - void __iomem *l2x0_base = omap4_get_l2cache_base(); - if (l2x0_base) { - val = readl_relaxed(l2x0_base + L2X0_AUX_CTRL); - writel_relaxed(val, sar_base + L2X0_AUXCTRL_OFFSET); - val = readl_relaxed(l2x0_base + L2X0_PREFETCH_CTRL); - writel_relaxed(val, sar_base + L2X0_PREFETCH_CTRL_OFFSET); - } + writel_relaxed(l2x0_saved_regs.aux_ctrl, + sar_base + L2X0_AUXCTRL_OFFSET); + writel_relaxed(l2x0_saved_regs.prefetch_ctrl, + sar_base + L2X0_PREFETCH_CTRL_OFFSET); } #else -static void save_l2x0_context(void) +static void __init save_l2x0_context(void) {} #endif diff --git a/arch/arm/mach-omap2/omap4-common.c b/arch/arm/mach-omap2/omap4-common.c index 99b0154493a4..326cd982a3cb 100644 --- a/arch/arm/mach-omap2/omap4-common.c +++ b/arch/arm/mach-omap2/omap4-common.c @@ -167,75 +167,57 @@ void __iomem *omap4_get_l2cache_base(void) return l2cache_base; } -static void omap4_l2x0_disable(void) +static void omap4_l2c310_write_sec(unsigned long val, unsigned reg) { - outer_flush_all(); - /* Disable PL310 L2 Cache controller */ - omap_smc1(0x102, 0x0); -} + unsigned smc_op; -static void omap4_l2x0_set_debug(unsigned long val) -{ - /* Program PL310 L2 Cache controller debug register */ - omap_smc1(0x100, val); + switch (reg) { + case L2X0_CTRL: + smc_op = OMAP4_MON_L2X0_CTRL_INDEX; + break; + + case L2X0_AUX_CTRL: + smc_op = OMAP4_MON_L2X0_AUXCTRL_INDEX; + break; + + case L2X0_DEBUG_CTRL: + smc_op = OMAP4_MON_L2X0_DBG_CTRL_INDEX; + break; + + case L310_PREFETCH_CTRL: + smc_op = OMAP4_MON_L2X0_PREFETCH_INDEX; + break; + + default: + WARN_ONCE(1, "OMAP L2C310: ignoring write to reg 0x%x\n", reg); + return; + } + + omap_smc1(smc_op, val); } -static int __init omap_l2_cache_init(void) +int __init omap_l2_cache_init(void) { - u32 aux_ctrl = 0; - - /* - * To avoid code running on other OMAPs in - * multi-omap builds - */ - if (!cpu_is_omap44xx()) - return -ENODEV; + u32 aux_ctrl; /* Static mapping, never released */ l2cache_base = ioremap(OMAP44XX_L2CACHE_BASE, SZ_4K); if (WARN_ON(!l2cache_base)) return -ENOMEM; - /* - * 16-way associativity, parity disabled - * Way size - 32KB (es1.0) - * Way size - 64KB (es2.0 +) - */ - aux_ctrl = ((1 << L2X0_AUX_CTRL_ASSOCIATIVITY_SHIFT) | - (0x1 << 25) | - (0x1 << L2X0_AUX_CTRL_NS_LOCKDOWN_SHIFT) | - (0x1 << L2X0_AUX_CTRL_NS_INT_CTRL_SHIFT)); - - if (omap_rev() == OMAP4430_REV_ES1_0) { - aux_ctrl |= 0x2 << L2X0_AUX_CTRL_WAY_SIZE_SHIFT; - } else { - aux_ctrl |= ((0x3 << L2X0_AUX_CTRL_WAY_SIZE_SHIFT) | - (1 << L2X0_AUX_CTRL_SHARE_OVERRIDE_SHIFT) | - (1 << L2X0_AUX_CTRL_DATA_PREFETCH_SHIFT) | - (1 << L2X0_AUX_CTRL_INSTR_PREFETCH_SHIFT) | - (1 << L2X0_AUX_CTRL_EARLY_BRESP_SHIFT)); - } - if (omap_rev() != OMAP4430_REV_ES1_0) - omap_smc1(0x109, aux_ctrl); - - /* Enable PL310 L2 Cache controller */ - omap_smc1(0x102, 0x1); + /* 16-way associativity, parity disabled, way size - 64KB (es2.0 +) */ + aux_ctrl = L2C_AUX_CTRL_SHARED_OVERRIDE | + L310_AUX_CTRL_DATA_PREFETCH | + L310_AUX_CTRL_INSTR_PREFETCH; + outer_cache.write_sec = omap4_l2c310_write_sec; if (of_have_populated_dt()) - l2x0_of_init(aux_ctrl, L2X0_AUX_CTRL_MASK); + l2x0_of_init(aux_ctrl, 0xcf9fffff); else - l2x0_init(l2cache_base, aux_ctrl, L2X0_AUX_CTRL_MASK); - - /* - * Override default outer_cache.disable with a OMAP4 - * specific one - */ - outer_cache.disable = omap4_l2x0_disable; - outer_cache.set_debug = omap4_l2x0_set_debug; + l2x0_init(l2cache_base, aux_ctrl, 0xcf9fffff); return 0; } -omap_early_initcall(omap_l2_cache_init); #endif void __iomem *omap4_get_sar_ram_base(void) diff --git a/arch/arm/mach-orion5x/common.c b/arch/arm/mach-orion5x/common.c index 3f1de1111e0f..6bbb7b55c6d1 100644 --- a/arch/arm/mach-orion5x/common.c +++ b/arch/arm/mach-orion5x/common.c @@ -365,8 +365,7 @@ void orion5x_restart(enum reboot_mode mode, const char *cmd) * Many orion-based systems have buggy bootloader implementations. * This is a common fixup for bogus memory tags. */ -void __init tag_fixup_mem32(struct tag *t, char **from, - struct meminfo *meminfo) +void __init tag_fixup_mem32(struct tag *t, char **from) { for (; t->hdr.size; t = tag_next(t)) if (t->hdr.tag == ATAG_MEM && diff --git a/arch/arm/mach-orion5x/common.h b/arch/arm/mach-orion5x/common.h index 26d6f34b6027..cd0389c6e822 100644 --- a/arch/arm/mach-orion5x/common.h +++ b/arch/arm/mach-orion5x/common.h @@ -64,9 +64,8 @@ int orion5x_pci_sys_setup(int nr, struct pci_sys_data *sys); struct pci_bus *orion5x_pci_sys_scan_bus(int nr, struct pci_sys_data *sys); int orion5x_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin); -struct meminfo; struct tag; -extern void __init tag_fixup_mem32(struct tag *, char **, struct meminfo *); +extern void __init tag_fixup_mem32(struct tag *, char **); #ifdef CONFIG_MACH_MSS2_DT extern void mss2_init(void); diff --git a/arch/arm/mach-prima2/Makefile b/arch/arm/mach-prima2/Makefile index 7a6b4a323125..8846e7d87ea5 100644 --- a/arch/arm/mach-prima2/Makefile +++ b/arch/arm/mach-prima2/Makefile @@ -2,7 +2,6 @@ obj-y += rstc.o obj-y += common.o obj-y += rtciobrg.o obj-$(CONFIG_DEBUG_LL) += lluart.o -obj-$(CONFIG_CACHE_L2X0) += l2x0.o obj-$(CONFIG_SUSPEND) += pm.o sleep.o obj-$(CONFIG_SMP) += platsmp.o headsmp.o obj-$(CONFIG_HOTPLUG_CPU) += hotplug.o diff --git a/arch/arm/mach-prima2/common.c b/arch/arm/mach-prima2/common.c index 47c7819edb9b..a860ea27e8ae 100644 --- a/arch/arm/mach-prima2/common.c +++ b/arch/arm/mach-prima2/common.c @@ -34,6 +34,8 @@ static const char *atlas6_dt_match[] __initconst = { DT_MACHINE_START(ATLAS6_DT, "Generic ATLAS6 (Flattened Device Tree)") /* Maintainer: Barry Song <baohua.song@csr.com> */ + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .map_io = sirfsoc_map_io, .init_late = sirfsoc_init_late, .dt_compat = atlas6_dt_match, @@ -48,6 +50,8 @@ static const char *prima2_dt_match[] __initconst = { DT_MACHINE_START(PRIMA2_DT, "Generic PRIMA2 (Flattened Device Tree)") /* Maintainer: Barry Song <baohua.song@csr.com> */ + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .map_io = sirfsoc_map_io, .dma_zone_size = SZ_256M, .init_late = sirfsoc_init_late, @@ -63,6 +67,8 @@ static const char *marco_dt_match[] __initconst = { DT_MACHINE_START(MARCO_DT, "Generic MARCO (Flattened Device Tree)") /* Maintainer: Barry Song <baohua.song@csr.com> */ + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .smp = smp_ops(sirfsoc_smp_ops), .map_io = sirfsoc_map_io, .init_late = sirfsoc_init_late, diff --git a/arch/arm/mach-prima2/l2x0.c b/arch/arm/mach-prima2/l2x0.c deleted file mode 100644 index c7102539c0b0..000000000000 --- a/arch/arm/mach-prima2/l2x0.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * l2 cache initialization for CSR SiRFprimaII - * - * Copyright (c) 2011 Cambridge Silicon Radio Limited, a CSR plc group company. - * - * Licensed under GPLv2 or later. - */ - -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/of.h> -#include <asm/hardware/cache-l2x0.h> - -struct l2x0_aux { - u32 val; - u32 mask; -}; - -static const struct l2x0_aux prima2_l2x0_aux __initconst = { - .val = 2 << L2X0_AUX_CTRL_WAY_SIZE_SHIFT, - .mask = 0, -}; - -static const struct l2x0_aux marco_l2x0_aux __initconst = { - .val = (2 << L2X0_AUX_CTRL_WAY_SIZE_SHIFT) | - (1 << L2X0_AUX_CTRL_ASSOCIATIVITY_SHIFT), - .mask = L2X0_AUX_CTRL_MASK, -}; - -static const struct of_device_id sirf_l2x0_ids[] __initconst = { - { .compatible = "sirf,prima2-pl310-cache", .data = &prima2_l2x0_aux, }, - { .compatible = "sirf,marco-pl310-cache", .data = &marco_l2x0_aux, }, - {}, -}; - -static int __init sirfsoc_l2x0_init(void) -{ - struct device_node *np; - const struct l2x0_aux *aux; - - np = of_find_matching_node(NULL, sirf_l2x0_ids); - if (np) { - aux = of_match_node(sirf_l2x0_ids, np)->data; - return l2x0_of_init(aux->val, aux->mask); - } - - return 0; -} -early_initcall(sirfsoc_l2x0_init); diff --git a/arch/arm/mach-prima2/pm.c b/arch/arm/mach-prima2/pm.c index c4525a88e5da..96e9bc102117 100644 --- a/arch/arm/mach-prima2/pm.c +++ b/arch/arm/mach-prima2/pm.c @@ -71,7 +71,6 @@ static int sirfsoc_pm_enter(suspend_state_t state) case PM_SUSPEND_MEM: sirfsoc_pre_suspend_power_off(); - outer_flush_all(); outer_disable(); /* go zzz */ cpu_suspend(0, sirfsoc_finish_suspend); diff --git a/arch/arm/mach-pxa/cm-x300.c b/arch/arm/mach-pxa/cm-x300.c index 584439bfa59f..4d3588d26c2a 100644 --- a/arch/arm/mach-pxa/cm-x300.c +++ b/arch/arm/mach-pxa/cm-x300.c @@ -837,8 +837,7 @@ static void __init cm_x300_init(void) cm_x300_init_bl(); } -static void __init cm_x300_fixup(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init cm_x300_fixup(struct tag *tags, char **cmdline) { /* Make sure that mi->bank[0].start = PHYS_ADDR */ for (; tags->hdr.size; tags = tag_next(tags)) diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c index 57d60542f982..91dd1c7cdbcd 100644 --- a/arch/arm/mach-pxa/corgi.c +++ b/arch/arm/mach-pxa/corgi.c @@ -34,6 +34,7 @@ #include <linux/input/matrix_keypad.h> #include <linux/gpio_keys.h> #include <linux/module.h> +#include <linux/memblock.h> #include <video/w100fb.h> #include <asm/setup.h> @@ -753,16 +754,13 @@ static void __init corgi_init(void) platform_add_devices(devices, ARRAY_SIZE(devices)); } -static void __init fixup_corgi(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init fixup_corgi(struct tag *tags, char **cmdline) { sharpsl_save_param(); - mi->nr_banks=1; - mi->bank[0].start = 0xa0000000; if (machine_is_corgi()) - mi->bank[0].size = (32*1024*1024); + memblock_add(0xa0000000, SZ_32M); else - mi->bank[0].size = (64*1024*1024); + memblock_add(0xa0000000, SZ_64M); } #ifdef CONFIG_MACH_CORGI diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c index 8280ebcaab9f..cfb864173ce3 100644 --- a/arch/arm/mach-pxa/eseries.c +++ b/arch/arm/mach-pxa/eseries.c @@ -21,6 +21,7 @@ #include <linux/mtd/nand.h> #include <linux/mtd/partitions.h> #include <linux/usb/gpio_vbus.h> +#include <linux/memblock.h> #include <video/w100fb.h> @@ -41,14 +42,12 @@ #include "clock.h" /* Only e800 has 128MB RAM */ -void __init eseries_fixup(struct tag *tags, char **cmdline, struct meminfo *mi) +void __init eseries_fixup(struct tag *tags, char **cmdline) { - mi->nr_banks=1; - mi->bank[0].start = 0xa0000000; if (machine_is_e800()) - mi->bank[0].size = (128*1024*1024); + memblock_add(0xa0000000, SZ_128M); else - mi->bank[0].size = (64*1024*1024); + memblock_add(0xa0000000, SZ_64M); } struct gpio_vbus_mach_info e7xx_udc_info = { diff --git a/arch/arm/mach-pxa/poodle.c b/arch/arm/mach-pxa/poodle.c index aedf053a1de5..131991629116 100644 --- a/arch/arm/mach-pxa/poodle.c +++ b/arch/arm/mach-pxa/poodle.c @@ -29,6 +29,7 @@ #include <linux/spi/ads7846.h> #include <linux/spi/pxa2xx_spi.h> #include <linux/mtd/sharpsl.h> +#include <linux/memblock.h> #include <mach/hardware.h> #include <asm/mach-types.h> @@ -456,13 +457,10 @@ static void __init poodle_init(void) poodle_init_spi(); } -static void __init fixup_poodle(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init fixup_poodle(struct tag *tags, char **cmdline) { sharpsl_save_param(); - mi->nr_banks=1; - mi->bank[0].start = 0xa0000000; - mi->bank[0].size = (32*1024*1024); + memblock_add(0xa0000000, SZ_32M); } MACHINE_START(POODLE, "SHARP Poodle") diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index 0b11c1af51c4..840c3a48e720 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -32,6 +32,7 @@ #include <linux/io.h> #include <linux/module.h> #include <linux/reboot.h> +#include <linux/memblock.h> #include <asm/setup.h> #include <asm/mach-types.h> @@ -971,13 +972,10 @@ static void __init spitz_init(void) spitz_i2c_init(); } -static void __init spitz_fixup(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init spitz_fixup(struct tag *tags, char **cmdline) { sharpsl_save_param(); - mi->nr_banks = 1; - mi->bank[0].start = 0xa0000000; - mi->bank[0].size = (64*1024*1024); + memblock_add(0xa0000000, SZ_64M); } #ifdef CONFIG_MACH_SPITZ diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c index ef5557b807ed..c158a6e3e0aa 100644 --- a/arch/arm/mach-pxa/tosa.c +++ b/arch/arm/mach-pxa/tosa.c @@ -37,6 +37,7 @@ #include <linux/i2c/pxa-i2c.h> #include <linux/usb/gpio_vbus.h> #include <linux/reboot.h> +#include <linux/memblock.h> #include <asm/setup.h> #include <asm/mach-types.h> @@ -960,13 +961,10 @@ static void __init tosa_init(void) platform_add_devices(devices, ARRAY_SIZE(devices)); } -static void __init fixup_tosa(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init fixup_tosa(struct tag *tags, char **cmdline) { sharpsl_save_param(); - mi->nr_banks=1; - mi->bank[0].start = 0xa0000000; - mi->bank[0].size = (64*1024*1024); + memblock_add(0xa0000000, SZ_64M); } MACHINE_START(TOSA, "SHARP Tosa") diff --git a/arch/arm/mach-realview/core.c b/arch/arm/mach-realview/core.c index 960b8dd78c44..8c1b39a0caa0 100644 --- a/arch/arm/mach-realview/core.c +++ b/arch/arm/mach-realview/core.c @@ -31,6 +31,7 @@ #include <linux/amba/mmci.h> #include <linux/gfp.h> #include <linux/mtd/physmap.h> +#include <linux/memblock.h> #include <mach/hardware.h> #include <asm/irq.h> @@ -385,19 +386,15 @@ void __init realview_timer_init(unsigned int timer_irq) /* * Setup the memory banks. */ -void realview_fixup(struct tag *tags, char **from, struct meminfo *meminfo) +void realview_fixup(struct tag *tags, char **from) { /* * Most RealView platforms have 512MB contiguous RAM at 0x70000000. * Half of this is mirrored at 0. */ #ifdef CONFIG_REALVIEW_HIGH_PHYS_OFFSET - meminfo->bank[0].start = 0x70000000; - meminfo->bank[0].size = SZ_512M; - meminfo->nr_banks = 1; + memblock_add(0x70000000, SZ_512M); #else - meminfo->bank[0].start = 0; - meminfo->bank[0].size = SZ_256M; - meminfo->nr_banks = 1; + memblock_add(0, SZ_256M); #endif } diff --git a/arch/arm/mach-realview/core.h b/arch/arm/mach-realview/core.h index 13dc830ef469..868ece221978 100644 --- a/arch/arm/mach-realview/core.h +++ b/arch/arm/mach-realview/core.h @@ -52,8 +52,7 @@ extern int realview_flash_register(struct resource *res, u32 num); extern int realview_eth_register(const char *name, struct resource *res); extern int realview_usb_register(struct resource *res); extern void realview_init_early(void); -extern void realview_fixup(struct tag *tags, char **from, - struct meminfo *meminfo); +extern void realview_fixup(struct tag *tags, char **from); extern struct smp_operations realview_smp_ops; extern void realview_cpu_die(unsigned int cpu); diff --git a/arch/arm/mach-realview/realview_eb.c b/arch/arm/mach-realview/realview_eb.c index 6bb070e80128..739d4f113097 100644 --- a/arch/arm/mach-realview/realview_eb.c +++ b/arch/arm/mach-realview/realview_eb.c @@ -442,8 +442,13 @@ static void __init realview_eb_init(void) realview_eb11mp_fixup(); #ifdef CONFIG_CACHE_L2X0 - /* 1MB (128KB/way), 8-way associativity, evmon/parity/share enabled - * Bits: .... ...0 0111 1001 0000 .... .... .... */ + /* + * The PL220 needs to be manually configured as the hardware + * doesn't report the correct sizes. + * 1MB (128KB/way), 8-way associativity, event monitor and + * parity enabled, ignore share bit, no force write allocate + * Bits: .... ...0 0111 1001 0000 .... .... .... + */ l2x0_init(__io_address(REALVIEW_EB11MP_L220_BASE), 0x00790000, 0xfe000fff); #endif platform_device_register(&pmu_device); diff --git a/arch/arm/mach-realview/realview_pb1176.c b/arch/arm/mach-realview/realview_pb1176.c index 173f2c15de49..b0e0dcaed944 100644 --- a/arch/arm/mach-realview/realview_pb1176.c +++ b/arch/arm/mach-realview/realview_pb1176.c @@ -32,6 +32,7 @@ #include <linux/irqchip/arm-gic.h> #include <linux/platform_data/clk-realview.h> #include <linux/reboot.h> +#include <linux/memblock.h> #include <mach/hardware.h> #include <asm/irq.h> @@ -339,15 +340,12 @@ static void realview_pb1176_restart(enum reboot_mode mode, const char *cmd) dsb(); } -static void realview_pb1176_fixup(struct tag *tags, char **from, - struct meminfo *meminfo) +static void realview_pb1176_fixup(struct tag *tags, char **from) { /* * RealView PB1176 only has 128MB of RAM mapped at 0. */ - meminfo->bank[0].start = 0; - meminfo->bank[0].size = SZ_128M; - meminfo->nr_banks = 1; + memblock_add(0, SZ_128M); } static void __init realview_pb1176_init(void) @@ -355,7 +353,13 @@ static void __init realview_pb1176_init(void) int i; #ifdef CONFIG_CACHE_L2X0 - /* 128Kb (16Kb/way) 8-way associativity. evmon/parity/share enabled. */ + /* + * The PL220 needs to be manually configured as the hardware + * doesn't report the correct sizes. + * 128kB (16kB/way), 8-way associativity, event monitor and + * parity enabled, ignore share bit, no force write allocate + * Bits: .... ...0 0111 0011 0000 .... .... .... + */ l2x0_init(__io_address(REALVIEW_PB1176_L220_BASE), 0x00730000, 0xfe000fff); #endif diff --git a/arch/arm/mach-realview/realview_pb11mp.c b/arch/arm/mach-realview/realview_pb11mp.c index bde7e6b1fd44..47bf55fdbf27 100644 --- a/arch/arm/mach-realview/realview_pb11mp.c +++ b/arch/arm/mach-realview/realview_pb11mp.c @@ -337,8 +337,13 @@ static void __init realview_pb11mp_init(void) int i; #ifdef CONFIG_CACHE_L2X0 - /* 1MB (128KB/way), 8-way associativity, evmon/parity/share enabled - * Bits: .... ...0 0111 1001 0000 .... .... .... */ + /* + * The PL220 needs to be manually configured as the hardware + * doesn't report the correct sizes. + * 1MB (128KB/way), 8-way associativity, event monitor and + * parity enabled, ignore share bit, no force write allocate + * Bits: .... ...0 0111 1001 0000 .... .... .... + */ l2x0_init(__io_address(REALVIEW_TC11MP_L220_BASE), 0x00790000, 0xfe000fff); #endif diff --git a/arch/arm/mach-realview/realview_pbx.c b/arch/arm/mach-realview/realview_pbx.c index 72c96caebefa..d89eb4023467 100644 --- a/arch/arm/mach-realview/realview_pbx.c +++ b/arch/arm/mach-realview/realview_pbx.c @@ -29,6 +29,7 @@ #include <linux/irqchip/arm-gic.h> #include <linux/platform_data/clk-realview.h> #include <linux/reboot.h> +#include <linux/memblock.h> #include <asm/irq.h> #include <asm/mach-types.h> @@ -325,23 +326,19 @@ static void __init realview_pbx_timer_init(void) realview_pbx_twd_init(); } -static void realview_pbx_fixup(struct tag *tags, char **from, - struct meminfo *meminfo) +static void realview_pbx_fixup(struct tag *tags, char **from) { #ifdef CONFIG_SPARSEMEM /* * Memory configuration with SPARSEMEM enabled on RealView PBX (see * asm/mach/memory.h for more information). */ - meminfo->bank[0].start = 0; - meminfo->bank[0].size = SZ_256M; - meminfo->bank[1].start = 0x20000000; - meminfo->bank[1].size = SZ_512M; - meminfo->bank[2].start = 0x80000000; - meminfo->bank[2].size = SZ_256M; - meminfo->nr_banks = 3; + + memblock_add(0, SZ_256M); + memblock_add(0x20000000, SZ_512M); + memblock_add(0x80000000, SZ_256M); #else - realview_fixup(tags, from, meminfo); + realview_fixup(tags, from); #endif } @@ -370,8 +367,8 @@ static void __init realview_pbx_init(void) __io_address(REALVIEW_PBX_TILE_L220_BASE); /* set RAM latencies to 1 cycle for eASIC */ - writel(0, l2x0_base + L2X0_TAG_LATENCY_CTRL); - writel(0, l2x0_base + L2X0_DATA_LATENCY_CTRL); + writel(0, l2x0_base + L310_TAG_LATENCY_CTRL); + writel(0, l2x0_base + L310_DATA_LATENCY_CTRL); /* 16KB way size, 8-way associativity, parity disabled * Bits: .. 0 0 0 0 1 00 1 0 1 001 0 000 0 .... .... .... */ diff --git a/arch/arm/mach-rockchip/rockchip.c b/arch/arm/mach-rockchip/rockchip.c index 4499b0a31a27..968cc348e624 100644 --- a/arch/arm/mach-rockchip/rockchip.c +++ b/arch/arm/mach-rockchip/rockchip.c @@ -24,12 +24,6 @@ #include <asm/hardware/cache-l2x0.h> #include "core.h" -static void __init rockchip_dt_init(void) -{ - l2x0_of_init(0, ~0UL); - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - static const char * const rockchip_board_dt_compat[] = { "rockchip,rk2928", "rockchip,rk3066a", @@ -39,6 +33,7 @@ static const char * const rockchip_board_dt_compat[] = { }; DT_MACHINE_START(ROCKCHIP_DT, "Rockchip Cortex-A9 (Device Tree)") - .init_machine = rockchip_dt_init, + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .dt_compat = rockchip_board_dt_compat, MACHINE_END diff --git a/arch/arm/mach-s3c24xx/mach-smdk2413.c b/arch/arm/mach-s3c24xx/mach-smdk2413.c index a38f8a049e22..fb3b80e44595 100644 --- a/arch/arm/mach-s3c24xx/mach-smdk2413.c +++ b/arch/arm/mach-s3c24xx/mach-smdk2413.c @@ -22,6 +22,7 @@ #include <linux/serial_s3c.h> #include <linux/platform_device.h> #include <linux/io.h> +#include <linux/memblock.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> @@ -93,13 +94,10 @@ static struct platform_device *smdk2413_devices[] __initdata = { &s3c2412_device_dma, }; -static void __init smdk2413_fixup(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init smdk2413_fixup(struct tag *tags, char **cmdline) { if (tags != phys_to_virt(S3C2410_SDRAM_PA + 0x100)) { - mi->nr_banks=1; - mi->bank[0].start = 0x30000000; - mi->bank[0].size = SZ_64M; + memblock_add(0x30000000, SZ_64M); } } diff --git a/arch/arm/mach-s3c24xx/mach-vstms.c b/arch/arm/mach-s3c24xx/mach-vstms.c index 6b706c915387..9104c2be36c9 100644 --- a/arch/arm/mach-s3c24xx/mach-vstms.c +++ b/arch/arm/mach-s3c24xx/mach-vstms.c @@ -23,6 +23,7 @@ #include <linux/mtd/nand.h> #include <linux/mtd/nand_ecc.h> #include <linux/mtd/partitions.h> +#include <linux/memblock.h> #include <asm/mach/arch.h> #include <asm/mach/map.h> @@ -129,13 +130,10 @@ static struct platform_device *vstms_devices[] __initdata = { &s3c2412_device_dma, }; -static void __init vstms_fixup(struct tag *tags, char **cmdline, - struct meminfo *mi) +static void __init vstms_fixup(struct tag *tags, char **cmdline) { if (tags != phys_to_virt(S3C2410_SDRAM_PA + 0x100)) { - mi->nr_banks=1; - mi->bank[0].start = 0x30000000; - mi->bank[0].size = SZ_64M; + memblock_add(0x30000000, SZ_64M); } } diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c index 8443a27bca2f..7dd894ece9ae 100644 --- a/arch/arm/mach-sa1100/assabet.c +++ b/arch/arm/mach-sa1100/assabet.c @@ -531,7 +531,7 @@ static void __init get_assabet_scr(void) } static void __init -fixup_assabet(struct tag *tags, char **cmdline, struct meminfo *mi) +fixup_assabet(struct tag *tags, char **cmdline) { /* This must be done before any call to machine_has_neponset() */ map_sa1100_gpio_regs(); diff --git a/arch/arm/mach-shmobile/board-armadillo800eva-reference.c b/arch/arm/mach-shmobile/board-armadillo800eva-reference.c index 57d246eb8813..f660fbb96e0b 100644 --- a/arch/arm/mach-shmobile/board-armadillo800eva-reference.c +++ b/arch/arm/mach-shmobile/board-armadillo800eva-reference.c @@ -164,8 +164,8 @@ static void __init eva_init(void) r8a7740_meram_workaround(); #ifdef CONFIG_CACHE_L2X0 - /* Early BRESP enable, Shared attribute override enable, 32K*8way */ - l2x0_init(IOMEM(0xf0002000), 0x40440000, 0x82000fff); + /* Shared attribute override enable, 32K*8way */ + l2x0_init(IOMEM(0xf0002000), 0x00400000, 0xc20f0fff); #endif r8a7740_add_standard_devices_dt(); diff --git a/arch/arm/mach-shmobile/board-armadillo800eva.c b/arch/arm/mach-shmobile/board-armadillo800eva.c index bc2cf7a89534..01f81100c330 100644 --- a/arch/arm/mach-shmobile/board-armadillo800eva.c +++ b/arch/arm/mach-shmobile/board-armadillo800eva.c @@ -1271,8 +1271,8 @@ static void __init eva_init(void) #ifdef CONFIG_CACHE_L2X0 - /* Early BRESP enable, Shared attribute override enable, 32K*8way */ - l2x0_init(IOMEM(0xf0002000), 0x40440000, 0x82000fff); + /* Shared attribute override enable, 32K*8way */ + l2x0_init(IOMEM(0xf0002000), 0x00400000, 0xc20f0fff); #endif i2c_register_board_info(0, i2c0_devices, ARRAY_SIZE(i2c0_devices)); diff --git a/arch/arm/mach-shmobile/board-kzm9g-reference.c b/arch/arm/mach-shmobile/board-kzm9g-reference.c index 598e32488410..a735a1d80c28 100644 --- a/arch/arm/mach-shmobile/board-kzm9g-reference.c +++ b/arch/arm/mach-shmobile/board-kzm9g-reference.c @@ -36,8 +36,8 @@ static void __init kzm_init(void) sh73a0_add_standard_devices_dt(); #ifdef CONFIG_CACHE_L2X0 - /* Early BRESP enable, Shared attribute override enable, 64K*8way */ - l2x0_init(IOMEM(0xf0100000), 0x40460000, 0x82000fff); + /* Shared attribute override enable, 64K*8way */ + l2x0_init(IOMEM(0xf0100000), 0x00400000, 0xc20f0fff); #endif } diff --git a/arch/arm/mach-shmobile/board-kzm9g.c b/arch/arm/mach-shmobile/board-kzm9g.c index 03dc3ac84502..f94ec8ca42c1 100644 --- a/arch/arm/mach-shmobile/board-kzm9g.c +++ b/arch/arm/mach-shmobile/board-kzm9g.c @@ -876,8 +876,8 @@ static void __init kzm_init(void) gpio_request_one(223, GPIOF_IN, NULL); /* IRQ8 */ #ifdef CONFIG_CACHE_L2X0 - /* Early BRESP enable, Shared attribute override enable, 64K*8way */ - l2x0_init(IOMEM(0xf0100000), 0x40460000, 0x82000fff); + /* Shared attribute override enable, 64K*8way */ + l2x0_init(IOMEM(0xf0100000), 0x00400000, 0xc20f0fff); #endif i2c_register_board_info(0, i2c0_devices, ARRAY_SIZE(i2c0_devices)); diff --git a/arch/arm/mach-shmobile/setup-r8a7778.c b/arch/arm/mach-shmobile/setup-r8a7778.c index 8c02e24f2483..d311ef903b39 100644 --- a/arch/arm/mach-shmobile/setup-r8a7778.c +++ b/arch/arm/mach-shmobile/setup-r8a7778.c @@ -285,10 +285,10 @@ void __init r8a7778_add_dt_devices(void) void __iomem *base = ioremap_nocache(0xf0100000, 0x1000); if (base) { /* - * Early BRESP enable, Shared attribute override enable, 64K*16way + * Shared attribute override enable, 64K*16way * don't call iounmap(base) */ - l2x0_init(base, 0x40470000, 0x82000fff); + l2x0_init(base, 0x00400000, 0xc20f0fff); } #endif diff --git a/arch/arm/mach-shmobile/setup-r8a7779.c b/arch/arm/mach-shmobile/setup-r8a7779.c index d197b5adc886..aba4ed652d54 100644 --- a/arch/arm/mach-shmobile/setup-r8a7779.c +++ b/arch/arm/mach-shmobile/setup-r8a7779.c @@ -660,8 +660,8 @@ static struct platform_device *r8a7779_standard_devices[] __initdata = { void __init r8a7779_add_standard_devices(void) { #ifdef CONFIG_CACHE_L2X0 - /* Early BRESP enable, Shared attribute override enable, 64K*16way */ - l2x0_init(IOMEM(0xf0100000), 0x40470000, 0x82000fff); + /* Shared attribute override enable, 64K*16way */ + l2x0_init(IOMEM(0xf0100000), 0x00400000, 0xc20f0fff); #endif r8a7779_pm_init(); diff --git a/arch/arm/mach-socfpga/socfpga.c b/arch/arm/mach-socfpga/socfpga.c index d86231e11b34..adbf38314ca8 100644 --- a/arch/arm/mach-socfpga/socfpga.c +++ b/arch/arm/mach-socfpga/socfpga.c @@ -98,22 +98,17 @@ static void socfpga_cyclone5_restart(enum reboot_mode mode, const char *cmd) writel(temp, rst_manager_base_addr + SOCFPGA_RSTMGR_CTRL); } -static void __init socfpga_cyclone5_init(void) -{ - l2x0_of_init(0, ~0UL); - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - static const char *altera_dt_match[] = { "altr,socfpga", NULL }; DT_MACHINE_START(SOCFPGA, "Altera SOCFPGA") + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .smp = smp_ops(socfpga_smp_ops), .map_io = socfpga_map_io, .init_irq = socfpga_init_irq, - .init_machine = socfpga_cyclone5_init, .restart = socfpga_cyclone5_restart, .dt_compat = altera_dt_match, MACHINE_END diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c index c19751fff2c6..fd4297713d67 100644 --- a/arch/arm/mach-spear/platsmp.c +++ b/arch/arm/mach-spear/platsmp.c @@ -20,6 +20,18 @@ #include <mach/spear.h> #include "generic.h" +/* + * Write pen_release in a way that is guaranteed to be visible to all + * observers, irrespective of whether they're taking part in coherency + * or not. This is necessary for the hotplug code to work reliably. + */ +static void write_pen_release(int val) +{ + pen_release = val; + smp_wmb(); + sync_cache_w(&pen_release); +} + static DEFINE_SPINLOCK(boot_lock); static void __iomem *scu_base = IOMEM(VA_SCU_BASE); @@ -30,8 +42,7 @@ static void spear13xx_secondary_init(unsigned int cpu) * let the primary processor know we're out of the * pen, then head off into the C entry point */ - pen_release = -1; - smp_wmb(); + write_pen_release(-1); /* * Synchronise with the boot thread. @@ -58,9 +69,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) * Note that "pen_release" is the hardware CPU ID, whereas * "cpu" is Linux's internal ID. */ - pen_release = cpu; - flush_cache_all(); - outer_flush_all(); + write_pen_release(cpu); timeout = jiffies + (1 * HZ); while (time_before(jiffies, timeout)) { diff --git a/arch/arm/mach-spear/spear13xx.c b/arch/arm/mach-spear/spear13xx.c index 7aa6e8cf830f..c9897ea38980 100644 --- a/arch/arm/mach-spear/spear13xx.c +++ b/arch/arm/mach-spear/spear13xx.c @@ -38,15 +38,15 @@ void __init spear13xx_l2x0_init(void) if (!IS_ENABLED(CONFIG_CACHE_L2X0)) return; - writel_relaxed(0x06, VA_L2CC_BASE + L2X0_PREFETCH_CTRL); + writel_relaxed(0x06, VA_L2CC_BASE + L310_PREFETCH_CTRL); /* * Program following latencies in order to make * SPEAr1340 work at 600 MHz */ - writel_relaxed(0x221, VA_L2CC_BASE + L2X0_TAG_LATENCY_CTRL); - writel_relaxed(0x441, VA_L2CC_BASE + L2X0_DATA_LATENCY_CTRL); - l2x0_init(VA_L2CC_BASE, 0x70A60001, 0xfe00ffff); + writel_relaxed(0x221, VA_L2CC_BASE + L310_TAG_LATENCY_CTRL); + writel_relaxed(0x441, VA_L2CC_BASE + L310_DATA_LATENCY_CTRL); + l2x0_init(VA_L2CC_BASE, 0x30a00001, 0xfe0fffff); } /* diff --git a/arch/arm/mach-sti/board-dt.c b/arch/arm/mach-sti/board-dt.c index df731f2322fa..3cf6ef8d4317 100644 --- a/arch/arm/mach-sti/board-dt.c +++ b/arch/arm/mach-sti/board-dt.c @@ -14,25 +14,6 @@ #include "smp.h" -void __init stih41x_l2x0_init(void) -{ - u32 way_size = 0x4; - u32 aux_ctrl; - /* may be this can be encoded in macros like BIT*() */ - aux_ctrl = (0x1 << L2X0_AUX_CTRL_SHARE_OVERRIDE_SHIFT) | - (0x1 << L2X0_AUX_CTRL_DATA_PREFETCH_SHIFT) | - (0x1 << L2X0_AUX_CTRL_INSTR_PREFETCH_SHIFT) | - (way_size << L2X0_AUX_CTRL_WAY_SIZE_SHIFT); - - l2x0_of_init(aux_ctrl, L2X0_AUX_CTRL_MASK); -} - -static void __init stih41x_machine_init(void) -{ - stih41x_l2x0_init(); - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - static const char *stih41x_dt_match[] __initdata = { "st,stih415", "st,stih416", @@ -41,7 +22,11 @@ static const char *stih41x_dt_match[] __initdata = { }; DT_MACHINE_START(STM, "STiH415/416 SoC with Flattened Device Tree") - .init_machine = stih41x_machine_init, - .smp = smp_ops(sti_smp_ops), .dt_compat = stih41x_dt_match, + .l2c_aux_val = L2C_AUX_CTRL_SHARED_OVERRIDE | + L310_AUX_CTRL_DATA_PREFETCH | + L310_AUX_CTRL_INSTR_PREFETCH | + L2C_AUX_CTRL_WAY_SIZE(4), + .l2c_aux_mask = 0xc0000fff, + .smp = smp_ops(sti_smp_ops), MACHINE_END diff --git a/arch/arm/mach-tegra/pm.h b/arch/arm/mach-tegra/pm.h index 6e92a7c2ecbd..f4a89698e5b0 100644 --- a/arch/arm/mach-tegra/pm.h +++ b/arch/arm/mach-tegra/pm.h @@ -35,8 +35,6 @@ void tegra20_sleep_core_init(void); void tegra30_lp1_iram_hook(void); void tegra30_sleep_core_init(void); -extern unsigned long l2x0_saved_regs_addr; - void tegra_clear_cpu_in_lp2(void); bool tegra_set_cpu_in_lp2(void); diff --git a/arch/arm/mach-tegra/reset-handler.S b/arch/arm/mach-tegra/reset-handler.S index 8c1ba4fea384..578d4d1ad648 100644 --- a/arch/arm/mach-tegra/reset-handler.S +++ b/arch/arm/mach-tegra/reset-handler.S @@ -19,7 +19,6 @@ #include <asm/cache.h> #include <asm/asm-offsets.h> -#include <asm/hardware/cache-l2x0.h> #include "flowctrl.h" #include "fuse.h" @@ -78,8 +77,10 @@ ENTRY(tegra_resume) str r1, [r0] #endif +#ifdef CONFIG_CACHE_L2X0 /* L2 cache resume & re-enable */ - l2_cache_resume r0, r1, r2, l2x0_saved_regs_addr + bl l2c310_early_resume +#endif end_ca9_scu_l2_resume: mov32 r9, 0xc0f cmp r8, r9 @@ -89,12 +90,6 @@ end_ca9_scu_l2_resume: ENDPROC(tegra_resume) #endif -#ifdef CONFIG_CACHE_L2X0 - .globl l2x0_saved_regs_addr -l2x0_saved_regs_addr: - .long 0 -#endif - .align L1_CACHE_SHIFT ENTRY(__tegra_cpu_reset_handler_start) diff --git a/arch/arm/mach-tegra/sleep.h b/arch/arm/mach-tegra/sleep.h index a4edbb3abd3d..339fe42cd6fb 100644 --- a/arch/arm/mach-tegra/sleep.h +++ b/arch/arm/mach-tegra/sleep.h @@ -120,37 +120,6 @@ mov \tmp1, \tmp1, lsr #8 .endm -/* Macro to resume & re-enable L2 cache */ -#ifndef L2X0_CTRL_EN -#define L2X0_CTRL_EN 1 -#endif - -#ifdef CONFIG_CACHE_L2X0 -.macro l2_cache_resume, tmp1, tmp2, tmp3, phys_l2x0_saved_regs - W(adr) \tmp1, \phys_l2x0_saved_regs - ldr \tmp1, [\tmp1] - ldr \tmp2, [\tmp1, #L2X0_R_PHY_BASE] - ldr \tmp3, [\tmp2, #L2X0_CTRL] - tst \tmp3, #L2X0_CTRL_EN - bne exit_l2_resume - ldr \tmp3, [\tmp1, #L2X0_R_TAG_LATENCY] - str \tmp3, [\tmp2, #L2X0_TAG_LATENCY_CTRL] - ldr \tmp3, [\tmp1, #L2X0_R_DATA_LATENCY] - str \tmp3, [\tmp2, #L2X0_DATA_LATENCY_CTRL] - ldr \tmp3, [\tmp1, #L2X0_R_PREFETCH_CTRL] - str \tmp3, [\tmp2, #L2X0_PREFETCH_CTRL] - ldr \tmp3, [\tmp1, #L2X0_R_PWR_CTRL] - str \tmp3, [\tmp2, #L2X0_POWER_CTRL] - ldr \tmp3, [\tmp1, #L2X0_R_AUX_CTRL] - str \tmp3, [\tmp2, #L2X0_AUX_CTRL] - mov \tmp3, #L2X0_CTRL_EN - str \tmp3, [\tmp2, #L2X0_CTRL] -exit_l2_resume: -.endm -#else /* CONFIG_CACHE_L2X0 */ -.macro l2_cache_resume, tmp1, tmp2, tmp3, phys_l2x0_saved_regs -.endm -#endif /* CONFIG_CACHE_L2X0 */ #else void tegra_pen_lock(void); void tegra_pen_unlock(void); diff --git a/arch/arm/mach-tegra/tegra.c b/arch/arm/mach-tegra/tegra.c index 6191603379e1..15ac9fcc96b1 100644 --- a/arch/arm/mach-tegra/tegra.c +++ b/arch/arm/mach-tegra/tegra.c @@ -70,40 +70,12 @@ u32 tegra_uart_config[3] = { 0, }; -static void __init tegra_init_cache(void) -{ -#ifdef CONFIG_CACHE_L2X0 - static const struct of_device_id pl310_ids[] __initconst = { - { .compatible = "arm,pl310-cache", }, - {} - }; - - struct device_node *np; - int ret; - void __iomem *p = IO_ADDRESS(TEGRA_ARM_PERIF_BASE) + 0x3000; - u32 aux_ctrl, cache_type; - - np = of_find_matching_node(NULL, pl310_ids); - if (!np) - return; - - cache_type = readl(p + L2X0_CACHE_TYPE); - aux_ctrl = (cache_type & 0x700) << (17-8); - aux_ctrl |= 0x7C400001; - - ret = l2x0_of_init(aux_ctrl, 0x8200c3fe); - if (!ret) - l2x0_saved_regs_addr = virt_to_phys(&l2x0_saved_regs); -#endif -} - static void __init tegra_init_early(void) { of_register_trusted_foundations(); tegra_apb_io_init(); tegra_init_fuse(); tegra_cpu_reset_handler_init(); - tegra_init_cache(); tegra_powergate_init(); tegra_hotplug_init(); } @@ -191,8 +163,10 @@ static const char * const tegra_dt_board_compat[] = { }; DT_MACHINE_START(TEGRA_DT, "NVIDIA Tegra SoC (Flattened Device Tree)") - .map_io = tegra_map_common_io, + .l2c_aux_val = 0x3c400001, + .l2c_aux_mask = 0xc20fc3fe, .smp = smp_ops(tegra_smp_ops), + .map_io = tegra_map_common_io, .init_early = tegra_init_early, .init_irq = tegra_dt_init_irq, .init_machine = tegra_dt_init, diff --git a/arch/arm/mach-ux500/cache-l2x0.c b/arch/arm/mach-ux500/cache-l2x0.c index 264f894c0e3d..842ebedbdd1c 100644 --- a/arch/arm/mach-ux500/cache-l2x0.c +++ b/arch/arm/mach-ux500/cache-l2x0.c @@ -35,10 +35,16 @@ static int __init ux500_l2x0_unlock(void) return 0; } -static int __init ux500_l2x0_init(void) +static void ux500_l2c310_write_sec(unsigned long val, unsigned reg) { - u32 aux_val = 0x3e000000; + /* + * We can't write to secure registers as we are in non-secure + * mode, until we have some SMI service available. + */ +} +static int __init ux500_l2x0_init(void) +{ if (cpu_is_u8500_family() || cpu_is_ux540_family()) l2x0_base = __io_address(U8500_L2CC_BASE); else @@ -48,28 +54,12 @@ static int __init ux500_l2x0_init(void) /* Unlock before init */ ux500_l2x0_unlock(); - /* DBx540's L2 has 128KB way size */ - if (cpu_is_ux540_family()) - /* 128KB way size */ - aux_val |= (0x4 << L2X0_AUX_CTRL_WAY_SIZE_SHIFT); - else - /* 64KB way size */ - aux_val |= (0x3 << L2X0_AUX_CTRL_WAY_SIZE_SHIFT); + outer_cache.write_sec = ux500_l2c310_write_sec; - /* 64KB way size, 8 way associativity, force WA */ if (of_have_populated_dt()) - l2x0_of_init(aux_val, 0xc0000fff); + l2x0_of_init(0, ~0); else - l2x0_init(l2x0_base, aux_val, 0xc0000fff); - - /* - * We can't disable l2 as we are in non secure mode, currently - * this seems be called only during kexec path. So let's - * override outer.disable with nasty assignment until we have - * some SMI service available. - */ - outer_cache.disable = NULL; - outer_cache.set_debug = NULL; + l2x0_init(l2x0_base, 0, ~0); return 0; } diff --git a/arch/arm/mach-vexpress/ct-ca9x4.c b/arch/arm/mach-vexpress/ct-ca9x4.c index 494d70bfddad..86150d7a2e7d 100644 --- a/arch/arm/mach-vexpress/ct-ca9x4.c +++ b/arch/arm/mach-vexpress/ct-ca9x4.c @@ -45,6 +45,23 @@ static void __init ct_ca9x4_map_io(void) iotable_init(ct_ca9x4_io_desc, ARRAY_SIZE(ct_ca9x4_io_desc)); } +static void __init ca9x4_l2_init(void) +{ +#ifdef CONFIG_CACHE_L2X0 + void __iomem *l2x0_base = ioremap(CT_CA9X4_L2CC, SZ_4K); + + if (l2x0_base) { + /* set RAM latencies to 1 cycle for this core tile. */ + writel(0, l2x0_base + L310_TAG_LATENCY_CTRL); + writel(0, l2x0_base + L310_DATA_LATENCY_CTRL); + + l2x0_init(l2x0_base, 0x00400000, 0xfe0fffff); + } else { + pr_err("L2C: unable to map L2 cache controller\n"); + } +#endif +} + #ifdef CONFIG_HAVE_ARM_TWD static DEFINE_TWD_LOCAL_TIMER(twd_local_timer, A9_MPCORE_TWD, IRQ_LOCALTIMER); @@ -63,6 +80,7 @@ static void __init ct_ca9x4_init_irq(void) gic_init(0, 29, ioremap(A9_MPCORE_GIC_DIST, SZ_4K), ioremap(A9_MPCORE_GIC_CPU, SZ_256)); ca9x4_twd_init(); + ca9x4_l2_init(); } static int ct_ca9x4_clcd_setup(struct clcd_fb *fb) @@ -146,16 +164,6 @@ static void __init ct_ca9x4_init(void) { int i; -#ifdef CONFIG_CACHE_L2X0 - void __iomem *l2x0_base = ioremap(CT_CA9X4_L2CC, SZ_4K); - - /* set RAM latencies to 1 cycle for this core tile. */ - writel(0, l2x0_base + L2X0_TAG_LATENCY_CTRL); - writel(0, l2x0_base + L2X0_DATA_LATENCY_CTRL); - - l2x0_init(l2x0_base, 0x00400000, 0xfe0fffff); -#endif - for (i = 0; i < ARRAY_SIZE(ct_ca9x4_amba_devs); i++) amba_device_register(ct_ca9x4_amba_devs[i], &iomem_resource); diff --git a/arch/arm/mach-vexpress/tc2_pm.c b/arch/arm/mach-vexpress/tc2_pm.c index 29e7785a54bc..b743a0ae02ce 100644 --- a/arch/arm/mach-vexpress/tc2_pm.c +++ b/arch/arm/mach-vexpress/tc2_pm.c @@ -209,7 +209,7 @@ static int tc2_core_in_reset(unsigned int cpu, unsigned int cluster) #define POLL_MSEC 10 #define TIMEOUT_MSEC 1000 -static int tc2_pm_power_down_finish(unsigned int cpu, unsigned int cluster) +static int tc2_pm_wait_for_powerdown(unsigned int cpu, unsigned int cluster) { unsigned tries; @@ -290,7 +290,7 @@ static void tc2_pm_powered_up(void) static const struct mcpm_platform_ops tc2_pm_power_ops = { .power_up = tc2_pm_power_up, .power_down = tc2_pm_power_down, - .power_down_finish = tc2_pm_power_down_finish, + .wait_for_powerdown = tc2_pm_wait_for_powerdown, .suspend = tc2_pm_suspend, .powered_up = tc2_pm_powered_up, }; diff --git a/arch/arm/mach-vexpress/v2m.c b/arch/arm/mach-vexpress/v2m.c index 38f4f6f37770..6ff681a24ba7 100644 --- a/arch/arm/mach-vexpress/v2m.c +++ b/arch/arm/mach-vexpress/v2m.c @@ -372,7 +372,6 @@ MACHINE_END static void __init v2m_dt_init(void) { - l2x0_of_init(0x00400000, 0xfe0fffff); of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); } @@ -383,6 +382,8 @@ static const char * const v2m_dt_match[] __initconst = { DT_MACHINE_START(VEXPRESS_DT, "ARM-Versatile Express") .dt_compat = v2m_dt_match, + .l2c_aux_val = 0x00400000, + .l2c_aux_mask = 0xfe0fffff, .smp = smp_ops(vexpress_smp_dt_ops), .smp_init = smp_init_ops(vexpress_smp_init_ops), .init_machine = v2m_dt_init, diff --git a/arch/arm/mach-zynq/common.c b/arch/arm/mach-zynq/common.c index edbd9d83f407..31a6fa40ba37 100644 --- a/arch/arm/mach-zynq/common.c +++ b/arch/arm/mach-zynq/common.c @@ -109,11 +109,6 @@ static void __init zynq_init_machine(void) struct soc_device *soc_dev; struct device *parent = NULL; - /* - * 64KB way size, 8-way associativity, parity disabled - */ - l2x0_of_init(0x02060000, 0xF0F0FFFF); - soc_dev_attr = kzalloc(sizeof(*soc_dev_attr), GFP_KERNEL); if (!soc_dev_attr) goto out; @@ -202,6 +197,9 @@ static const char * const zynq_dt_match[] = { }; DT_MACHINE_START(XILINX_EP107, "Xilinx Zynq Platform") + /* 64KB way size, 8-way associativity, parity disabled */ + .l2c_aux_val = 0x02000000, + .l2c_aux_mask = 0xf0ffffff, .smp = smp_ops(zynq_smp_ops), .map_io = zynq_map_io, .init_irq = zynq_irq_init, diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 5bf7c3c3b301..eda0dd0ab97b 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -897,6 +897,57 @@ config CACHE_PL310 This option enables optimisations for the PL310 cache controller. +config PL310_ERRATA_588369 + bool "PL310 errata: Clean & Invalidate maintenance operations do not invalidate clean lines" + depends on CACHE_L2X0 + help + The PL310 L2 cache controller implements three types of Clean & + Invalidate maintenance operations: by Physical Address + (offset 0x7F0), by Index/Way (0x7F8) and by Way (0x7FC). + They are architecturally defined to behave as the execution of a + clean operation followed immediately by an invalidate operation, + both performing to the same memory location. This functionality + is not correctly implemented in PL310 as clean lines are not + invalidated as a result of these operations. + +config PL310_ERRATA_727915 + bool "PL310 errata: Background Clean & Invalidate by Way operation can cause data corruption" + depends on CACHE_L2X0 + help + PL310 implements the Clean & Invalidate by Way L2 cache maintenance + operation (offset 0x7FC). This operation runs in background so that + PL310 can handle normal accesses while it is in progress. Under very + rare circumstances, due to this erratum, write data can be lost when + PL310 treats a cacheable write transaction during a Clean & + Invalidate by Way operation. + +config PL310_ERRATA_753970 + bool "PL310 errata: cache sync operation may be faulty" + depends on CACHE_PL310 + help + This option enables the workaround for the 753970 PL310 (r3p0) erratum. + + Under some condition the effect of cache sync operation on + the store buffer still remains when the operation completes. + This means that the store buffer is always asked to drain and + this prevents it from merging any further writes. The workaround + is to replace the normal offset of cache sync operation (0x730) + by another offset targeting an unmapped PL310 register 0x740. + This has the same effect as the cache sync operation: store buffer + drain and waiting for all buffers empty. + +config PL310_ERRATA_769419 + bool "PL310 errata: no automatic Store Buffer drain" + depends on CACHE_L2X0 + help + On revisions of the PL310 prior to r3p2, the Store Buffer does + not automatically drain. This can cause normal, non-cacheable + writes to be retained when the memory system is idle, leading + to suboptimal I/O performance for drivers using coherent DMA. + This option adds a write barrier to the cpu_idle loop so that, + on systems with an outer cache, the store buffer is drained + explicitly. + config CACHE_TAUROS2 bool "Enable the Tauros2 L2 cache controller" depends on (ARCH_DOVE || ARCH_MMP || CPU_PJ4) diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index 7f39ce2f841f..91da64de440f 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -95,7 +95,8 @@ obj-$(CONFIG_CPU_V7M) += proc-v7m.o AFLAGS_proc-v6.o :=-Wa,-march=armv6 AFLAGS_proc-v7.o :=-Wa,-march=armv7-a +obj-$(CONFIG_OUTER_CACHE) += l2c-common.o obj-$(CONFIG_CACHE_FEROCEON_L2) += cache-feroceon-l2.o -obj-$(CONFIG_CACHE_L2X0) += cache-l2x0.o +obj-$(CONFIG_CACHE_L2X0) += cache-l2x0.o l2c-l2x0-resume.o obj-$(CONFIG_CACHE_XSC3L2) += cache-xsc3l2.o obj-$(CONFIG_CACHE_TAUROS2) += cache-tauros2.o diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index 924036473b16..b8cb1a2688a0 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -28,6 +28,7 @@ #include <asm/opcodes.h> #include "fault.h" +#include "mm.h" /* * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 @@ -81,6 +82,7 @@ static unsigned long ai_word; static unsigned long ai_dword; static unsigned long ai_multi; static int ai_usermode; +static unsigned long cr_no_alignment; core_param(alignment, ai_usermode, int, 0600); @@ -91,7 +93,7 @@ core_param(alignment, ai_usermode, int, 0600); /* Return true if and only if the ARMv6 unaligned access model is in use. */ static bool cpu_is_v6_unaligned(void) { - return cpu_architecture() >= CPU_ARCH_ARMv6 && (cr_alignment & CR_U); + return cpu_architecture() >= CPU_ARCH_ARMv6 && get_cr() & CR_U; } static int safe_usermode(int new_usermode, bool warn) @@ -949,6 +951,13 @@ do_alignment(unsigned long addr, unsigned int fsr, struct pt_regs *regs) return 0; } +static int __init noalign_setup(char *__unused) +{ + set_cr(__clear_cr(CR_A)); + return 1; +} +__setup("noalign", noalign_setup); + /* * This needs to be done after sysctl_init, otherwise sys/ will be * overwritten. Actually, this shouldn't be in sys/ at all since @@ -966,14 +975,12 @@ static int __init alignment_init(void) return -ENOMEM; #endif -#ifdef CONFIG_CPU_CP15 if (cpu_is_v6_unaligned()) { - cr_alignment &= ~CR_A; - cr_no_alignment &= ~CR_A; - set_cr(cr_alignment); + set_cr(__clear_cr(CR_A)); ai_usermode = safe_usermode(ai_usermode, false); } -#endif + + cr_no_alignment = get_cr() & ~CR_A; hook_fault_code(FAULT_CODE_ALIGNMENT, do_alignment, SIGBUS, BUS_ADRALN, "alignment exception"); diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c index dc814a548056..e028a7f2ebcc 100644 --- a/arch/arm/mm/cache-feroceon-l2.c +++ b/arch/arm/mm/cache-feroceon-l2.c @@ -350,7 +350,6 @@ void __init feroceon_l2_init(int __l2_wt_override) outer_cache.inv_range = feroceon_l2_inv_range; outer_cache.clean_range = feroceon_l2_clean_range; outer_cache.flush_range = feroceon_l2_flush_range; - outer_cache.inv_all = l2_inv_all; enable_l2(); diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index 7abde2ce8973..efc5cabf70e0 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -16,18 +16,33 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/cpu.h> #include <linux/err.h> #include <linux/init.h> +#include <linux/smp.h> #include <linux/spinlock.h> #include <linux/io.h> #include <linux/of.h> #include <linux/of_address.h> #include <asm/cacheflush.h> +#include <asm/cp15.h> +#include <asm/cputype.h> #include <asm/hardware/cache-l2x0.h> #include "cache-tauros3.h" #include "cache-aurora-l2.h" +struct l2c_init_data { + const char *type; + unsigned way_size_0; + unsigned num_lock; + void (*of_parse)(const struct device_node *, u32 *, u32 *); + void (*enable)(void __iomem *, u32, unsigned); + void (*fixup)(void __iomem *, u32, struct outer_cache_fns *); + void (*save)(void __iomem *); + struct outer_cache_fns outer_cache; +}; + #define CACHE_LINE_SIZE 32 static void __iomem *l2x0_base; @@ -36,96 +51,116 @@ static u32 l2x0_way_mask; /* Bitmask of active ways */ static u32 l2x0_size; static unsigned long sync_reg_offset = L2X0_CACHE_SYNC; -/* Aurora don't have the cache ID register available, so we have to - * pass it though the device tree */ -static u32 cache_id_part_number_from_dt; - struct l2x0_regs l2x0_saved_regs; -struct l2x0_of_data { - void (*setup)(const struct device_node *, u32 *, u32 *); - void (*save)(void); - struct outer_cache_fns outer_cache; -}; - -static bool of_init = false; - -static inline void cache_wait_way(void __iomem *reg, unsigned long mask) +/* + * Common code for all cache controllers. + */ +static inline void l2c_wait_mask(void __iomem *reg, unsigned long mask) { /* wait for cache operation by line or way to complete */ while (readl_relaxed(reg) & mask) cpu_relax(); } -#ifdef CONFIG_CACHE_PL310 -static inline void cache_wait(void __iomem *reg, unsigned long mask) +/* + * By default, we write directly to secure registers. Platforms must + * override this if they are running non-secure. + */ +static void l2c_write_sec(unsigned long val, void __iomem *base, unsigned reg) { - /* cache operations by line are atomic on PL310 */ + if (val == readl_relaxed(base + reg)) + return; + if (outer_cache.write_sec) + outer_cache.write_sec(val, reg); + else + writel_relaxed(val, base + reg); } -#else -#define cache_wait cache_wait_way -#endif -static inline void cache_sync(void) +/* + * This should only be called when we have a requirement that the + * register be written due to a work-around, as platforms running + * in non-secure mode may not be able to access this register. + */ +static inline void l2c_set_debug(void __iomem *base, unsigned long val) { - void __iomem *base = l2x0_base; - - writel_relaxed(0, base + sync_reg_offset); - cache_wait(base + L2X0_CACHE_SYNC, 1); + l2c_write_sec(val, base, L2X0_DEBUG_CTRL); } -static inline void l2x0_clean_line(unsigned long addr) +static void __l2c_op_way(void __iomem *reg) { - void __iomem *base = l2x0_base; - cache_wait(base + L2X0_CLEAN_LINE_PA, 1); - writel_relaxed(addr, base + L2X0_CLEAN_LINE_PA); + writel_relaxed(l2x0_way_mask, reg); + l2c_wait_mask(reg, l2x0_way_mask); } -static inline void l2x0_inv_line(unsigned long addr) +static inline void l2c_unlock(void __iomem *base, unsigned num) { - void __iomem *base = l2x0_base; - cache_wait(base + L2X0_INV_LINE_PA, 1); - writel_relaxed(addr, base + L2X0_INV_LINE_PA); + unsigned i; + + for (i = 0; i < num; i++) { + writel_relaxed(0, base + L2X0_LOCKDOWN_WAY_D_BASE + + i * L2X0_LOCKDOWN_STRIDE); + writel_relaxed(0, base + L2X0_LOCKDOWN_WAY_I_BASE + + i * L2X0_LOCKDOWN_STRIDE); + } } -#if defined(CONFIG_PL310_ERRATA_588369) || defined(CONFIG_PL310_ERRATA_727915) -static inline void debug_writel(unsigned long val) +/* + * Enable the L2 cache controller. This function must only be + * called when the cache controller is known to be disabled. + */ +static void l2c_enable(void __iomem *base, u32 aux, unsigned num_lock) { - if (outer_cache.set_debug) - outer_cache.set_debug(val); + unsigned long flags; + + l2c_write_sec(aux, base, L2X0_AUX_CTRL); + + l2c_unlock(base, num_lock); + + local_irq_save(flags); + __l2c_op_way(base + L2X0_INV_WAY); + writel_relaxed(0, base + sync_reg_offset); + l2c_wait_mask(base + sync_reg_offset, 1); + local_irq_restore(flags); + + l2c_write_sec(L2X0_CTRL_EN, base, L2X0_CTRL); } -static void pl310_set_debug(unsigned long val) +static void l2c_disable(void) { - writel_relaxed(val, l2x0_base + L2X0_DEBUG_CTRL); + void __iomem *base = l2x0_base; + + outer_cache.flush_all(); + l2c_write_sec(0, base, L2X0_CTRL); + dsb(st); } -#else -/* Optimised out for non-errata case */ -static inline void debug_writel(unsigned long val) + +#ifdef CONFIG_CACHE_PL310 +static inline void cache_wait(void __iomem *reg, unsigned long mask) { + /* cache operations by line are atomic on PL310 */ } - -#define pl310_set_debug NULL +#else +#define cache_wait l2c_wait_mask #endif -#ifdef CONFIG_PL310_ERRATA_588369 -static inline void l2x0_flush_line(unsigned long addr) +static inline void cache_sync(void) { void __iomem *base = l2x0_base; - /* Clean by PA followed by Invalidate by PA */ - cache_wait(base + L2X0_CLEAN_LINE_PA, 1); - writel_relaxed(addr, base + L2X0_CLEAN_LINE_PA); - cache_wait(base + L2X0_INV_LINE_PA, 1); - writel_relaxed(addr, base + L2X0_INV_LINE_PA); + writel_relaxed(0, base + sync_reg_offset); + cache_wait(base + L2X0_CACHE_SYNC, 1); } -#else -static inline void l2x0_flush_line(unsigned long addr) +#if defined(CONFIG_PL310_ERRATA_588369) || defined(CONFIG_PL310_ERRATA_727915) +static inline void debug_writel(unsigned long val) +{ + l2c_set_debug(l2x0_base, val); +} +#else +/* Optimised out for non-errata case */ +static inline void debug_writel(unsigned long val) { - void __iomem *base = l2x0_base; - cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1); - writel_relaxed(addr, base + L2X0_CLEAN_INV_LINE_PA); } #endif @@ -141,8 +176,7 @@ static void l2x0_cache_sync(void) static void __l2x0_flush_all(void) { debug_writel(0x03); - writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_CLEAN_INV_WAY); - cache_wait_way(l2x0_base + L2X0_CLEAN_INV_WAY, l2x0_way_mask); + __l2c_op_way(l2x0_base + L2X0_CLEAN_INV_WAY); cache_sync(); debug_writel(0x00); } @@ -157,275 +191,883 @@ static void l2x0_flush_all(void) raw_spin_unlock_irqrestore(&l2x0_lock, flags); } -static void l2x0_clean_all(void) +static void l2x0_disable(void) { unsigned long flags; - /* clean all ways */ raw_spin_lock_irqsave(&l2x0_lock, flags); - writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_CLEAN_WAY); - cache_wait_way(l2x0_base + L2X0_CLEAN_WAY, l2x0_way_mask); - cache_sync(); + __l2x0_flush_all(); + l2c_write_sec(0, l2x0_base, L2X0_CTRL); + dsb(st); raw_spin_unlock_irqrestore(&l2x0_lock, flags); } -static void l2x0_inv_all(void) +static void l2c_save(void __iomem *base) { - unsigned long flags; + l2x0_saved_regs.aux_ctrl = readl_relaxed(l2x0_base + L2X0_AUX_CTRL); +} - /* invalidate all ways */ - raw_spin_lock_irqsave(&l2x0_lock, flags); - /* Invalidating when L2 is enabled is a nono */ - BUG_ON(readl(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN); - writel_relaxed(l2x0_way_mask, l2x0_base + L2X0_INV_WAY); - cache_wait_way(l2x0_base + L2X0_INV_WAY, l2x0_way_mask); - cache_sync(); - raw_spin_unlock_irqrestore(&l2x0_lock, flags); +/* + * L2C-210 specific code. + * + * The L2C-2x0 PA, set/way and sync operations are atomic, but we must + * ensure that no background operation is running. The way operations + * are all background tasks. + * + * While a background operation is in progress, any new operation is + * ignored (unspecified whether this causes an error.) Thankfully, not + * used on SMP. + * + * Never has a different sync register other than L2X0_CACHE_SYNC, but + * we use sync_reg_offset here so we can share some of this with L2C-310. + */ +static void __l2c210_cache_sync(void __iomem *base) +{ + writel_relaxed(0, base + sync_reg_offset); } -static void l2x0_inv_range(unsigned long start, unsigned long end) +static void __l2c210_op_pa_range(void __iomem *reg, unsigned long start, + unsigned long end) +{ + while (start < end) { + writel_relaxed(start, reg); + start += CACHE_LINE_SIZE; + } +} + +static void l2c210_inv_range(unsigned long start, unsigned long end) { void __iomem *base = l2x0_base; - unsigned long flags; - raw_spin_lock_irqsave(&l2x0_lock, flags); if (start & (CACHE_LINE_SIZE - 1)) { start &= ~(CACHE_LINE_SIZE - 1); - debug_writel(0x03); - l2x0_flush_line(start); - debug_writel(0x00); + writel_relaxed(start, base + L2X0_CLEAN_INV_LINE_PA); start += CACHE_LINE_SIZE; } if (end & (CACHE_LINE_SIZE - 1)) { end &= ~(CACHE_LINE_SIZE - 1); - debug_writel(0x03); - l2x0_flush_line(end); - debug_writel(0x00); + writel_relaxed(end, base + L2X0_CLEAN_INV_LINE_PA); } + __l2c210_op_pa_range(base + L2X0_INV_LINE_PA, start, end); + __l2c210_cache_sync(base); +} + +static void l2c210_clean_range(unsigned long start, unsigned long end) +{ + void __iomem *base = l2x0_base; + + start &= ~(CACHE_LINE_SIZE - 1); + __l2c210_op_pa_range(base + L2X0_CLEAN_LINE_PA, start, end); + __l2c210_cache_sync(base); +} + +static void l2c210_flush_range(unsigned long start, unsigned long end) +{ + void __iomem *base = l2x0_base; + + start &= ~(CACHE_LINE_SIZE - 1); + __l2c210_op_pa_range(base + L2X0_CLEAN_INV_LINE_PA, start, end); + __l2c210_cache_sync(base); +} + +static void l2c210_flush_all(void) +{ + void __iomem *base = l2x0_base; + + BUG_ON(!irqs_disabled()); + + __l2c_op_way(base + L2X0_CLEAN_INV_WAY); + __l2c210_cache_sync(base); +} + +static void l2c210_sync(void) +{ + __l2c210_cache_sync(l2x0_base); +} + +static void l2c210_resume(void) +{ + void __iomem *base = l2x0_base; + + if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN)) + l2c_enable(base, l2x0_saved_regs.aux_ctrl, 1); +} + +static const struct l2c_init_data l2c210_data __initconst = { + .type = "L2C-210", + .way_size_0 = SZ_8K, + .num_lock = 1, + .enable = l2c_enable, + .save = l2c_save, + .outer_cache = { + .inv_range = l2c210_inv_range, + .clean_range = l2c210_clean_range, + .flush_range = l2c210_flush_range, + .flush_all = l2c210_flush_all, + .disable = l2c_disable, + .sync = l2c210_sync, + .resume = l2c210_resume, + }, +}; + +/* + * L2C-220 specific code. + * + * All operations are background operations: they have to be waited for. + * Conflicting requests generate a slave error (which will cause an + * imprecise abort.) Never uses sync_reg_offset, so we hard-code the + * sync register here. + * + * However, we can re-use the l2c210_resume call. + */ +static inline void __l2c220_cache_sync(void __iomem *base) +{ + writel_relaxed(0, base + L2X0_CACHE_SYNC); + l2c_wait_mask(base + L2X0_CACHE_SYNC, 1); +} + +static void l2c220_op_way(void __iomem *base, unsigned reg) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&l2x0_lock, flags); + __l2c_op_way(base + reg); + __l2c220_cache_sync(base); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); +} + +static unsigned long l2c220_op_pa_range(void __iomem *reg, unsigned long start, + unsigned long end, unsigned long flags) +{ + raw_spinlock_t *lock = &l2x0_lock; + while (start < end) { unsigned long blk_end = start + min(end - start, 4096UL); while (start < blk_end) { - l2x0_inv_line(start); + l2c_wait_mask(reg, 1); + writel_relaxed(start, reg); start += CACHE_LINE_SIZE; } if (blk_end < end) { - raw_spin_unlock_irqrestore(&l2x0_lock, flags); - raw_spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(lock, flags); + raw_spin_lock_irqsave(lock, flags); } } - cache_wait(base + L2X0_INV_LINE_PA, 1); - cache_sync(); - raw_spin_unlock_irqrestore(&l2x0_lock, flags); + + return flags; } -static void l2x0_clean_range(unsigned long start, unsigned long end) +static void l2c220_inv_range(unsigned long start, unsigned long end) { void __iomem *base = l2x0_base; unsigned long flags; - if ((end - start) >= l2x0_size) { - l2x0_clean_all(); - return; - } - raw_spin_lock_irqsave(&l2x0_lock, flags); - start &= ~(CACHE_LINE_SIZE - 1); - while (start < end) { - unsigned long blk_end = start + min(end - start, 4096UL); - - while (start < blk_end) { - l2x0_clean_line(start); + if ((start | end) & (CACHE_LINE_SIZE - 1)) { + if (start & (CACHE_LINE_SIZE - 1)) { + start &= ~(CACHE_LINE_SIZE - 1); + writel_relaxed(start, base + L2X0_CLEAN_INV_LINE_PA); start += CACHE_LINE_SIZE; } - if (blk_end < end) { - raw_spin_unlock_irqrestore(&l2x0_lock, flags); - raw_spin_lock_irqsave(&l2x0_lock, flags); + if (end & (CACHE_LINE_SIZE - 1)) { + end &= ~(CACHE_LINE_SIZE - 1); + l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1); + writel_relaxed(end, base + L2X0_CLEAN_INV_LINE_PA); } } - cache_wait(base + L2X0_CLEAN_LINE_PA, 1); - cache_sync(); + + flags = l2c220_op_pa_range(base + L2X0_INV_LINE_PA, + start, end, flags); + l2c_wait_mask(base + L2X0_INV_LINE_PA, 1); + __l2c220_cache_sync(base); raw_spin_unlock_irqrestore(&l2x0_lock, flags); } -static void l2x0_flush_range(unsigned long start, unsigned long end) +static void l2c220_clean_range(unsigned long start, unsigned long end) { void __iomem *base = l2x0_base; unsigned long flags; + start &= ~(CACHE_LINE_SIZE - 1); if ((end - start) >= l2x0_size) { - l2x0_flush_all(); + l2c220_op_way(base, L2X0_CLEAN_WAY); return; } raw_spin_lock_irqsave(&l2x0_lock, flags); + flags = l2c220_op_pa_range(base + L2X0_CLEAN_LINE_PA, + start, end, flags); + l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1); + __l2c220_cache_sync(base); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); +} + +static void l2c220_flush_range(unsigned long start, unsigned long end) +{ + void __iomem *base = l2x0_base; + unsigned long flags; + start &= ~(CACHE_LINE_SIZE - 1); + if ((end - start) >= l2x0_size) { + l2c220_op_way(base, L2X0_CLEAN_INV_WAY); + return; + } + + raw_spin_lock_irqsave(&l2x0_lock, flags); + flags = l2c220_op_pa_range(base + L2X0_CLEAN_INV_LINE_PA, + start, end, flags); + l2c_wait_mask(base + L2X0_CLEAN_INV_LINE_PA, 1); + __l2c220_cache_sync(base); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); +} + +static void l2c220_flush_all(void) +{ + l2c220_op_way(l2x0_base, L2X0_CLEAN_INV_WAY); +} + +static void l2c220_sync(void) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&l2x0_lock, flags); + __l2c220_cache_sync(l2x0_base); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); +} + +static void l2c220_enable(void __iomem *base, u32 aux, unsigned num_lock) +{ + /* + * Always enable non-secure access to the lockdown registers - + * we write to them as part of the L2C enable sequence so they + * need to be accessible. + */ + aux |= L220_AUX_CTRL_NS_LOCKDOWN; + + l2c_enable(base, aux, num_lock); +} + +static const struct l2c_init_data l2c220_data = { + .type = "L2C-220", + .way_size_0 = SZ_8K, + .num_lock = 1, + .enable = l2c220_enable, + .save = l2c_save, + .outer_cache = { + .inv_range = l2c220_inv_range, + .clean_range = l2c220_clean_range, + .flush_range = l2c220_flush_range, + .flush_all = l2c220_flush_all, + .disable = l2c_disable, + .sync = l2c220_sync, + .resume = l2c210_resume, + }, +}; + +/* + * L2C-310 specific code. + * + * Very similar to L2C-210, the PA, set/way and sync operations are atomic, + * and the way operations are all background tasks. However, issuing an + * operation while a background operation is in progress results in a + * SLVERR response. We can reuse: + * + * __l2c210_cache_sync (using sync_reg_offset) + * l2c210_sync + * l2c210_inv_range (if 588369 is not applicable) + * l2c210_clean_range + * l2c210_flush_range (if 588369 is not applicable) + * l2c210_flush_all (if 727915 is not applicable) + * + * Errata: + * 588369: PL310 R0P0->R1P0, fixed R2P0. + * Affects: all clean+invalidate operations + * clean and invalidate skips the invalidate step, so we need to issue + * separate operations. We also require the above debug workaround + * enclosing this code fragment on affected parts. On unaffected parts, + * we must not use this workaround without the debug register writes + * to avoid exposing a problem similar to 727915. + * + * 727915: PL310 R2P0->R3P0, fixed R3P1. + * Affects: clean+invalidate by way + * clean and invalidate by way runs in the background, and a store can + * hit the line between the clean operation and invalidate operation, + * resulting in the store being lost. + * + * 752271: PL310 R3P0->R3P1-50REL0, fixed R3P2. + * Affects: 8x64-bit (double fill) line fetches + * double fill line fetches can fail to cause dirty data to be evicted + * from the cache before the new data overwrites the second line. + * + * 753970: PL310 R3P0, fixed R3P1. + * Affects: sync + * prevents merging writes after the sync operation, until another L2C + * operation is performed (or a number of other conditions.) + * + * 769419: PL310 R0P0->R3P1, fixed R3P2. + * Affects: store buffer + * store buffer is not automatically drained. + */ +static void l2c310_inv_range_erratum(unsigned long start, unsigned long end) +{ + void __iomem *base = l2x0_base; + + if ((start | end) & (CACHE_LINE_SIZE - 1)) { + unsigned long flags; + + /* Erratum 588369 for both clean+invalidate operations */ + raw_spin_lock_irqsave(&l2x0_lock, flags); + l2c_set_debug(base, 0x03); + + if (start & (CACHE_LINE_SIZE - 1)) { + start &= ~(CACHE_LINE_SIZE - 1); + writel_relaxed(start, base + L2X0_CLEAN_LINE_PA); + writel_relaxed(start, base + L2X0_INV_LINE_PA); + start += CACHE_LINE_SIZE; + } + + if (end & (CACHE_LINE_SIZE - 1)) { + end &= ~(CACHE_LINE_SIZE - 1); + writel_relaxed(end, base + L2X0_CLEAN_LINE_PA); + writel_relaxed(end, base + L2X0_INV_LINE_PA); + } + + l2c_set_debug(base, 0x00); + raw_spin_unlock_irqrestore(&l2x0_lock, flags); + } + + __l2c210_op_pa_range(base + L2X0_INV_LINE_PA, start, end); + __l2c210_cache_sync(base); +} + +static void l2c310_flush_range_erratum(unsigned long start, unsigned long end) +{ + raw_spinlock_t *lock = &l2x0_lock; + unsigned long flags; + void __iomem *base = l2x0_base; + + raw_spin_lock_irqsave(lock, flags); while (start < end) { unsigned long blk_end = start + min(end - start, 4096UL); - debug_writel(0x03); + l2c_set_debug(base, 0x03); while (start < blk_end) { - l2x0_flush_line(start); + writel_relaxed(start, base + L2X0_CLEAN_LINE_PA); + writel_relaxed(start, base + L2X0_INV_LINE_PA); start += CACHE_LINE_SIZE; } - debug_writel(0x00); + l2c_set_debug(base, 0x00); if (blk_end < end) { - raw_spin_unlock_irqrestore(&l2x0_lock, flags); - raw_spin_lock_irqsave(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(lock, flags); + raw_spin_lock_irqsave(lock, flags); } } - cache_wait(base + L2X0_CLEAN_INV_LINE_PA, 1); - cache_sync(); - raw_spin_unlock_irqrestore(&l2x0_lock, flags); + raw_spin_unlock_irqrestore(lock, flags); + __l2c210_cache_sync(base); } -static void l2x0_disable(void) +static void l2c310_flush_all_erratum(void) { + void __iomem *base = l2x0_base; unsigned long flags; raw_spin_lock_irqsave(&l2x0_lock, flags); - __l2x0_flush_all(); - writel_relaxed(0, l2x0_base + L2X0_CTRL); - dsb(st); + l2c_set_debug(base, 0x03); + __l2c_op_way(base + L2X0_CLEAN_INV_WAY); + l2c_set_debug(base, 0x00); + __l2c210_cache_sync(base); raw_spin_unlock_irqrestore(&l2x0_lock, flags); } -static void l2x0_unlock(u32 cache_id) +static void __init l2c310_save(void __iomem *base) { - int lockregs; - int i; + unsigned revision; - switch (cache_id & L2X0_CACHE_ID_PART_MASK) { - case L2X0_CACHE_ID_PART_L310: - lockregs = 8; - break; - case AURORA_CACHE_ID: - lockregs = 4; + l2c_save(base); + + l2x0_saved_regs.tag_latency = readl_relaxed(base + + L310_TAG_LATENCY_CTRL); + l2x0_saved_regs.data_latency = readl_relaxed(base + + L310_DATA_LATENCY_CTRL); + l2x0_saved_regs.filter_end = readl_relaxed(base + + L310_ADDR_FILTER_END); + l2x0_saved_regs.filter_start = readl_relaxed(base + + L310_ADDR_FILTER_START); + + revision = readl_relaxed(base + L2X0_CACHE_ID) & + L2X0_CACHE_ID_RTL_MASK; + + /* From r2p0, there is Prefetch offset/control register */ + if (revision >= L310_CACHE_ID_RTL_R2P0) + l2x0_saved_regs.prefetch_ctrl = readl_relaxed(base + + L310_PREFETCH_CTRL); + + /* From r3p0, there is Power control register */ + if (revision >= L310_CACHE_ID_RTL_R3P0) + l2x0_saved_regs.pwr_ctrl = readl_relaxed(base + + L310_POWER_CTRL); +} + +static void l2c310_resume(void) +{ + void __iomem *base = l2x0_base; + + if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN)) { + unsigned revision; + + /* restore pl310 setup */ + writel_relaxed(l2x0_saved_regs.tag_latency, + base + L310_TAG_LATENCY_CTRL); + writel_relaxed(l2x0_saved_regs.data_latency, + base + L310_DATA_LATENCY_CTRL); + writel_relaxed(l2x0_saved_regs.filter_end, + base + L310_ADDR_FILTER_END); + writel_relaxed(l2x0_saved_regs.filter_start, + base + L310_ADDR_FILTER_START); + + revision = readl_relaxed(base + L2X0_CACHE_ID) & + L2X0_CACHE_ID_RTL_MASK; + + if (revision >= L310_CACHE_ID_RTL_R2P0) + l2c_write_sec(l2x0_saved_regs.prefetch_ctrl, base, + L310_PREFETCH_CTRL); + if (revision >= L310_CACHE_ID_RTL_R3P0) + l2c_write_sec(l2x0_saved_regs.pwr_ctrl, base, + L310_POWER_CTRL); + + l2c_enable(base, l2x0_saved_regs.aux_ctrl, 8); + + /* Re-enable full-line-of-zeros for Cortex-A9 */ + if (l2x0_saved_regs.aux_ctrl & L310_AUX_CTRL_FULL_LINE_ZERO) + set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1)); + } +} + +static int l2c310_cpu_enable_flz(struct notifier_block *nb, unsigned long act, void *data) +{ + switch (act & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1)); break; - default: - /* L210 and unknown types */ - lockregs = 1; + case CPU_DYING: + set_auxcr(get_auxcr() & ~(BIT(3) | BIT(2) | BIT(1))); break; } + return NOTIFY_OK; +} - for (i = 0; i < lockregs; i++) { - writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_D_BASE + - i * L2X0_LOCKDOWN_STRIDE); - writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_I_BASE + - i * L2X0_LOCKDOWN_STRIDE); +static void __init l2c310_enable(void __iomem *base, u32 aux, unsigned num_lock) +{ + unsigned rev = readl_relaxed(base + L2X0_CACHE_ID) & L2X0_CACHE_ID_PART_MASK; + bool cortex_a9 = read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9; + + if (rev >= L310_CACHE_ID_RTL_R2P0) { + if (cortex_a9) { + aux |= L310_AUX_CTRL_EARLY_BRESP; + pr_info("L2C-310 enabling early BRESP for Cortex-A9\n"); + } else if (aux & L310_AUX_CTRL_EARLY_BRESP) { + pr_warn("L2C-310 early BRESP only supported with Cortex-A9\n"); + aux &= ~L310_AUX_CTRL_EARLY_BRESP; + } + } + + if (cortex_a9) { + u32 aux_cur = readl_relaxed(base + L2X0_AUX_CTRL); + u32 acr = get_auxcr(); + + pr_debug("Cortex-A9 ACR=0x%08x\n", acr); + + if (acr & BIT(3) && !(aux_cur & L310_AUX_CTRL_FULL_LINE_ZERO)) + pr_err("L2C-310: full line of zeros enabled in Cortex-A9 but not L2C-310 - invalid\n"); + + if (aux & L310_AUX_CTRL_FULL_LINE_ZERO && !(acr & BIT(3))) + pr_err("L2C-310: enabling full line of zeros but not enabled in Cortex-A9\n"); + + if (!(aux & L310_AUX_CTRL_FULL_LINE_ZERO) && !outer_cache.write_sec) { + aux |= L310_AUX_CTRL_FULL_LINE_ZERO; + pr_info("L2C-310 full line of zeros enabled for Cortex-A9\n"); + } + } else if (aux & (L310_AUX_CTRL_FULL_LINE_ZERO | L310_AUX_CTRL_EARLY_BRESP)) { + pr_err("L2C-310: disabling Cortex-A9 specific feature bits\n"); + aux &= ~(L310_AUX_CTRL_FULL_LINE_ZERO | L310_AUX_CTRL_EARLY_BRESP); + } + + if (aux & (L310_AUX_CTRL_DATA_PREFETCH | L310_AUX_CTRL_INSTR_PREFETCH)) { + u32 prefetch = readl_relaxed(base + L310_PREFETCH_CTRL); + + pr_info("L2C-310 %s%s prefetch enabled, offset %u lines\n", + aux & L310_AUX_CTRL_INSTR_PREFETCH ? "I" : "", + aux & L310_AUX_CTRL_DATA_PREFETCH ? "D" : "", + 1 + (prefetch & L310_PREFETCH_CTRL_OFFSET_MASK)); + } + + /* r3p0 or later has power control register */ + if (rev >= L310_CACHE_ID_RTL_R3P0) { + u32 power_ctrl; + + l2c_write_sec(L310_DYNAMIC_CLK_GATING_EN | L310_STNDBY_MODE_EN, + base, L310_POWER_CTRL); + power_ctrl = readl_relaxed(base + L310_POWER_CTRL); + pr_info("L2C-310 dynamic clock gating %sabled, standby mode %sabled\n", + power_ctrl & L310_DYNAMIC_CLK_GATING_EN ? "en" : "dis", + power_ctrl & L310_STNDBY_MODE_EN ? "en" : "dis"); + } + + /* + * Always enable non-secure access to the lockdown registers - + * we write to them as part of the L2C enable sequence so they + * need to be accessible. + */ + aux |= L310_AUX_CTRL_NS_LOCKDOWN; + + l2c_enable(base, aux, num_lock); + + if (aux & L310_AUX_CTRL_FULL_LINE_ZERO) { + set_auxcr(get_auxcr() | BIT(3) | BIT(2) | BIT(1)); + cpu_notifier(l2c310_cpu_enable_flz, 0); } } -void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask) +static void __init l2c310_fixup(void __iomem *base, u32 cache_id, + struct outer_cache_fns *fns) { - u32 aux; - u32 cache_id; - u32 way_size = 0; - int ways; - int way_size_shift = L2X0_WAY_SIZE_SHIFT; - const char *type; + unsigned revision = cache_id & L2X0_CACHE_ID_RTL_MASK; + const char *errata[8]; + unsigned n = 0; + + if (IS_ENABLED(CONFIG_PL310_ERRATA_588369) && + revision < L310_CACHE_ID_RTL_R2P0 && + /* For bcm compatibility */ + fns->inv_range == l2c210_inv_range) { + fns->inv_range = l2c310_inv_range_erratum; + fns->flush_range = l2c310_flush_range_erratum; + errata[n++] = "588369"; + } - l2x0_base = base; - if (cache_id_part_number_from_dt) - cache_id = cache_id_part_number_from_dt; - else - cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID); - aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL); + if (IS_ENABLED(CONFIG_PL310_ERRATA_727915) && + revision >= L310_CACHE_ID_RTL_R2P0 && + revision < L310_CACHE_ID_RTL_R3P1) { + fns->flush_all = l2c310_flush_all_erratum; + errata[n++] = "727915"; + } + + if (revision >= L310_CACHE_ID_RTL_R3P0 && + revision < L310_CACHE_ID_RTL_R3P2) { + u32 val = readl_relaxed(base + L310_PREFETCH_CTRL); + /* I don't think bit23 is required here... but iMX6 does so */ + if (val & (BIT(30) | BIT(23))) { + val &= ~(BIT(30) | BIT(23)); + l2c_write_sec(val, base, L310_PREFETCH_CTRL); + errata[n++] = "752271"; + } + } + + if (IS_ENABLED(CONFIG_PL310_ERRATA_753970) && + revision == L310_CACHE_ID_RTL_R3P0) { + sync_reg_offset = L2X0_DUMMY_REG; + errata[n++] = "753970"; + } + + if (IS_ENABLED(CONFIG_PL310_ERRATA_769419)) + errata[n++] = "769419"; + + if (n) { + unsigned i; + pr_info("L2C-310 errat%s", n > 1 ? "a" : "um"); + for (i = 0; i < n; i++) + pr_cont(" %s", errata[i]); + pr_cont(" enabled\n"); + } +} + +static void l2c310_disable(void) +{ + /* + * If full-line-of-zeros is enabled, we must first disable it in the + * Cortex-A9 auxiliary control register before disabling the L2 cache. + */ + if (l2x0_saved_regs.aux_ctrl & L310_AUX_CTRL_FULL_LINE_ZERO) + set_auxcr(get_auxcr() & ~(BIT(3) | BIT(2) | BIT(1))); + + l2c_disable(); +} + +static const struct l2c_init_data l2c310_init_fns __initconst = { + .type = "L2C-310", + .way_size_0 = SZ_8K, + .num_lock = 8, + .enable = l2c310_enable, + .fixup = l2c310_fixup, + .save = l2c310_save, + .outer_cache = { + .inv_range = l2c210_inv_range, + .clean_range = l2c210_clean_range, + .flush_range = l2c210_flush_range, + .flush_all = l2c210_flush_all, + .disable = l2c310_disable, + .sync = l2c210_sync, + .resume = l2c310_resume, + }, +}; + +static void __init __l2c_init(const struct l2c_init_data *data, + u32 aux_val, u32 aux_mask, u32 cache_id) +{ + struct outer_cache_fns fns; + unsigned way_size_bits, ways; + u32 aux, old_aux; + + /* + * Sanity check the aux values. aux_mask is the bits we preserve + * from reading the hardware register, and aux_val is the bits we + * set. + */ + if (aux_val & aux_mask) + pr_alert("L2C: platform provided aux values permit register corruption.\n"); + + old_aux = aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL); aux &= aux_mask; aux |= aux_val; + if (old_aux != aux) + pr_warn("L2C: DT/platform modifies aux control register: 0x%08x -> 0x%08x\n", + old_aux, aux); + /* Determine the number of ways */ switch (cache_id & L2X0_CACHE_ID_PART_MASK) { case L2X0_CACHE_ID_PART_L310: + if ((aux_val | ~aux_mask) & (L2C_AUX_CTRL_WAY_SIZE_MASK | L310_AUX_CTRL_ASSOCIATIVITY_16)) + pr_warn("L2C: DT/platform tries to modify or specify cache size\n"); if (aux & (1 << 16)) ways = 16; else ways = 8; - type = "L310"; -#ifdef CONFIG_PL310_ERRATA_753970 - /* Unmapped register. */ - sync_reg_offset = L2X0_DUMMY_REG; -#endif - if ((cache_id & L2X0_CACHE_ID_RTL_MASK) <= L2X0_CACHE_ID_RTL_R3P0) - outer_cache.set_debug = pl310_set_debug; break; + case L2X0_CACHE_ID_PART_L210: + case L2X0_CACHE_ID_PART_L220: ways = (aux >> 13) & 0xf; - type = "L210"; break; case AURORA_CACHE_ID: - sync_reg_offset = AURORA_SYNC_REG; ways = (aux >> 13) & 0xf; ways = 2 << ((ways + 1) >> 2); - way_size_shift = AURORA_WAY_SIZE_SHIFT; - type = "Aurora"; break; + default: /* Assume unknown chips have 8 ways */ ways = 8; - type = "L2x0 series"; break; } l2x0_way_mask = (1 << ways) - 1; /* - * L2 cache Size = Way size * Number of ways + * way_size_0 is the size that a way_size value of zero would be + * given the calculation: way_size = way_size_0 << way_size_bits. + * So, if way_size_bits=0 is reserved, but way_size_bits=1 is 16k, + * then way_size_0 would be 8k. + * + * L2 cache size = number of ways * way size. */ - way_size = (aux & L2X0_AUX_CTRL_WAY_SIZE_MASK) >> 17; - way_size = 1 << (way_size + way_size_shift); + way_size_bits = (aux & L2C_AUX_CTRL_WAY_SIZE_MASK) >> + L2C_AUX_CTRL_WAY_SIZE_SHIFT; + l2x0_size = ways * (data->way_size_0 << way_size_bits); - l2x0_size = ways * way_size * SZ_1K; + fns = data->outer_cache; + fns.write_sec = outer_cache.write_sec; + if (data->fixup) + data->fixup(l2x0_base, cache_id, &fns); /* - * Check if l2x0 controller is already enabled. - * If you are booting from non-secure mode - * accessing the below registers will fault. + * Check if l2x0 controller is already enabled. If we are booting + * in non-secure mode accessing the below registers will fault. */ - if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) { - /* Make sure that I&D is not locked down when starting */ - l2x0_unlock(cache_id); + if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) + data->enable(l2x0_base, aux, data->num_lock); - /* l2x0 controller is disabled */ - writel_relaxed(aux, l2x0_base + L2X0_AUX_CTRL); + outer_cache = fns; - l2x0_inv_all(); - - /* enable L2X0 */ - writel_relaxed(L2X0_CTRL_EN, l2x0_base + L2X0_CTRL); - } + /* + * It is strange to save the register state before initialisation, + * but hey, this is what the DT implementations decided to do. + */ + if (data->save) + data->save(l2x0_base); /* Re-read it in case some bits are reserved. */ aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL); - /* Save the value for resuming. */ - l2x0_saved_regs.aux_ctrl = aux; + pr_info("%s cache controller enabled, %d ways, %d kB\n", + data->type, ways, l2x0_size >> 10); + pr_info("%s: CACHE_ID 0x%08x, AUX_CTRL 0x%08x\n", + data->type, cache_id, aux); +} + +void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask) +{ + const struct l2c_init_data *data; + u32 cache_id; + + l2x0_base = base; + + cache_id = readl_relaxed(base + L2X0_CACHE_ID); + + switch (cache_id & L2X0_CACHE_ID_PART_MASK) { + default: + case L2X0_CACHE_ID_PART_L210: + data = &l2c210_data; + break; - if (!of_init) { - outer_cache.inv_range = l2x0_inv_range; - outer_cache.clean_range = l2x0_clean_range; - outer_cache.flush_range = l2x0_flush_range; - outer_cache.sync = l2x0_cache_sync; - outer_cache.flush_all = l2x0_flush_all; - outer_cache.inv_all = l2x0_inv_all; - outer_cache.disable = l2x0_disable; + case L2X0_CACHE_ID_PART_L220: + data = &l2c220_data; + break; + + case L2X0_CACHE_ID_PART_L310: + data = &l2c310_init_fns; + break; } - pr_info("%s cache controller enabled\n", type); - pr_info("l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, Cache size: %d kB\n", - ways, cache_id, aux, l2x0_size >> 10); + __l2c_init(data, aux_val, aux_mask, cache_id); } #ifdef CONFIG_OF static int l2_wt_override; +/* Aurora don't have the cache ID register available, so we have to + * pass it though the device tree */ +static u32 cache_id_part_number_from_dt; + +static void __init l2x0_of_parse(const struct device_node *np, + u32 *aux_val, u32 *aux_mask) +{ + u32 data[2] = { 0, 0 }; + u32 tag = 0; + u32 dirty = 0; + u32 val = 0, mask = 0; + + of_property_read_u32(np, "arm,tag-latency", &tag); + if (tag) { + mask |= L2X0_AUX_CTRL_TAG_LATENCY_MASK; + val |= (tag - 1) << L2X0_AUX_CTRL_TAG_LATENCY_SHIFT; + } + + of_property_read_u32_array(np, "arm,data-latency", + data, ARRAY_SIZE(data)); + if (data[0] && data[1]) { + mask |= L2X0_AUX_CTRL_DATA_RD_LATENCY_MASK | + L2X0_AUX_CTRL_DATA_WR_LATENCY_MASK; + val |= ((data[0] - 1) << L2X0_AUX_CTRL_DATA_RD_LATENCY_SHIFT) | + ((data[1] - 1) << L2X0_AUX_CTRL_DATA_WR_LATENCY_SHIFT); + } + + of_property_read_u32(np, "arm,dirty-latency", &dirty); + if (dirty) { + mask |= L2X0_AUX_CTRL_DIRTY_LATENCY_MASK; + val |= (dirty - 1) << L2X0_AUX_CTRL_DIRTY_LATENCY_SHIFT; + } + + *aux_val &= ~mask; + *aux_val |= val; + *aux_mask &= ~mask; +} + +static const struct l2c_init_data of_l2c210_data __initconst = { + .type = "L2C-210", + .way_size_0 = SZ_8K, + .num_lock = 1, + .of_parse = l2x0_of_parse, + .enable = l2c_enable, + .save = l2c_save, + .outer_cache = { + .inv_range = l2c210_inv_range, + .clean_range = l2c210_clean_range, + .flush_range = l2c210_flush_range, + .flush_all = l2c210_flush_all, + .disable = l2c_disable, + .sync = l2c210_sync, + .resume = l2c210_resume, + }, +}; + +static const struct l2c_init_data of_l2c220_data __initconst = { + .type = "L2C-220", + .way_size_0 = SZ_8K, + .num_lock = 1, + .of_parse = l2x0_of_parse, + .enable = l2c220_enable, + .save = l2c_save, + .outer_cache = { + .inv_range = l2c220_inv_range, + .clean_range = l2c220_clean_range, + .flush_range = l2c220_flush_range, + .flush_all = l2c220_flush_all, + .disable = l2c_disable, + .sync = l2c220_sync, + .resume = l2c210_resume, + }, +}; + +static void __init l2c310_of_parse(const struct device_node *np, + u32 *aux_val, u32 *aux_mask) +{ + u32 data[3] = { 0, 0, 0 }; + u32 tag[3] = { 0, 0, 0 }; + u32 filter[2] = { 0, 0 }; + + of_property_read_u32_array(np, "arm,tag-latency", tag, ARRAY_SIZE(tag)); + if (tag[0] && tag[1] && tag[2]) + writel_relaxed( + L310_LATENCY_CTRL_RD(tag[0] - 1) | + L310_LATENCY_CTRL_WR(tag[1] - 1) | + L310_LATENCY_CTRL_SETUP(tag[2] - 1), + l2x0_base + L310_TAG_LATENCY_CTRL); + + of_property_read_u32_array(np, "arm,data-latency", + data, ARRAY_SIZE(data)); + if (data[0] && data[1] && data[2]) + writel_relaxed( + L310_LATENCY_CTRL_RD(data[0] - 1) | + L310_LATENCY_CTRL_WR(data[1] - 1) | + L310_LATENCY_CTRL_SETUP(data[2] - 1), + l2x0_base + L310_DATA_LATENCY_CTRL); + + of_property_read_u32_array(np, "arm,filter-ranges", + filter, ARRAY_SIZE(filter)); + if (filter[1]) { + writel_relaxed(ALIGN(filter[0] + filter[1], SZ_1M), + l2x0_base + L310_ADDR_FILTER_END); + writel_relaxed((filter[0] & ~(SZ_1M - 1)) | L310_ADDR_FILTER_EN, + l2x0_base + L310_ADDR_FILTER_START); + } +} + +static const struct l2c_init_data of_l2c310_data __initconst = { + .type = "L2C-310", + .way_size_0 = SZ_8K, + .num_lock = 8, + .of_parse = l2c310_of_parse, + .enable = l2c310_enable, + .fixup = l2c310_fixup, + .save = l2c310_save, + .outer_cache = { + .inv_range = l2c210_inv_range, + .clean_range = l2c210_clean_range, + .flush_range = l2c210_flush_range, + .flush_all = l2c210_flush_all, + .disable = l2c310_disable, + .sync = l2c210_sync, + .resume = l2c310_resume, + }, +}; + /* * Note that the end addresses passed to Linux primitives are * noninclusive, while the hardware cache range operations use @@ -524,6 +1166,100 @@ static void aurora_flush_range(unsigned long start, unsigned long end) } } +static void aurora_save(void __iomem *base) +{ + l2x0_saved_regs.ctrl = readl_relaxed(base + L2X0_CTRL); + l2x0_saved_regs.aux_ctrl = readl_relaxed(base + L2X0_AUX_CTRL); +} + +static void aurora_resume(void) +{ + void __iomem *base = l2x0_base; + + if (!(readl(base + L2X0_CTRL) & L2X0_CTRL_EN)) { + writel_relaxed(l2x0_saved_regs.aux_ctrl, base + L2X0_AUX_CTRL); + writel_relaxed(l2x0_saved_regs.ctrl, base + L2X0_CTRL); + } +} + +/* + * For Aurora cache in no outer mode, enable via the CP15 coprocessor + * broadcasting of cache commands to L2. + */ +static void __init aurora_enable_no_outer(void __iomem *base, u32 aux, + unsigned num_lock) +{ + u32 u; + + asm volatile("mrc p15, 1, %0, c15, c2, 0" : "=r" (u)); + u |= AURORA_CTRL_FW; /* Set the FW bit */ + asm volatile("mcr p15, 1, %0, c15, c2, 0" : : "r" (u)); + + isb(); + + l2c_enable(base, aux, num_lock); +} + +static void __init aurora_fixup(void __iomem *base, u32 cache_id, + struct outer_cache_fns *fns) +{ + sync_reg_offset = AURORA_SYNC_REG; +} + +static void __init aurora_of_parse(const struct device_node *np, + u32 *aux_val, u32 *aux_mask) +{ + u32 val = AURORA_ACR_REPLACEMENT_TYPE_SEMIPLRU; + u32 mask = AURORA_ACR_REPLACEMENT_MASK; + + of_property_read_u32(np, "cache-id-part", + &cache_id_part_number_from_dt); + + /* Determine and save the write policy */ + l2_wt_override = of_property_read_bool(np, "wt-override"); + + if (l2_wt_override) { + val |= AURORA_ACR_FORCE_WRITE_THRO_POLICY; + mask |= AURORA_ACR_FORCE_WRITE_POLICY_MASK; + } + + *aux_val &= ~mask; + *aux_val |= val; + *aux_mask &= ~mask; +} + +static const struct l2c_init_data of_aurora_with_outer_data __initconst = { + .type = "Aurora", + .way_size_0 = SZ_4K, + .num_lock = 4, + .of_parse = aurora_of_parse, + .enable = l2c_enable, + .fixup = aurora_fixup, + .save = aurora_save, + .outer_cache = { + .inv_range = aurora_inv_range, + .clean_range = aurora_clean_range, + .flush_range = aurora_flush_range, + .flush_all = l2x0_flush_all, + .disable = l2x0_disable, + .sync = l2x0_cache_sync, + .resume = aurora_resume, + }, +}; + +static const struct l2c_init_data of_aurora_no_outer_data __initconst = { + .type = "Aurora", + .way_size_0 = SZ_4K, + .num_lock = 4, + .of_parse = aurora_of_parse, + .enable = aurora_enable_no_outer, + .fixup = aurora_fixup, + .save = aurora_save, + .outer_cache = { + .resume = aurora_resume, + }, +}; + /* * For certain Broadcom SoCs, depending on the address range, different offsets * need to be added to the address before passing it to L2 for @@ -588,16 +1324,16 @@ static void bcm_inv_range(unsigned long start, unsigned long end) /* normal case, no cross section between start and end */ if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) { - l2x0_inv_range(new_start, new_end); + l2c210_inv_range(new_start, new_end); return; } /* They cross sections, so it can only be a cross from section * 2 to section 3 */ - l2x0_inv_range(new_start, + l2c210_inv_range(new_start, bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1)); - l2x0_inv_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR), + l2c210_inv_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR), new_end); } @@ -610,26 +1346,21 @@ static void bcm_clean_range(unsigned long start, unsigned long end) if (unlikely(end <= start)) return; - if ((end - start) >= l2x0_size) { - l2x0_clean_all(); - return; - } - new_start = bcm_l2_phys_addr(start); new_end = bcm_l2_phys_addr(end); /* normal case, no cross section between start and end */ if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) { - l2x0_clean_range(new_start, new_end); + l2c210_clean_range(new_start, new_end); return; } /* They cross sections, so it can only be a cross from section * 2 to section 3 */ - l2x0_clean_range(new_start, + l2c210_clean_range(new_start, bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1)); - l2x0_clean_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR), + l2c210_clean_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR), new_end); } @@ -643,7 +1374,7 @@ static void bcm_flush_range(unsigned long start, unsigned long end) return; if ((end - start) >= l2x0_size) { - l2x0_flush_all(); + outer_cache.flush_all(); return; } @@ -652,283 +1383,67 @@ static void bcm_flush_range(unsigned long start, unsigned long end) /* normal case, no cross section between start and end */ if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) { - l2x0_flush_range(new_start, new_end); + l2c210_flush_range(new_start, new_end); return; } /* They cross sections, so it can only be a cross from section * 2 to section 3 */ - l2x0_flush_range(new_start, + l2c210_flush_range(new_start, bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1)); - l2x0_flush_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR), + l2c210_flush_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR), new_end); } -static void __init l2x0_of_setup(const struct device_node *np, - u32 *aux_val, u32 *aux_mask) -{ - u32 data[2] = { 0, 0 }; - u32 tag = 0; - u32 dirty = 0; - u32 val = 0, mask = 0; - - of_property_read_u32(np, "arm,tag-latency", &tag); - if (tag) { - mask |= L2X0_AUX_CTRL_TAG_LATENCY_MASK; - val |= (tag - 1) << L2X0_AUX_CTRL_TAG_LATENCY_SHIFT; - } - - of_property_read_u32_array(np, "arm,data-latency", - data, ARRAY_SIZE(data)); - if (data[0] && data[1]) { - mask |= L2X0_AUX_CTRL_DATA_RD_LATENCY_MASK | - L2X0_AUX_CTRL_DATA_WR_LATENCY_MASK; - val |= ((data[0] - 1) << L2X0_AUX_CTRL_DATA_RD_LATENCY_SHIFT) | - ((data[1] - 1) << L2X0_AUX_CTRL_DATA_WR_LATENCY_SHIFT); - } - - of_property_read_u32(np, "arm,dirty-latency", &dirty); - if (dirty) { - mask |= L2X0_AUX_CTRL_DIRTY_LATENCY_MASK; - val |= (dirty - 1) << L2X0_AUX_CTRL_DIRTY_LATENCY_SHIFT; - } - - *aux_val &= ~mask; - *aux_val |= val; - *aux_mask &= ~mask; -} - -static void __init pl310_of_setup(const struct device_node *np, - u32 *aux_val, u32 *aux_mask) -{ - u32 data[3] = { 0, 0, 0 }; - u32 tag[3] = { 0, 0, 0 }; - u32 filter[2] = { 0, 0 }; - - of_property_read_u32_array(np, "arm,tag-latency", tag, ARRAY_SIZE(tag)); - if (tag[0] && tag[1] && tag[2]) - writel_relaxed( - ((tag[0] - 1) << L2X0_LATENCY_CTRL_RD_SHIFT) | - ((tag[1] - 1) << L2X0_LATENCY_CTRL_WR_SHIFT) | - ((tag[2] - 1) << L2X0_LATENCY_CTRL_SETUP_SHIFT), - l2x0_base + L2X0_TAG_LATENCY_CTRL); - - of_property_read_u32_array(np, "arm,data-latency", - data, ARRAY_SIZE(data)); - if (data[0] && data[1] && data[2]) - writel_relaxed( - ((data[0] - 1) << L2X0_LATENCY_CTRL_RD_SHIFT) | - ((data[1] - 1) << L2X0_LATENCY_CTRL_WR_SHIFT) | - ((data[2] - 1) << L2X0_LATENCY_CTRL_SETUP_SHIFT), - l2x0_base + L2X0_DATA_LATENCY_CTRL); - - of_property_read_u32_array(np, "arm,filter-ranges", - filter, ARRAY_SIZE(filter)); - if (filter[1]) { - writel_relaxed(ALIGN(filter[0] + filter[1], SZ_1M), - l2x0_base + L2X0_ADDR_FILTER_END); - writel_relaxed((filter[0] & ~(SZ_1M - 1)) | L2X0_ADDR_FILTER_EN, - l2x0_base + L2X0_ADDR_FILTER_START); - } -} - -static void __init pl310_save(void) -{ - u32 l2x0_revision = readl_relaxed(l2x0_base + L2X0_CACHE_ID) & - L2X0_CACHE_ID_RTL_MASK; - - l2x0_saved_regs.tag_latency = readl_relaxed(l2x0_base + - L2X0_TAG_LATENCY_CTRL); - l2x0_saved_regs.data_latency = readl_relaxed(l2x0_base + - L2X0_DATA_LATENCY_CTRL); - l2x0_saved_regs.filter_end = readl_relaxed(l2x0_base + - L2X0_ADDR_FILTER_END); - l2x0_saved_regs.filter_start = readl_relaxed(l2x0_base + - L2X0_ADDR_FILTER_START); - - if (l2x0_revision >= L2X0_CACHE_ID_RTL_R2P0) { - /* - * From r2p0, there is Prefetch offset/control register - */ - l2x0_saved_regs.prefetch_ctrl = readl_relaxed(l2x0_base + - L2X0_PREFETCH_CTRL); - /* - * From r3p0, there is Power control register - */ - if (l2x0_revision >= L2X0_CACHE_ID_RTL_R3P0) - l2x0_saved_regs.pwr_ctrl = readl_relaxed(l2x0_base + - L2X0_POWER_CTRL); - } -} +/* Broadcom L2C-310 start from ARMs R3P2 or later, and require no fixups */ +static const struct l2c_init_data of_bcm_l2x0_data __initconst = { + .type = "BCM-L2C-310", + .way_size_0 = SZ_8K, + .num_lock = 8, + .of_parse = l2c310_of_parse, + .enable = l2c310_enable, + .save = l2c310_save, + .outer_cache = { + .inv_range = bcm_inv_range, + .clean_range = bcm_clean_range, + .flush_range = bcm_flush_range, + .flush_all = l2c210_flush_all, + .disable = l2c310_disable, + .sync = l2c210_sync, + .resume = l2c310_resume, + }, +}; -static void aurora_save(void) +static void __init tauros3_save(void __iomem *base) { - l2x0_saved_regs.ctrl = readl_relaxed(l2x0_base + L2X0_CTRL); - l2x0_saved_regs.aux_ctrl = readl_relaxed(l2x0_base + L2X0_AUX_CTRL); -} + l2c_save(base); -static void __init tauros3_save(void) -{ l2x0_saved_regs.aux2_ctrl = - readl_relaxed(l2x0_base + TAUROS3_AUX2_CTRL); + readl_relaxed(base + TAUROS3_AUX2_CTRL); l2x0_saved_regs.prefetch_ctrl = - readl_relaxed(l2x0_base + L2X0_PREFETCH_CTRL); -} - -static void l2x0_resume(void) -{ - if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) { - /* restore aux ctrl and enable l2 */ - l2x0_unlock(readl_relaxed(l2x0_base + L2X0_CACHE_ID)); - - writel_relaxed(l2x0_saved_regs.aux_ctrl, l2x0_base + - L2X0_AUX_CTRL); - - l2x0_inv_all(); - - writel_relaxed(L2X0_CTRL_EN, l2x0_base + L2X0_CTRL); - } -} - -static void pl310_resume(void) -{ - u32 l2x0_revision; - - if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) { - /* restore pl310 setup */ - writel_relaxed(l2x0_saved_regs.tag_latency, - l2x0_base + L2X0_TAG_LATENCY_CTRL); - writel_relaxed(l2x0_saved_regs.data_latency, - l2x0_base + L2X0_DATA_LATENCY_CTRL); - writel_relaxed(l2x0_saved_regs.filter_end, - l2x0_base + L2X0_ADDR_FILTER_END); - writel_relaxed(l2x0_saved_regs.filter_start, - l2x0_base + L2X0_ADDR_FILTER_START); - - l2x0_revision = readl_relaxed(l2x0_base + L2X0_CACHE_ID) & - L2X0_CACHE_ID_RTL_MASK; - - if (l2x0_revision >= L2X0_CACHE_ID_RTL_R2P0) { - writel_relaxed(l2x0_saved_regs.prefetch_ctrl, - l2x0_base + L2X0_PREFETCH_CTRL); - if (l2x0_revision >= L2X0_CACHE_ID_RTL_R3P0) - writel_relaxed(l2x0_saved_regs.pwr_ctrl, - l2x0_base + L2X0_POWER_CTRL); - } - } - - l2x0_resume(); -} - -static void aurora_resume(void) -{ - if (!(readl(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) { - writel_relaxed(l2x0_saved_regs.aux_ctrl, - l2x0_base + L2X0_AUX_CTRL); - writel_relaxed(l2x0_saved_regs.ctrl, l2x0_base + L2X0_CTRL); - } + readl_relaxed(base + L310_PREFETCH_CTRL); } static void tauros3_resume(void) { - if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) { + void __iomem *base = l2x0_base; + + if (!(readl_relaxed(base + L2X0_CTRL) & L2X0_CTRL_EN)) { writel_relaxed(l2x0_saved_regs.aux2_ctrl, - l2x0_base + TAUROS3_AUX2_CTRL); + base + TAUROS3_AUX2_CTRL); writel_relaxed(l2x0_saved_regs.prefetch_ctrl, - l2x0_base + L2X0_PREFETCH_CTRL); - } + base + L310_PREFETCH_CTRL); - l2x0_resume(); -} - -static void __init aurora_broadcast_l2_commands(void) -{ - __u32 u; - /* Enable Broadcasting of cache commands to L2*/ - __asm__ __volatile__("mrc p15, 1, %0, c15, c2, 0" : "=r"(u)); - u |= AURORA_CTRL_FW; /* Set the FW bit */ - __asm__ __volatile__("mcr p15, 1, %0, c15, c2, 0\n" : : "r"(u)); - isb(); -} - -static void __init aurora_of_setup(const struct device_node *np, - u32 *aux_val, u32 *aux_mask) -{ - u32 val = AURORA_ACR_REPLACEMENT_TYPE_SEMIPLRU; - u32 mask = AURORA_ACR_REPLACEMENT_MASK; - - of_property_read_u32(np, "cache-id-part", - &cache_id_part_number_from_dt); - - /* Determine and save the write policy */ - l2_wt_override = of_property_read_bool(np, "wt-override"); - - if (l2_wt_override) { - val |= AURORA_ACR_FORCE_WRITE_THRO_POLICY; - mask |= AURORA_ACR_FORCE_WRITE_POLICY_MASK; + l2c_enable(base, l2x0_saved_regs.aux_ctrl, 8); } - - *aux_val &= ~mask; - *aux_val |= val; - *aux_mask &= ~mask; } -static const struct l2x0_of_data pl310_data = { - .setup = pl310_of_setup, - .save = pl310_save, - .outer_cache = { - .resume = pl310_resume, - .inv_range = l2x0_inv_range, - .clean_range = l2x0_clean_range, - .flush_range = l2x0_flush_range, - .sync = l2x0_cache_sync, - .flush_all = l2x0_flush_all, - .inv_all = l2x0_inv_all, - .disable = l2x0_disable, - }, -}; - -static const struct l2x0_of_data l2x0_data = { - .setup = l2x0_of_setup, - .save = NULL, - .outer_cache = { - .resume = l2x0_resume, - .inv_range = l2x0_inv_range, - .clean_range = l2x0_clean_range, - .flush_range = l2x0_flush_range, - .sync = l2x0_cache_sync, - .flush_all = l2x0_flush_all, - .inv_all = l2x0_inv_all, - .disable = l2x0_disable, - }, -}; - -static const struct l2x0_of_data aurora_with_outer_data = { - .setup = aurora_of_setup, - .save = aurora_save, - .outer_cache = { - .resume = aurora_resume, - .inv_range = aurora_inv_range, - .clean_range = aurora_clean_range, - .flush_range = aurora_flush_range, - .sync = l2x0_cache_sync, - .flush_all = l2x0_flush_all, - .inv_all = l2x0_inv_all, - .disable = l2x0_disable, - }, -}; - -static const struct l2x0_of_data aurora_no_outer_data = { - .setup = aurora_of_setup, - .save = aurora_save, - .outer_cache = { - .resume = aurora_resume, - }, -}; - -static const struct l2x0_of_data tauros3_data = { - .setup = NULL, +static const struct l2c_init_data of_tauros3_data __initconst = { + .type = "Tauros3", + .way_size_0 = SZ_8K, + .num_lock = 8, + .enable = l2c_enable, .save = tauros3_save, /* Tauros3 broadcasts L1 cache operations to L2 */ .outer_cache = { @@ -936,43 +1451,26 @@ static const struct l2x0_of_data tauros3_data = { }, }; -static const struct l2x0_of_data bcm_l2x0_data = { - .setup = pl310_of_setup, - .save = pl310_save, - .outer_cache = { - .resume = pl310_resume, - .inv_range = bcm_inv_range, - .clean_range = bcm_clean_range, - .flush_range = bcm_flush_range, - .sync = l2x0_cache_sync, - .flush_all = l2x0_flush_all, - .inv_all = l2x0_inv_all, - .disable = l2x0_disable, - }, -}; - +#define L2C_ID(name, fns) { .compatible = name, .data = (void *)&fns } static const struct of_device_id l2x0_ids[] __initconst = { - { .compatible = "arm,l210-cache", .data = (void *)&l2x0_data }, - { .compatible = "arm,l220-cache", .data = (void *)&l2x0_data }, - { .compatible = "arm,pl310-cache", .data = (void *)&pl310_data }, - { .compatible = "bcm,bcm11351-a2-pl310-cache", /* deprecated name */ - .data = (void *)&bcm_l2x0_data}, - { .compatible = "brcm,bcm11351-a2-pl310-cache", - .data = (void *)&bcm_l2x0_data}, - { .compatible = "marvell,aurora-outer-cache", - .data = (void *)&aurora_with_outer_data}, - { .compatible = "marvell,aurora-system-cache", - .data = (void *)&aurora_no_outer_data}, - { .compatible = "marvell,tauros3-cache", - .data = (void *)&tauros3_data }, + L2C_ID("arm,l210-cache", of_l2c210_data), + L2C_ID("arm,l220-cache", of_l2c220_data), + L2C_ID("arm,pl310-cache", of_l2c310_data), + L2C_ID("brcm,bcm11351-a2-pl310-cache", of_bcm_l2x0_data), + L2C_ID("marvell,aurora-outer-cache", of_aurora_with_outer_data), + L2C_ID("marvell,aurora-system-cache", of_aurora_no_outer_data), + L2C_ID("marvell,tauros3-cache", of_tauros3_data), + /* Deprecated IDs */ + L2C_ID("bcm,bcm11351-a2-pl310-cache", of_bcm_l2x0_data), {} }; int __init l2x0_of_init(u32 aux_val, u32 aux_mask) { + const struct l2c_init_data *data; struct device_node *np; - const struct l2x0_of_data *data; struct resource res; + u32 cache_id, old_aux; np = of_find_matching_node(NULL, l2x0_ids); if (!np) @@ -989,23 +1487,29 @@ int __init l2x0_of_init(u32 aux_val, u32 aux_mask) data = of_match_node(l2x0_ids, np)->data; - /* L2 configuration can only be changed if the cache is disabled */ - if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) { - if (data->setup) - data->setup(np, &aux_val, &aux_mask); - - /* For aurora cache in no outer mode select the - * correct mode using the coprocessor*/ - if (data == &aurora_no_outer_data) - aurora_broadcast_l2_commands(); + old_aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL); + if (old_aux != ((old_aux & aux_mask) | aux_val)) { + pr_warn("L2C: platform modifies aux control register: 0x%08x -> 0x%08x\n", + old_aux, (old_aux & aux_mask) | aux_val); + } else if (aux_mask != ~0U && aux_val != 0) { + pr_alert("L2C: platform provided aux values match the hardware, so have no effect. Please remove them.\n"); } - if (data->save) - data->save(); + /* All L2 caches are unified, so this property should be specified */ + if (!of_property_read_bool(np, "cache-unified")) + pr_err("L2C: device tree omits to specify unified cache\n"); + + /* L2 configuration can only be changed if the cache is disabled */ + if (!(readl_relaxed(l2x0_base + L2X0_CTRL) & L2X0_CTRL_EN)) + if (data->of_parse) + data->of_parse(np, &aux_val, &aux_mask); + + if (cache_id_part_number_from_dt) + cache_id = cache_id_part_number_from_dt; + else + cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID); - of_init = true; - memcpy(&outer_cache, &data->outer_cache, sizeof(outer_cache)); - l2x0_init(l2x0_base, aux_val, aux_mask); + __l2c_init(data, aux_val, aux_mask, cache_id); return 0; } diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S index 778bcf88ee79..615c99e38ba1 100644 --- a/arch/arm/mm/cache-v7.S +++ b/arch/arm/mm/cache-v7.S @@ -59,7 +59,7 @@ ENTRY(v7_invalidate_l1) bgt 2b cmp r2, #0 bgt 1b - dsb + dsb st isb mov pc, lr ENDPROC(v7_invalidate_l1) @@ -166,7 +166,7 @@ skip: finished: mov r10, #0 @ swith back to cache level 0 mcr p15, 2, r10, c0, c0, 0 @ select current cache level in cssr - dsb + dsb st isb mov pc, lr ENDPROC(v7_flush_dcache_all) @@ -335,7 +335,7 @@ ENTRY(v7_flush_kern_dcache_area) add r0, r0, r2 cmp r0, r1 blo 1b - dsb + dsb st mov pc, lr ENDPROC(v7_flush_kern_dcache_area) @@ -368,7 +368,7 @@ v7_dma_inv_range: add r0, r0, r2 cmp r0, r1 blo 1b - dsb + dsb st mov pc, lr ENDPROC(v7_dma_inv_range) @@ -390,7 +390,7 @@ v7_dma_clean_range: add r0, r0, r2 cmp r0, r1 blo 1b - dsb + dsb st mov pc, lr ENDPROC(v7_dma_clean_range) @@ -412,7 +412,7 @@ ENTRY(v7_dma_flush_range) add r0, r0, r2 cmp r0, r1 blo 1b - dsb + dsb st mov pc, lr ENDPROC(v7_dma_flush_range) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 5bef858568e6..4c88935654ca 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -885,7 +885,7 @@ static void dma_cache_maint_page(struct page *page, unsigned long offset, static void __dma_page_cpu_to_dev(struct page *page, unsigned long off, size_t size, enum dma_data_direction dir) { - unsigned long paddr; + phys_addr_t paddr; dma_cache_maint_page(page, off, size, dir, dmac_map_area); @@ -901,14 +901,15 @@ static void __dma_page_cpu_to_dev(struct page *page, unsigned long off, static void __dma_page_dev_to_cpu(struct page *page, unsigned long off, size_t size, enum dma_data_direction dir) { - unsigned long paddr = page_to_phys(page) + off; + phys_addr_t paddr = page_to_phys(page) + off; /* FIXME: non-speculating: not required */ - /* don't bother invalidating if DMA to device */ - if (dir != DMA_TO_DEVICE) + /* in any case, don't bother invalidating if DMA to device */ + if (dir != DMA_TO_DEVICE) { outer_inv_range(paddr, paddr + size); - dma_cache_maint_page(page, off, size, dir, dmac_unmap_area); + dma_cache_maint_page(page, off, size, dir, dmac_unmap_area); + } /* * Mark the D-cache clean for these pages to avoid extra flushing. diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 3387e60e4ea3..43d54f5b26b9 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -104,17 +104,20 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsig #define flush_icache_alias(pfn,vaddr,len) do { } while (0) #endif +#define FLAG_PA_IS_EXEC 1 +#define FLAG_PA_CORE_IN_MM 2 + static void flush_ptrace_access_other(void *args) { __flush_icache_all(); } -static -void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, - unsigned long uaddr, void *kaddr, unsigned long len) +static inline +void __flush_ptrace_access(struct page *page, unsigned long uaddr, void *kaddr, + unsigned long len, unsigned int flags) { if (cache_is_vivt()) { - if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { + if (flags & FLAG_PA_CORE_IN_MM) { unsigned long addr = (unsigned long)kaddr; __cpuc_coherent_kern_range(addr, addr + len); } @@ -128,7 +131,7 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, } /* VIPT non-aliasing D-cache */ - if (vma->vm_flags & VM_EXEC) { + if (flags & FLAG_PA_IS_EXEC) { unsigned long addr = (unsigned long)kaddr; if (icache_is_vipt_aliasing()) flush_icache_alias(page_to_pfn(page), uaddr, len); @@ -140,6 +143,26 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, } } +static +void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, + unsigned long uaddr, void *kaddr, unsigned long len) +{ + unsigned int flags = 0; + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) + flags |= FLAG_PA_CORE_IN_MM; + if (vma->vm_flags & VM_EXEC) + flags |= FLAG_PA_IS_EXEC; + __flush_ptrace_access(page, uaddr, kaddr, len, flags); +} + +void flush_uprobe_xol_access(struct page *page, unsigned long uaddr, + void *kaddr, unsigned long len) +{ + unsigned int flags = FLAG_PA_CORE_IN_MM|FLAG_PA_IS_EXEC; + + __flush_ptrace_access(page, uaddr, kaddr, len, flags); +} + /* * Copy user data from/to a page which is mapped into a different * processes address space. Really, we want to allow our "user diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 21b9e1bf9b77..45aeaaca9052 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -18,6 +18,21 @@ #include <asm/tlbflush.h> #include "mm.h" +pte_t *fixmap_page_table; + +static inline void set_fixmap_pte(int idx, pte_t pte) +{ + unsigned long vaddr = __fix_to_virt(idx); + set_pte_ext(fixmap_page_table + idx, pte, 0); + local_flush_tlb_kernel_page(vaddr); +} + +static inline pte_t get_fixmap_pte(unsigned long vaddr) +{ + unsigned long idx = __virt_to_fix(vaddr); + return *(fixmap_page_table + idx); +} + void *kmap(struct page *page) { might_sleep(); @@ -63,20 +78,20 @@ void *kmap_atomic(struct page *page) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + vaddr = __fix_to_virt(idx); #ifdef CONFIG_DEBUG_HIGHMEM /* * With debugging enabled, kunmap_atomic forces that entry to 0. * Make sure it was indeed properly unmapped. */ - BUG_ON(!pte_none(get_top_pte(vaddr))); + BUG_ON(!pte_none(*(fixmap_page_table + idx))); #endif /* * When debugging is off, kunmap_atomic leaves the previous mapping * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. */ - set_top_pte(vaddr, mk_pte(page, kmap_prot)); + set_fixmap_pte(idx, mk_pte(page, kmap_prot)); return (void *)vaddr; } @@ -94,8 +109,8 @@ void __kunmap_atomic(void *kvaddr) if (cache_is_vivt()) __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); #ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - set_top_pte(vaddr, __pte(0)); + BUG_ON(vaddr != __fix_to_virt(idx)); + set_fixmap_pte(idx, __pte(0)); #else (void) idx; /* to kill a warning */ #endif @@ -117,11 +132,11 @@ void *kmap_atomic_pfn(unsigned long pfn) type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + vaddr = __fix_to_virt(idx); #ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(get_top_pte(vaddr))); + BUG_ON(!pte_none(*(fixmap_page_table + idx))); #endif - set_top_pte(vaddr, pfn_pte(pfn, kmap_prot)); + set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); return (void *)vaddr; } @@ -133,5 +148,5 @@ struct page *kmap_atomic_to_page(const void *ptr) if (vaddr < FIXADDR_START) return virt_to_page(ptr); - return pte_page(get_top_pte(vaddr)); + return pte_page(get_fixmap_pte(vaddr)); } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 928d596d9ab4..659c75d808dc 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -23,6 +23,7 @@ #include <linux/dma-contiguous.h> #include <linux/sizes.h> +#include <asm/cp15.h> #include <asm/mach-types.h> #include <asm/memblock.h> #include <asm/prom.h> @@ -36,6 +37,14 @@ #include "mm.h" +#ifdef CONFIG_CPU_CP15_MMU +unsigned long __init __clear_cr(unsigned long mask) +{ + cr_alignment = cr_alignment & ~mask; + return cr_alignment; +} +#endif + static phys_addr_t phys_initrd_start __initdata = 0; static unsigned long phys_initrd_size __initdata = 0; @@ -81,24 +90,21 @@ __tagtable(ATAG_INITRD2, parse_tag_initrd2); * initialization functions, as well as show_mem() for the skipping * of holes in the memory map. It is populated by arm_add_memory(). */ -struct meminfo meminfo; - void show_mem(unsigned int filter) { int free = 0, total = 0, reserved = 0; - int shared = 0, cached = 0, slab = 0, i; - struct meminfo * mi = &meminfo; + int shared = 0, cached = 0, slab = 0; + struct memblock_region *reg; printk("Mem-info:\n"); show_free_areas(filter); - for_each_bank (i, mi) { - struct membank *bank = &mi->bank[i]; + for_each_memblock (memory, reg) { unsigned int pfn1, pfn2; struct page *page, *end; - pfn1 = bank_pfn_start(bank); - pfn2 = bank_pfn_end(bank); + pfn1 = memblock_region_memory_base_pfn(reg); + pfn2 = memblock_region_memory_end_pfn(reg); page = pfn_to_page(pfn1); end = pfn_to_page(pfn2 - 1) + 1; @@ -115,8 +121,9 @@ void show_mem(unsigned int filter) free++; else shared += page_count(page) - 1; - page++; - } while (page < end); + pfn1++; + page = pfn_to_page(pfn1); + } while (pfn1 < pfn2); } printk("%d pages of RAM\n", total); @@ -130,16 +137,9 @@ void show_mem(unsigned int filter) static void __init find_limits(unsigned long *min, unsigned long *max_low, unsigned long *max_high) { - struct meminfo *mi = &meminfo; - int i; - - /* This assumes the meminfo array is properly sorted */ - *min = bank_pfn_start(&mi->bank[0]); - for_each_bank (i, mi) - if (mi->bank[i].highmem) - break; - *max_low = bank_pfn_end(&mi->bank[i - 1]); - *max_high = bank_pfn_end(&mi->bank[mi->nr_banks - 1]); + *max_low = PFN_DOWN(memblock_get_current_limit()); + *min = PFN_UP(memblock_start_of_DRAM()); + *max_high = PFN_DOWN(memblock_end_of_DRAM()); } #ifdef CONFIG_ZONE_DMA @@ -274,14 +274,8 @@ phys_addr_t __init arm_memblock_steal(phys_addr_t size, phys_addr_t align) return phys; } -void __init arm_memblock_init(struct meminfo *mi, - const struct machine_desc *mdesc) +void __init arm_memblock_init(const struct machine_desc *mdesc) { - int i; - - for (i = 0; i < mi->nr_banks; i++) - memblock_add(mi->bank[i].start, mi->bank[i].size); - /* Register the kernel text, kernel data and initrd with memblock. */ #ifdef CONFIG_XIP_KERNEL memblock_reserve(__pa(_sdata), _end - _sdata); @@ -412,54 +406,53 @@ free_memmap(unsigned long start_pfn, unsigned long end_pfn) /* * The mem_map array can get very big. Free the unused area of the memory map. */ -static void __init free_unused_memmap(struct meminfo *mi) +static void __init free_unused_memmap(void) { - unsigned long bank_start, prev_bank_end = 0; - unsigned int i; + unsigned long start, prev_end = 0; + struct memblock_region *reg; /* * This relies on each bank being in address order. * The banks are sorted previously in bootmem_init(). */ - for_each_bank(i, mi) { - struct membank *bank = &mi->bank[i]; - - bank_start = bank_pfn_start(bank); + for_each_memblock(memory, reg) { + start = memblock_region_memory_base_pfn(reg); #ifdef CONFIG_SPARSEMEM /* * Take care not to free memmap entries that don't exist * due to SPARSEMEM sections which aren't present. */ - bank_start = min(bank_start, - ALIGN(prev_bank_end, PAGES_PER_SECTION)); + start = min(start, + ALIGN(prev_end, PAGES_PER_SECTION)); #else /* * Align down here since the VM subsystem insists that the * memmap entries are valid from the bank start aligned to * MAX_ORDER_NR_PAGES. */ - bank_start = round_down(bank_start, MAX_ORDER_NR_PAGES); + start = round_down(start, MAX_ORDER_NR_PAGES); #endif /* * If we had a previous bank, and there is a space * between the current bank and the previous, free it. */ - if (prev_bank_end && prev_bank_end < bank_start) - free_memmap(prev_bank_end, bank_start); + if (prev_end && prev_end < start) + free_memmap(prev_end, start); /* * Align up here since the VM subsystem insists that the * memmap entries are valid from the bank end aligned to * MAX_ORDER_NR_PAGES. */ - prev_bank_end = ALIGN(bank_pfn_end(bank), MAX_ORDER_NR_PAGES); + prev_end = ALIGN(memblock_region_memory_end_pfn(reg), + MAX_ORDER_NR_PAGES); } #ifdef CONFIG_SPARSEMEM - if (!IS_ALIGNED(prev_bank_end, PAGES_PER_SECTION)) - free_memmap(prev_bank_end, - ALIGN(prev_bank_end, PAGES_PER_SECTION)); + if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) + free_memmap(prev_end, + ALIGN(prev_end, PAGES_PER_SECTION)); #endif } @@ -535,7 +528,7 @@ void __init mem_init(void) set_max_mapnr(pfn_to_page(max_pfn) - mem_map); /* this will put all unused low memory onto the freelists */ - free_unused_memmap(&meminfo); + free_unused_memmap(); free_all_bootmem(); #ifdef CONFIG_SA1111 diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index f9c32ba73544..d1e5ad7ab3bc 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -438,6 +438,13 @@ void __arm_iounmap(volatile void __iomem *io_addr) EXPORT_SYMBOL(__arm_iounmap); #ifdef CONFIG_PCI +static int pci_ioremap_mem_type = MT_DEVICE; + +void pci_ioremap_set_mem_type(int mem_type) +{ + pci_ioremap_mem_type = mem_type; +} + int pci_ioremap_io(unsigned int offset, phys_addr_t phys_addr) { BUG_ON(offset + SZ_64K > IO_SPACE_LIMIT); @@ -445,7 +452,7 @@ int pci_ioremap_io(unsigned int offset, phys_addr_t phys_addr) return ioremap_page_range(PCI_IO_VIRT_BASE + offset, PCI_IO_VIRT_BASE + offset + SZ_64K, phys_addr, - __pgprot(get_mem_type(MT_DEVICE)->prot_pte)); + __pgprot(get_mem_type(pci_ioremap_mem_type)->prot_pte)); } EXPORT_SYMBOL_GPL(pci_ioremap_io); #endif diff --git a/arch/arm/mm/l2c-common.c b/arch/arm/mm/l2c-common.c new file mode 100644 index 000000000000..10a3cf28c362 --- /dev/null +++ b/arch/arm/mm/l2c-common.c @@ -0,0 +1,20 @@ +/* + * Copyright (C) 2010 ARM Ltd. + * Written by Catalin Marinas <catalin.marinas@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/bug.h> +#include <linux/smp.h> +#include <asm/outercache.h> + +void outer_disable(void) +{ + WARN_ON(!irqs_disabled()); + WARN_ON(num_online_cpus() > 1); + + if (outer_cache.disable) + outer_cache.disable(); +} diff --git a/arch/arm/mm/l2c-l2x0-resume.S b/arch/arm/mm/l2c-l2x0-resume.S new file mode 100644 index 000000000000..99b05f21a59a --- /dev/null +++ b/arch/arm/mm/l2c-l2x0-resume.S @@ -0,0 +1,58 @@ +/* + * L2C-310 early resume code. This can be used by platforms to restore + * the settings of their L2 cache controller before restoring the + * processor state. + * + * This code can only be used to if you are running in the secure world. + */ +#include <linux/linkage.h> +#include <asm/hardware/cache-l2x0.h> + + .text + +ENTRY(l2c310_early_resume) + adr r0, 1f + ldr r2, [r0] + add r0, r2, r0 + + ldmia r0, {r1, r2, r3, r4, r5, r6, r7, r8} + @ r1 = phys address of L2C-310 controller + @ r2 = aux_ctrl + @ r3 = tag_latency + @ r4 = data_latency + @ r5 = filter_start + @ r6 = filter_end + @ r7 = prefetch_ctrl + @ r8 = pwr_ctrl + + @ Check that the address has been initialised + teq r1, #0 + moveq pc, lr + + @ The prefetch and power control registers are revision dependent + @ and can be written whether or not the L2 cache is enabled + ldr r0, [r1, #L2X0_CACHE_ID] + and r0, r0, #L2X0_CACHE_ID_RTL_MASK + cmp r0, #L310_CACHE_ID_RTL_R2P0 + strcs r7, [r1, #L310_PREFETCH_CTRL] + cmp r0, #L310_CACHE_ID_RTL_R3P0 + strcs r8, [r1, #L310_POWER_CTRL] + + @ Don't setup the L2 cache if it is already enabled + ldr r0, [r1, #L2X0_CTRL] + tst r0, #L2X0_CTRL_EN + movne pc, lr + + str r3, [r1, #L310_TAG_LATENCY_CTRL] + str r4, [r1, #L310_DATA_LATENCY_CTRL] + str r6, [r1, #L310_ADDR_FILTER_END] + str r5, [r1, #L310_ADDR_FILTER_START] + + str r2, [r1, #L2X0_AUX_CTRL] + mov r9, #L2X0_CTRL_EN + str r9, [r1, #L2X0_CTRL] + mov pc, lr +ENDPROC(l2c310_early_resume) + + .align +1: .long l2x0_saved_regs - . diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h index 7ea641b7aa7d..ce727d47275c 100644 --- a/arch/arm/mm/mm.h +++ b/arch/arm/mm/mm.h @@ -2,6 +2,8 @@ #include <linux/list.h> #include <linux/vmalloc.h> +#include <asm/pgtable.h> + /* the upper-most page table pointer */ extern pmd_t *top_pmd; @@ -93,3 +95,5 @@ extern phys_addr_t arm_lowmem_limit; void __init bootmem_init(void); void arm_mm_memblock_reserve(void); void dma_contiguous_remap(void); + +unsigned long __clear_cr(unsigned long mask); diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index b68c6b22e1c8..ab14b79b03f0 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -35,6 +35,7 @@ #include <asm/mach/arch.h> #include <asm/mach/map.h> #include <asm/mach/pci.h> +#include <asm/fixmap.h> #include "mm.h" #include "tcm.h" @@ -117,28 +118,54 @@ static struct cachepolicy cache_policies[] __initdata = { }; #ifdef CONFIG_CPU_CP15 +static unsigned long initial_pmd_value __initdata = 0; + /* - * These are useful for identifying cache coherency - * problems by allowing the cache or the cache and - * writebuffer to be turned off. (Note: the write - * buffer should not be on and the cache off). + * Initialise the cache_policy variable with the initial state specified + * via the "pmd" value. This is used to ensure that on ARMv6 and later, + * the C code sets the page tables up with the same policy as the head + * assembly code, which avoids an illegal state where the TLBs can get + * confused. See comments in early_cachepolicy() for more information. */ -static int __init early_cachepolicy(char *p) +void __init init_default_cache_policy(unsigned long pmd) { int i; + initial_pmd_value = pmd; + + pmd &= PMD_SECT_TEX(1) | PMD_SECT_BUFFERABLE | PMD_SECT_CACHEABLE; + + for (i = 0; i < ARRAY_SIZE(cache_policies); i++) + if (cache_policies[i].pmd == pmd) { + cachepolicy = i; + break; + } + + if (i == ARRAY_SIZE(cache_policies)) + pr_err("ERROR: could not find cache policy\n"); +} + +/* + * These are useful for identifying cache coherency problems by allowing + * the cache or the cache and writebuffer to be turned off. (Note: the + * write buffer should not be on and the cache off). + */ +static int __init early_cachepolicy(char *p) +{ + int i, selected = -1; + for (i = 0; i < ARRAY_SIZE(cache_policies); i++) { int len = strlen(cache_policies[i].policy); if (memcmp(p, cache_policies[i].policy, len) == 0) { - cachepolicy = i; - cr_alignment &= ~cache_policies[i].cr_mask; - cr_no_alignment &= ~cache_policies[i].cr_mask; + selected = i; break; } } - if (i == ARRAY_SIZE(cache_policies)) - printk(KERN_ERR "ERROR: unknown or unsupported cache policy\n"); + + if (selected == -1) + pr_err("ERROR: unknown or unsupported cache policy\n"); + /* * This restriction is partly to do with the way we boot; it is * unpredictable to have memory mapped using two different sets of @@ -146,12 +173,18 @@ static int __init early_cachepolicy(char *p) * change these attributes once the initial assembly has setup the * page tables. */ - if (cpu_architecture() >= CPU_ARCH_ARMv6) { - printk(KERN_WARNING "Only cachepolicy=writeback supported on ARMv6 and later\n"); - cachepolicy = CPOLICY_WRITEBACK; + if (cpu_architecture() >= CPU_ARCH_ARMv6 && selected != cachepolicy) { + pr_warn("Only cachepolicy=%s supported on ARMv6 and later\n", + cache_policies[cachepolicy].policy); + return 0; + } + + if (selected != cachepolicy) { + unsigned long cr = __clear_cr(cache_policies[selected].cr_mask); + cachepolicy = selected; + flush_cache_all(); + set_cr(cr); } - flush_cache_all(); - set_cr(cr_alignment); return 0; } early_param("cachepolicy", early_cachepolicy); @@ -186,35 +219,6 @@ static int __init early_ecc(char *p) early_param("ecc", early_ecc); #endif -static int __init noalign_setup(char *__unused) -{ - cr_alignment &= ~CR_A; - cr_no_alignment &= ~CR_A; - set_cr(cr_alignment); - return 1; -} -__setup("noalign", noalign_setup); - -#ifndef CONFIG_SMP -void adjust_cr(unsigned long mask, unsigned long set) -{ - unsigned long flags; - - mask &= ~CR_A; - - set &= mask; - - local_irq_save(flags); - - cr_no_alignment = (cr_no_alignment & ~mask) | set; - cr_alignment = (cr_alignment & ~mask) | set; - - set_cr((get_cr() & ~mask) | set); - - local_irq_restore(flags); -} -#endif - #else /* ifdef CONFIG_CPU_CP15 */ static int __init early_cachepolicy(char *p) @@ -414,8 +418,17 @@ static void __init build_mem_type_table(void) cachepolicy = CPOLICY_WRITEBACK; ecc_mask = 0; } - if (is_smp()) - cachepolicy = CPOLICY_WRITEALLOC; + + if (is_smp()) { + if (cachepolicy != CPOLICY_WRITEALLOC) { + pr_warn("Forcing write-allocate cache policy for SMP\n"); + cachepolicy = CPOLICY_WRITEALLOC; + } + if (!(initial_pmd_value & PMD_SECT_S)) { + pr_warn("Forcing shared mappings for SMP\n"); + initial_pmd_value |= PMD_SECT_S; + } + } /* * Strip out features not present on earlier architectures. @@ -539,11 +552,12 @@ static void __init build_mem_type_table(void) mem_types[MT_CACHECLEAN].prot_sect |= PMD_SECT_APX|PMD_SECT_AP_WRITE; #endif - if (is_smp()) { - /* - * Mark memory with the "shared" attribute - * for SMP systems - */ + /* + * If the initial page tables were created with the S bit + * set, then we need to do the same here for the same + * reasons given in early_cachepolicy(). + */ + if (initial_pmd_value & PMD_SECT_S) { user_pgprot |= L_PTE_SHARED; kern_pgprot |= L_PTE_SHARED; vecs_pgprot |= L_PTE_SHARED; @@ -1061,74 +1075,47 @@ phys_addr_t arm_lowmem_limit __initdata = 0; void __init sanity_check_meminfo(void) { phys_addr_t memblock_limit = 0; - int i, j, highmem = 0; + int highmem = 0; phys_addr_t vmalloc_limit = __pa(vmalloc_min - 1) + 1; + struct memblock_region *reg; - for (i = 0, j = 0; i < meminfo.nr_banks; i++) { - struct membank *bank = &meminfo.bank[j]; - phys_addr_t size_limit; - - *bank = meminfo.bank[i]; - size_limit = bank->size; + for_each_memblock(memory, reg) { + phys_addr_t block_start = reg->base; + phys_addr_t block_end = reg->base + reg->size; + phys_addr_t size_limit = reg->size; - if (bank->start >= vmalloc_limit) + if (reg->base >= vmalloc_limit) highmem = 1; else - size_limit = vmalloc_limit - bank->start; + size_limit = vmalloc_limit - reg->base; - bank->highmem = highmem; -#ifdef CONFIG_HIGHMEM - /* - * Split those memory banks which are partially overlapping - * the vmalloc area greatly simplifying things later. - */ - if (!highmem && bank->size > size_limit) { - if (meminfo.nr_banks >= NR_BANKS) { - printk(KERN_CRIT "NR_BANKS too low, " - "ignoring high memory\n"); - } else { - memmove(bank + 1, bank, - (meminfo.nr_banks - i) * sizeof(*bank)); - meminfo.nr_banks++; - i++; - bank[1].size -= size_limit; - bank[1].start = vmalloc_limit; - bank[1].highmem = highmem = 1; - j++; + if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) { + + if (highmem) { + pr_notice("Ignoring RAM at %pa-%pa (!CONFIG_HIGHMEM)\n", + &block_start, &block_end); + memblock_remove(reg->base, reg->size); + continue; } - bank->size = size_limit; - } -#else - /* - * Highmem banks not allowed with !CONFIG_HIGHMEM. - */ - if (highmem) { - printk(KERN_NOTICE "Ignoring RAM at %.8llx-%.8llx " - "(!CONFIG_HIGHMEM).\n", - (unsigned long long)bank->start, - (unsigned long long)bank->start + bank->size - 1); - continue; - } - /* - * Check whether this memory bank would partially overlap - * the vmalloc area. - */ - if (bank->size > size_limit) { - printk(KERN_NOTICE "Truncating RAM at %.8llx-%.8llx " - "to -%.8llx (vmalloc region overlap).\n", - (unsigned long long)bank->start, - (unsigned long long)bank->start + bank->size - 1, - (unsigned long long)bank->start + size_limit - 1); - bank->size = size_limit; + if (reg->size > size_limit) { + phys_addr_t overlap_size = reg->size - size_limit; + + pr_notice("Truncating RAM at %pa-%pa to -%pa", + &block_start, &block_end, &vmalloc_limit); + memblock_remove(vmalloc_limit, overlap_size); + block_end = vmalloc_limit; + } } -#endif - if (!bank->highmem) { - phys_addr_t bank_end = bank->start + bank->size; - if (bank_end > arm_lowmem_limit) - arm_lowmem_limit = bank_end; + if (!highmem) { + if (block_end > arm_lowmem_limit) { + if (reg->size > size_limit) + arm_lowmem_limit = vmalloc_limit; + else + arm_lowmem_limit = block_end; + } /* * Find the first non-section-aligned page, and point @@ -1144,35 +1131,15 @@ void __init sanity_check_meminfo(void) * occurs before any free memory is mapped. */ if (!memblock_limit) { - if (!IS_ALIGNED(bank->start, SECTION_SIZE)) - memblock_limit = bank->start; - else if (!IS_ALIGNED(bank_end, SECTION_SIZE)) - memblock_limit = bank_end; + if (!IS_ALIGNED(block_start, SECTION_SIZE)) + memblock_limit = block_start; + else if (!IS_ALIGNED(block_end, SECTION_SIZE)) + memblock_limit = arm_lowmem_limit; } - } - j++; - } -#ifdef CONFIG_HIGHMEM - if (highmem) { - const char *reason = NULL; - if (cache_is_vipt_aliasing()) { - /* - * Interactions between kmap and other mappings - * make highmem support with aliasing VIPT caches - * rather difficult. - */ - reason = "with VIPT aliasing cache"; - } - if (reason) { - printk(KERN_CRIT "HIGHMEM is not supported %s, ignoring high memory\n", - reason); - while (j > 0 && meminfo.bank[j - 1].highmem) - j--; } } -#endif - meminfo.nr_banks = j; + high_memory = __va(arm_lowmem_limit - 1) + 1; /* @@ -1359,6 +1326,9 @@ static void __init kmap_init(void) #ifdef CONFIG_HIGHMEM pkmap_page_table = early_pte_alloc(pmd_off_k(PKMAP_BASE), PKMAP_BASE, _PAGE_KERNEL_TABLE); + + fixmap_page_table = early_pte_alloc(pmd_off_k(FIXADDR_START), + FIXADDR_START, _PAGE_KERNEL_TABLE); #endif } @@ -1461,7 +1431,7 @@ void __init early_paging_init(const struct machine_desc *mdesc, * just complicate the code. */ flush_cache_louis(); - dsb(); + dsb(ishst); isb(); /* remap level 1 table */ diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 55764a7ef1f0..da1874f9f8cf 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -88,30 +88,35 @@ static unsigned long irbar_read(void) void __init sanity_check_meminfo_mpu(void) { int i; - struct membank *bank = meminfo.bank; phys_addr_t phys_offset = PHYS_OFFSET; phys_addr_t aligned_region_size, specified_mem_size, rounded_mem_size; - - /* Initially only use memory continuous from PHYS_OFFSET */ - if (bank_phys_start(&bank[0]) != phys_offset) - panic("First memory bank must be contiguous from PHYS_OFFSET"); - - /* Banks have already been sorted by start address */ - for (i = 1; i < meminfo.nr_banks; i++) { - if (bank[i].start <= bank_phys_end(&bank[0]) && - bank_phys_end(&bank[i]) > bank_phys_end(&bank[0])) { - bank[0].size = bank_phys_end(&bank[i]) - bank[0].start; + struct memblock_region *reg; + bool first = true; + phys_addr_t mem_start; + phys_addr_t mem_end; + + for_each_memblock(memory, reg) { + if (first) { + /* + * Initially only use memory continuous from + * PHYS_OFFSET */ + if (reg->base != phys_offset) + panic("First memory bank must be contiguous from PHYS_OFFSET"); + + mem_start = reg->base; + mem_end = reg->base + reg->size; + specified_mem_size = reg->size; + first = false; } else { - pr_notice("Ignoring RAM after 0x%.8lx. " - "First non-contiguous (ignored) bank start: 0x%.8lx\n", - (unsigned long)bank_phys_end(&bank[0]), - (unsigned long)bank_phys_start(&bank[i])); - break; + /* + * memblock auto merges contiguous blocks, remove + * all blocks afterwards + */ + pr_notice("Ignoring RAM after %pa, memory at %pa ignored\n", + &mem_start, ®->base); + memblock_remove(reg->base, reg->size); } } - /* All contiguous banks are now merged in to the first bank */ - meminfo.nr_banks = 1; - specified_mem_size = bank[0].size; /* * MPU has curious alignment requirements: Size must be power of 2, and @@ -128,23 +133,24 @@ void __init sanity_check_meminfo_mpu(void) */ aligned_region_size = (phys_offset - 1) ^ (phys_offset); /* Find the max power-of-two sized region that fits inside our bank */ - rounded_mem_size = (1 << __fls(bank[0].size)) - 1; + rounded_mem_size = (1 << __fls(specified_mem_size)) - 1; /* The actual region size is the smaller of the two */ aligned_region_size = aligned_region_size < rounded_mem_size ? aligned_region_size + 1 : rounded_mem_size + 1; - if (aligned_region_size != specified_mem_size) - pr_warn("Truncating memory from 0x%.8lx to 0x%.8lx (MPU region constraints)", - (unsigned long)specified_mem_size, - (unsigned long)aligned_region_size); + if (aligned_region_size != specified_mem_size) { + pr_warn("Truncating memory from %pa to %pa (MPU region constraints)", + &specified_mem_size, &aligned_region_size); + memblock_remove(mem_start + aligned_region_size, + specified_mem_size - aligned_round_size); + + mem_end = mem_start + aligned_region_size; + } - meminfo.bank[0].size = aligned_region_size; - pr_debug("MPU Region from 0x%.8lx size 0x%.8lx (end 0x%.8lx))\n", - (unsigned long)phys_offset, - (unsigned long)aligned_region_size, - (unsigned long)bank_phys_end(&bank[0])); + pr_debug("MPU Region from %pa size %pa (end %pa))\n", + &phys_offset, &aligned_region_size, &mem_end); } @@ -292,7 +298,7 @@ void __init sanity_check_meminfo(void) { phys_addr_t end; sanity_check_meminfo_mpu(); - end = bank_phys_end(&meminfo.bank[meminfo.nr_banks - 1]); + end = memblock_end_of_DRAM(); high_memory = __va(end - 1) + 1; } diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S index 01a719e18bb0..22e3ad63500c 100644 --- a/arch/arm/mm/proc-v7-3level.S +++ b/arch/arm/mm/proc-v7-3level.S @@ -64,6 +64,14 @@ ENTRY(cpu_v7_switch_mm) mov pc, lr ENDPROC(cpu_v7_switch_mm) +#ifdef __ARMEB__ +#define rl r3 +#define rh r2 +#else +#define rl r2 +#define rh r3 +#endif + /* * cpu_v7_set_pte_ext(ptep, pte) * @@ -73,13 +81,13 @@ ENDPROC(cpu_v7_switch_mm) */ ENTRY(cpu_v7_set_pte_ext) #ifdef CONFIG_MMU - tst r2, #L_PTE_VALID + tst rl, #L_PTE_VALID beq 1f - tst r3, #1 << (57 - 32) @ L_PTE_NONE - bicne r2, #L_PTE_VALID + tst rh, #1 << (57 - 32) @ L_PTE_NONE + bicne rl, #L_PTE_VALID bne 1f - tst r3, #1 << (55 - 32) @ L_PTE_DIRTY - orreq r2, #L_PTE_RDONLY + tst rh, #1 << (55 - 32) @ L_PTE_DIRTY + orreq rl, #L_PTE_RDONLY 1: strd r2, r3, [r0] ALT_SMP(W(nop)) ALT_UP (mcr p15, 0, r0, c7, c10, 1) @ flush_pte diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index 195731d3813b..3db2c2f04a30 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -169,9 +169,31 @@ ENDPROC(cpu_pj4b_do_idle) globl_equ cpu_pj4b_do_idle, cpu_v7_do_idle #endif globl_equ cpu_pj4b_dcache_clean_area, cpu_v7_dcache_clean_area - globl_equ cpu_pj4b_do_suspend, cpu_v7_do_suspend - globl_equ cpu_pj4b_do_resume, cpu_v7_do_resume - globl_equ cpu_pj4b_suspend_size, cpu_v7_suspend_size +#ifdef CONFIG_ARM_CPU_SUSPEND +ENTRY(cpu_pj4b_do_suspend) + stmfd sp!, {r6 - r10} + mrc p15, 1, r6, c15, c1, 0 @ save CP15 - extra features + mrc p15, 1, r7, c15, c2, 0 @ save CP15 - Aux Func Modes Ctrl 0 + mrc p15, 1, r8, c15, c1, 2 @ save CP15 - Aux Debug Modes Ctrl 2 + mrc p15, 1, r9, c15, c1, 1 @ save CP15 - Aux Debug Modes Ctrl 1 + mrc p15, 0, r10, c9, c14, 0 @ save CP15 - PMC + stmia r0!, {r6 - r10} + ldmfd sp!, {r6 - r10} + b cpu_v7_do_suspend +ENDPROC(cpu_pj4b_do_suspend) + +ENTRY(cpu_pj4b_do_resume) + ldmia r0!, {r6 - r10} + mcr p15, 1, r6, c15, c1, 0 @ save CP15 - extra features + mcr p15, 1, r7, c15, c2, 0 @ save CP15 - Aux Func Modes Ctrl 0 + mcr p15, 1, r8, c15, c1, 2 @ save CP15 - Aux Debug Modes Ctrl 2 + mcr p15, 1, r9, c15, c1, 1 @ save CP15 - Aux Debug Modes Ctrl 1 + mcr p15, 0, r10, c9, c14, 0 @ save CP15 - PMC + b cpu_v7_do_resume +ENDPROC(cpu_pj4b_do_resume) +#endif +.globl cpu_pj4b_suspend_size +.equ cpu_pj4b_suspend_size, 4 * 14 #endif @@ -194,6 +216,7 @@ __v7_cr7mp_setup: __v7_ca7mp_setup: __v7_ca12mp_setup: __v7_ca15mp_setup: +__v7_ca17mp_setup: mov r10, #0 1: #ifdef CONFIG_SMP @@ -505,6 +528,16 @@ __v7_ca15mp_proc_info: .size __v7_ca15mp_proc_info, . - __v7_ca15mp_proc_info /* + * ARM Ltd. Cortex A17 processor. + */ + .type __v7_ca17mp_proc_info, #object +__v7_ca17mp_proc_info: + .long 0x410fc0e0 + .long 0xff0ffff0 + __v7_proc __v7_ca17mp_setup + .size __v7_ca17mp_proc_info, . - __v7_ca17mp_proc_info + + /* * Qualcomm Inc. Krait processors. */ .type __krait_proc_info, #object diff --git a/arch/arm/plat-samsung/s5p-sleep.S b/arch/arm/plat-samsung/s5p-sleep.S index c5001659bdf8..25c68ceb9e2b 100644 --- a/arch/arm/plat-samsung/s5p-sleep.S +++ b/arch/arm/plat-samsung/s5p-sleep.S @@ -22,7 +22,6 @@ */ #include <linux/linkage.h> -#include <asm/asm-offsets.h> .data .align diff --git a/arch/arm/vfp/entry.S b/arch/arm/vfp/entry.S index f0759e70fb86..fe6ca574d093 100644 --- a/arch/arm/vfp/entry.S +++ b/arch/arm/vfp/entry.S @@ -22,11 +22,10 @@ @ r9 = normal "successful" return address @ r10 = this threads thread_info structure @ lr = unrecognised instruction return address -@ IRQs disabled. +@ IRQs enabled. @ ENTRY(do_vfp) inc_preempt_count r10, r4 - enable_irq ldr r4, .LCvfp ldr r11, [r10, #TI_CPU] @ CPU number add r10, r10, #TI_VFPSTATE @ r10 = workspace diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f5f63b715d91..7295419165e1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -30,12 +30,17 @@ config ARM64 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_BUGVERBOSE select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS + select HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_GRAPH_TRACER select HAVE_GENERIC_DMA_COHERENT select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_MEMBLOCK @@ -43,6 +48,7 @@ config ARM64 select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_SYSCALL_TRACEPOINTS select IRQ_DOMAIN select MODULES_USE_ELF_RELA select NO_BOOTMEM @@ -245,6 +251,9 @@ config ARCH_WANT_HUGE_PMD_SHARE config HAVE_ARCH_TRANSPARENT_HUGEPAGE def_bool y +config ARCH_HAS_CACHE_LINE_SIZE + def_bool y + source "mm/Kconfig" config XEN_DOM0 @@ -283,6 +292,20 @@ config CMDLINE_FORCE This is useful if you cannot or don't want to change the command-line options your boot loader passes to the kernel. +config EFI + bool "UEFI runtime support" + depends on OF && !CPU_BIG_ENDIAN + select LIBFDT + select UCS2_STRING + select EFI_PARAMS_FROM_FDT + default y + help + This option provides support for runtime services provided + by UEFI firmware (such as non-volatile variables, realtime + clock, and platform reset). A UEFI stub is also provided to + allow the kernel to be booted as an EFI application. This + is only useful on systems that have UEFI firmware. + endmenu menu "Userspace binary formats" @@ -334,6 +357,8 @@ source "net/Kconfig" source "drivers/Kconfig" +source "drivers/firmware/Kconfig" + source "fs/Kconfig" source "arch/arm64/kvm/Kconfig" @@ -343,5 +368,8 @@ source "arch/arm64/Kconfig.debug" source "security/Kconfig" source "crypto/Kconfig" +if CRYPTO +source "arch/arm64/crypto/Kconfig" +endif source "lib/Kconfig" diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 2fceb71ac3b7..8185a913c5ed 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -45,6 +45,7 @@ export TEXT_OFFSET GZFLAGS core-y += arch/arm64/kernel/ arch/arm64/mm/ core-$(CONFIG_KVM) += arch/arm64/kvm/ core-$(CONFIG_XEN) += arch/arm64/xen/ +core-$(CONFIG_CRYPTO) += arch/arm64/crypto/ libs-y := arch/arm64/lib/ $(libs-y) libs-y += $(LIBGCC) diff --git a/arch/arm64/boot/dts/apm-storm.dtsi b/arch/arm64/boot/dts/apm-storm.dtsi index f8c40a66e65d..c5f0a47a1375 100644 --- a/arch/arm64/boot/dts/apm-storm.dtsi +++ b/arch/arm64/boot/dts/apm-storm.dtsi @@ -257,6 +257,19 @@ enable-offset = <0x0>; enable-mask = <0x39>; }; + + rtcclk: rtcclk@17000000 { + compatible = "apm,xgene-device-clock"; + #clock-cells = <1>; + clocks = <&socplldiv2 0>; + reg = <0x0 0x17000000 0x0 0x2000>; + reg-names = "csr-reg"; + csr-offset = <0xc>; + csr-mask = <0x2>; + enable-offset = <0x10>; + enable-mask = <0x2>; + clock-output-names = "rtcclk"; + }; }; serial0: serial@1c020000 { @@ -342,5 +355,13 @@ phys = <&phy3 0>; phy-names = "sata-phy"; }; + + rtc: rtc@10510000 { + compatible = "apm,xgene-rtc"; + reg = <0x0 0x10510000 0x0 0x400>; + interrupts = <0x0 0x46 0x4>; + #clock-cells = <1>; + clocks = <&rtcclk 0>; + }; }; }; diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 7959dd0ca5d5..157e1d8d9a47 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1,11 +1,11 @@ # CONFIG_LOCALVERSION_AUTO is not set -# CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_POSIX_MQUEUE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ_IDLE=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=14 @@ -27,6 +27,7 @@ CONFIG_ARCH_VEXPRESS=y CONFIG_ARCH_XGENE=y CONFIG_SMP=y CONFIG_PREEMPT=y +CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CMA=y CONFIG_CMDLINE="console=ttyAMA0" # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set @@ -44,7 +45,7 @@ CONFIG_IP_PNP_BOOTP=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DMA_CMA=y -CONFIG_SCSI=y +CONFIG_VIRTIO_BLK=y # CONFIG_SCSI_PROC_FS is not set CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_LOWLEVEL is not set @@ -56,20 +57,18 @@ CONFIG_SMC91X=y CONFIG_SMSC911X=y # CONFIG_WLAN is not set CONFIG_INPUT_EVDEV=y -# CONFIG_SERIO_I8042 is not set # CONFIG_SERIO_SERPORT is not set CONFIG_LEGACY_PTY_COUNT=16 CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_OF_PLATFORM=y CONFIG_SERIAL_AMBA_PL011=y CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_HW_RANDOM is not set # CONFIG_HWMON is not set CONFIG_REGULATOR=y CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_FB=y -# CONFIG_VGA_CONSOLE is not set CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set @@ -79,27 +78,38 @@ CONFIG_USB_ISP1760_HCD=y CONFIG_USB_STORAGE=y CONFIG_MMC=y CONFIG_MMC_ARMMMCI=y +CONFIG_VIRTIO_MMIO=y # CONFIG_IOMMU_SUPPORT is not set CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y -CONFIG_EXT4_FS=y # CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y CONFIG_FUSE_FS=y CONFIG_CUSE=y CONFIG_VFAT_FS=y CONFIG_TMPFS=y +CONFIG_HUGETLBFS=y # CONFIG_MISC_FILESYSTEMS is not set CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y CONFIG_NLS_ISO8859_1=y -CONFIG_MAGIC_SYSRQ=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=y +CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y +CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y +CONFIG_LOCKUP_DETECTOR=y # CONFIG_SCHED_DEBUG is not set -CONFIG_DEBUG_INFO=y # CONFIG_FTRACE is not set -CONFIG_ATOMIC64_SELFTEST=y -CONFIG_VIRTIO_MMIO=y -CONFIG_VIRTIO_BLK=y +CONFIG_CRYPTO_ANSI_CPRNG=y +CONFIG_ARM64_CRYPTO=y +CONFIG_CRYPTO_SHA1_ARM64_CE=y +CONFIG_CRYPTO_SHA2_ARM64_CE=y +CONFIG_CRYPTO_GHASH_ARM64_CE=y +CONFIG_CRYPTO_AES_ARM64_CE=y +CONFIG_CRYPTO_AES_ARM64_CE_CCM=y +CONFIG_CRYPTO_AES_ARM64_CE_BLK=y +CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig new file mode 100644 index 000000000000..5562652c5316 --- /dev/null +++ b/arch/arm64/crypto/Kconfig @@ -0,0 +1,53 @@ + +menuconfig ARM64_CRYPTO + bool "ARM64 Accelerated Cryptographic Algorithms" + depends on ARM64 + help + Say Y here to choose from a selection of cryptographic algorithms + implemented using ARM64 specific CPU features or instructions. + +if ARM64_CRYPTO + +config CRYPTO_SHA1_ARM64_CE + tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_HASH + +config CRYPTO_SHA2_ARM64_CE + tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_HASH + +config CRYPTO_GHASH_ARM64_CE + tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_HASH + +config CRYPTO_AES_ARM64_CE + tristate "AES core cipher using ARMv8 Crypto Extensions" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_AES + +config CRYPTO_AES_ARM64_CE_CCM + tristate "AES in CCM mode using ARMv8 Crypto Extensions" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_AES + select CRYPTO_AEAD + +config CRYPTO_AES_ARM64_CE_BLK + tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_BLKCIPHER + select CRYPTO_AES + select CRYPTO_ABLK_HELPER + +config CRYPTO_AES_ARM64_NEON_BLK + tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" + depends on ARM64 && KERNEL_MODE_NEON + select CRYPTO_BLKCIPHER + select CRYPTO_AES + select CRYPTO_ABLK_HELPER + +endif diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile new file mode 100644 index 000000000000..2070a56ecc46 --- /dev/null +++ b/arch/arm64/crypto/Makefile @@ -0,0 +1,38 @@ +# +# linux/arch/arm64/crypto/Makefile +# +# Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. +# + +obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o +sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o + +obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o +sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o + +obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o +ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o +CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o +aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_CE_BLK) += aes-ce-blk.o +aes-ce-blk-y := aes-glue-ce.o aes-ce.o + +obj-$(CONFIG_CRYPTO_AES_ARM64_NEON_BLK) += aes-neon-blk.o +aes-neon-blk-y := aes-glue-neon.o aes-neon.o + +AFLAGS_aes-ce.o := -DINTERLEAVE=2 -DINTERLEAVE_INLINE +AFLAGS_aes-neon.o := -DINTERLEAVE=4 + +CFLAGS_aes-glue-ce.o := -DUSE_V8_CRYPTO_EXTENSIONS + +$(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE + $(call if_changed_dep,cc_o_c) diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S new file mode 100644 index 000000000000..432e4841cd81 --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-core.S @@ -0,0 +1,222 @@ +/* + * aesce-ccm-core.S - AES-CCM transform for ARMv8 with Crypto Extensions + * + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + + .text + .arch armv8-a+crypto + + /* + * void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, + * u32 *macp, u8 const rk[], u32 rounds); + */ +ENTRY(ce_aes_ccm_auth_data) + ldr w8, [x3] /* leftover from prev round? */ + ld1 {v0.2d}, [x0] /* load mac */ + cbz w8, 1f + sub w8, w8, #16 + eor v1.16b, v1.16b, v1.16b +0: ldrb w7, [x1], #1 /* get 1 byte of input */ + subs w2, w2, #1 + add w8, w8, #1 + ins v1.b[0], w7 + ext v1.16b, v1.16b, v1.16b, #1 /* rotate in the input bytes */ + beq 8f /* out of input? */ + cbnz w8, 0b + eor v0.16b, v0.16b, v1.16b +1: ld1 {v3.2d}, [x4] /* load first round key */ + prfm pldl1strm, [x1] + cmp w5, #12 /* which key size? */ + add x6, x4, #16 + sub w7, w5, #2 /* modified # of rounds */ + bmi 2f + bne 5f + mov v5.16b, v3.16b + b 4f +2: mov v4.16b, v3.16b + ld1 {v5.2d}, [x6], #16 /* load 2nd round key */ +3: aese v0.16b, v4.16b + aesmc v0.16b, v0.16b +4: ld1 {v3.2d}, [x6], #16 /* load next round key */ + aese v0.16b, v5.16b + aesmc v0.16b, v0.16b +5: ld1 {v4.2d}, [x6], #16 /* load next round key */ + subs w7, w7, #3 + aese v0.16b, v3.16b + aesmc v0.16b, v0.16b + ld1 {v5.2d}, [x6], #16 /* load next round key */ + bpl 3b + aese v0.16b, v4.16b + subs w2, w2, #16 /* last data? */ + eor v0.16b, v0.16b, v5.16b /* final round */ + bmi 6f + ld1 {v1.16b}, [x1], #16 /* load next input block */ + eor v0.16b, v0.16b, v1.16b /* xor with mac */ + bne 1b +6: st1 {v0.2d}, [x0] /* store mac */ + beq 10f + adds w2, w2, #16 + beq 10f + mov w8, w2 +7: ldrb w7, [x1], #1 + umov w6, v0.b[0] + eor w6, w6, w7 + strb w6, [x0], #1 + subs w2, w2, #1 + beq 10f + ext v0.16b, v0.16b, v0.16b, #1 /* rotate out the mac bytes */ + b 7b +8: mov w7, w8 + add w8, w8, #16 +9: ext v1.16b, v1.16b, v1.16b, #1 + adds w7, w7, #1 + bne 9b + eor v0.16b, v0.16b, v1.16b + st1 {v0.2d}, [x0] +10: str w8, [x3] + ret +ENDPROC(ce_aes_ccm_auth_data) + + /* + * void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[], + * u32 rounds); + */ +ENTRY(ce_aes_ccm_final) + ld1 {v3.2d}, [x2], #16 /* load first round key */ + ld1 {v0.2d}, [x0] /* load mac */ + cmp w3, #12 /* which key size? */ + sub w3, w3, #2 /* modified # of rounds */ + ld1 {v1.2d}, [x1] /* load 1st ctriv */ + bmi 0f + bne 3f + mov v5.16b, v3.16b + b 2f +0: mov v4.16b, v3.16b +1: ld1 {v5.2d}, [x2], #16 /* load next round key */ + aese v0.16b, v4.16b + aese v1.16b, v4.16b + aesmc v0.16b, v0.16b + aesmc v1.16b, v1.16b +2: ld1 {v3.2d}, [x2], #16 /* load next round key */ + aese v0.16b, v5.16b + aese v1.16b, v5.16b + aesmc v0.16b, v0.16b + aesmc v1.16b, v1.16b +3: ld1 {v4.2d}, [x2], #16 /* load next round key */ + subs w3, w3, #3 + aese v0.16b, v3.16b + aese v1.16b, v3.16b + aesmc v0.16b, v0.16b + aesmc v1.16b, v1.16b + bpl 1b + aese v0.16b, v4.16b + aese v1.16b, v4.16b + /* final round key cancels out */ + eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */ + st1 {v0.2d}, [x0] /* store result */ + ret +ENDPROC(ce_aes_ccm_final) + + .macro aes_ccm_do_crypt,enc + ldr x8, [x6, #8] /* load lower ctr */ + ld1 {v0.2d}, [x5] /* load mac */ + rev x8, x8 /* keep swabbed ctr in reg */ +0: /* outer loop */ + ld1 {v1.1d}, [x6] /* load upper ctr */ + prfm pldl1strm, [x1] + add x8, x8, #1 + rev x9, x8 + cmp w4, #12 /* which key size? */ + sub w7, w4, #2 /* get modified # of rounds */ + ins v1.d[1], x9 /* no carry in lower ctr */ + ld1 {v3.2d}, [x3] /* load first round key */ + add x10, x3, #16 + bmi 1f + bne 4f + mov v5.16b, v3.16b + b 3f +1: mov v4.16b, v3.16b + ld1 {v5.2d}, [x10], #16 /* load 2nd round key */ +2: /* inner loop: 3 rounds, 2x interleaved */ + aese v0.16b, v4.16b + aese v1.16b, v4.16b + aesmc v0.16b, v0.16b + aesmc v1.16b, v1.16b +3: ld1 {v3.2d}, [x10], #16 /* load next round key */ + aese v0.16b, v5.16b + aese v1.16b, v5.16b + aesmc v0.16b, v0.16b + aesmc v1.16b, v1.16b +4: ld1 {v4.2d}, [x10], #16 /* load next round key */ + subs w7, w7, #3 + aese v0.16b, v3.16b + aese v1.16b, v3.16b + aesmc v0.16b, v0.16b + aesmc v1.16b, v1.16b + ld1 {v5.2d}, [x10], #16 /* load next round key */ + bpl 2b + aese v0.16b, v4.16b + aese v1.16b, v4.16b + subs w2, w2, #16 + bmi 6f /* partial block? */ + ld1 {v2.16b}, [x1], #16 /* load next input block */ + .if \enc == 1 + eor v2.16b, v2.16b, v5.16b /* final round enc+mac */ + eor v1.16b, v1.16b, v2.16b /* xor with crypted ctr */ + .else + eor v2.16b, v2.16b, v1.16b /* xor with crypted ctr */ + eor v1.16b, v2.16b, v5.16b /* final round enc */ + .endif + eor v0.16b, v0.16b, v2.16b /* xor mac with pt ^ rk[last] */ + st1 {v1.16b}, [x0], #16 /* write output block */ + bne 0b + rev x8, x8 + st1 {v0.2d}, [x5] /* store mac */ + str x8, [x6, #8] /* store lsb end of ctr (BE) */ +5: ret + +6: eor v0.16b, v0.16b, v5.16b /* final round mac */ + eor v1.16b, v1.16b, v5.16b /* final round enc */ + st1 {v0.2d}, [x5] /* store mac */ + add w2, w2, #16 /* process partial tail block */ +7: ldrb w9, [x1], #1 /* get 1 byte of input */ + umov w6, v1.b[0] /* get top crypted ctr byte */ + umov w7, v0.b[0] /* get top mac byte */ + .if \enc == 1 + eor w7, w7, w9 + eor w9, w9, w6 + .else + eor w9, w9, w6 + eor w7, w7, w9 + .endif + strb w9, [x0], #1 /* store out byte */ + strb w7, [x5], #1 /* store mac byte */ + subs w2, w2, #1 + beq 5b + ext v0.16b, v0.16b, v0.16b, #1 /* shift out mac byte */ + ext v1.16b, v1.16b, v1.16b, #1 /* shift out ctr byte */ + b 7b + .endm + + /* + * void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, + * u8 const rk[], u32 rounds, u8 mac[], + * u8 ctr[]); + * void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, + * u8 const rk[], u32 rounds, u8 mac[], + * u8 ctr[]); + */ +ENTRY(ce_aes_ccm_encrypt) + aes_ccm_do_crypt 1 +ENDPROC(ce_aes_ccm_encrypt) + +ENTRY(ce_aes_ccm_decrypt) + aes_ccm_do_crypt 0 +ENDPROC(ce_aes_ccm_decrypt) diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c new file mode 100644 index 000000000000..9e6cdde9b43d --- /dev/null +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -0,0 +1,297 @@ +/* + * aes-ccm-glue.c - AES-CCM transform for ARMv8 with Crypto Extensions + * + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/aes.h> +#include <crypto/algapi.h> +#include <crypto/scatterwalk.h> +#include <linux/crypto.h> +#include <linux/module.h> + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +asmlinkage void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes, + u32 *macp, u32 const rk[], u32 rounds); + +asmlinkage void ce_aes_ccm_encrypt(u8 out[], u8 const in[], u32 cbytes, + u32 const rk[], u32 rounds, u8 mac[], + u8 ctr[]); + +asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes, + u32 const rk[], u32 rounds, u8 mac[], + u8 ctr[]); + +asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[], + u32 rounds); + +static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct crypto_aes_ctx *ctx = crypto_aead_ctx(tfm); + int ret; + + ret = crypto_aes_expand_key(ctx, in_key, key_len); + if (!ret) + return 0; + + tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; +} + +static int ccm_setauthsize(struct crypto_aead *tfm, unsigned int authsize) +{ + if ((authsize & 1) || authsize < 4) + return -EINVAL; + return 0; +} + +static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + __be32 *n = (__be32 *)&maciv[AES_BLOCK_SIZE - 8]; + u32 l = req->iv[0] + 1; + + /* verify that CCM dimension 'L' is set correctly in the IV */ + if (l < 2 || l > 8) + return -EINVAL; + + /* verify that msglen can in fact be represented in L bytes */ + if (l < 4 && msglen >> (8 * l)) + return -EOVERFLOW; + + /* + * Even if the CCM spec allows L values of up to 8, the Linux cryptoapi + * uses a u32 type to represent msglen so the top 4 bytes are always 0. + */ + n[0] = 0; + n[1] = cpu_to_be32(msglen); + + memcpy(maciv, req->iv, AES_BLOCK_SIZE - l); + + /* + * Meaning of byte 0 according to CCM spec (RFC 3610/NIST 800-38C) + * - bits 0..2 : max # of bytes required to represent msglen, minus 1 + * (already set by caller) + * - bits 3..5 : size of auth tag (1 => 4 bytes, 2 => 6 bytes, etc) + * - bit 6 : indicates presence of authenticate-only data + */ + maciv[0] |= (crypto_aead_authsize(aead) - 2) << 2; + if (req->assoclen) + maciv[0] |= 0x40; + + memset(&req->iv[AES_BLOCK_SIZE - l], 0, l); + return 0; +} + +static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[]) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); + struct __packed { __be16 l; __be32 h; u16 len; } ltag; + struct scatter_walk walk; + u32 len = req->assoclen; + u32 macp = 0; + + /* prepend the AAD with a length tag */ + if (len < 0xff00) { + ltag.l = cpu_to_be16(len); + ltag.len = 2; + } else { + ltag.l = cpu_to_be16(0xfffe); + put_unaligned_be32(len, <ag.h); + ltag.len = 6; + } + + ce_aes_ccm_auth_data(mac, (u8 *)<ag, ltag.len, &macp, ctx->key_enc, + num_rounds(ctx)); + scatterwalk_start(&walk, req->assoc); + + do { + u32 n = scatterwalk_clamp(&walk, len); + u8 *p; + + if (!n) { + scatterwalk_start(&walk, sg_next(walk.sg)); + n = scatterwalk_clamp(&walk, len); + } + p = scatterwalk_map(&walk); + ce_aes_ccm_auth_data(mac, p, n, &macp, ctx->key_enc, + num_rounds(ctx)); + len -= n; + + scatterwalk_unmap(p); + scatterwalk_advance(&walk, n); + scatterwalk_done(&walk, 0, len); + } while (len); +} + +static int ccm_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); + struct blkcipher_desc desc = { .info = req->iv }; + struct blkcipher_walk walk; + u8 __aligned(8) mac[AES_BLOCK_SIZE]; + u8 buf[AES_BLOCK_SIZE]; + u32 len = req->cryptlen; + int err; + + err = ccm_init_mac(req, mac, len); + if (err) + return err; + + kernel_neon_begin_partial(6); + + if (req->assoclen) + ccm_calculate_auth_mac(req, mac); + + /* preserve the original iv for the final round */ + memcpy(buf, req->iv, AES_BLOCK_SIZE); + + blkcipher_walk_init(&walk, req->dst, req->src, len); + err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, + AES_BLOCK_SIZE); + + while (walk.nbytes) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; + + if (walk.nbytes == len) + tail = 0; + + ce_aes_ccm_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); + + len -= walk.nbytes - tail; + err = blkcipher_walk_done(&desc, &walk, tail); + } + if (!err) + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + + kernel_neon_end(); + + if (err) + return err; + + /* copy authtag to end of dst */ + scatterwalk_map_and_copy(mac, req->dst, req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int ccm_decrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead); + unsigned int authsize = crypto_aead_authsize(aead); + struct blkcipher_desc desc = { .info = req->iv }; + struct blkcipher_walk walk; + u8 __aligned(8) mac[AES_BLOCK_SIZE]; + u8 buf[AES_BLOCK_SIZE]; + u32 len = req->cryptlen - authsize; + int err; + + err = ccm_init_mac(req, mac, len); + if (err) + return err; + + kernel_neon_begin_partial(6); + + if (req->assoclen) + ccm_calculate_auth_mac(req, mac); + + /* preserve the original iv for the final round */ + memcpy(buf, req->iv, AES_BLOCK_SIZE); + + blkcipher_walk_init(&walk, req->dst, req->src, len); + err = blkcipher_aead_walk_virt_block(&desc, &walk, aead, + AES_BLOCK_SIZE); + + while (walk.nbytes) { + u32 tail = walk.nbytes % AES_BLOCK_SIZE; + + if (walk.nbytes == len) + tail = 0; + + ce_aes_ccm_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + walk.nbytes - tail, ctx->key_enc, + num_rounds(ctx), mac, walk.iv); + + len -= walk.nbytes - tail; + err = blkcipher_walk_done(&desc, &walk, tail); + } + if (!err) + ce_aes_ccm_final(mac, buf, ctx->key_enc, num_rounds(ctx)); + + kernel_neon_end(); + + if (err) + return err; + + /* compare calculated auth tag with the stored one */ + scatterwalk_map_and_copy(buf, req->src, req->cryptlen - authsize, + authsize, 0); + + if (memcmp(mac, buf, authsize)) + return -EBADMSG; + return 0; +} + +static struct crypto_alg ccm_aes_alg = { + .cra_name = "ccm(aes)", + .cra_driver_name = "ccm-aes-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_AEAD, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_aead_type, + .cra_module = THIS_MODULE, + .cra_aead = { + .ivsize = AES_BLOCK_SIZE, + .maxauthsize = AES_BLOCK_SIZE, + .setkey = ccm_setkey, + .setauthsize = ccm_setauthsize, + .encrypt = ccm_encrypt, + .decrypt = ccm_decrypt, + } +}; + +static int __init aes_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_AES)) + return -ENODEV; + return crypto_register_alg(&ccm_aes_alg); +} + +static void __exit aes_mod_exit(void) +{ + crypto_unregister_alg(&ccm_aes_alg); +} + +module_init(aes_mod_init); +module_exit(aes_mod_exit); + +MODULE_DESCRIPTION("Synchronous AES in CCM mode using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ccm(aes)"); diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-cipher.c new file mode 100644 index 000000000000..2075e1acae6b --- /dev/null +++ b/arch/arm64/crypto/aes-ce-cipher.c @@ -0,0 +1,155 @@ +/* + * aes-ce-cipher.c - core AES cipher using ARMv8 Crypto Extensions + * + * Copyright (C) 2013 - 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <crypto/aes.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +struct aes_block { + u8 b[AES_BLOCK_SIZE]; +}; + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + struct aes_block *out = (struct aes_block *)dst; + struct aes_block const *in = (struct aes_block *)src; + void *dummy0; + int dummy1; + + kernel_neon_begin_partial(4); + + __asm__(" ld1 {v0.16b}, %[in] ;" + " ld1 {v1.2d}, [%[key]], #16 ;" + " cmp %w[rounds], #10 ;" + " bmi 0f ;" + " bne 3f ;" + " mov v3.16b, v1.16b ;" + " b 2f ;" + "0: mov v2.16b, v1.16b ;" + " ld1 {v3.2d}, [%[key]], #16 ;" + "1: aese v0.16b, v2.16b ;" + " aesmc v0.16b, v0.16b ;" + "2: ld1 {v1.2d}, [%[key]], #16 ;" + " aese v0.16b, v3.16b ;" + " aesmc v0.16b, v0.16b ;" + "3: ld1 {v2.2d}, [%[key]], #16 ;" + " subs %w[rounds], %w[rounds], #3 ;" + " aese v0.16b, v1.16b ;" + " aesmc v0.16b, v0.16b ;" + " ld1 {v3.2d}, [%[key]], #16 ;" + " bpl 1b ;" + " aese v0.16b, v2.16b ;" + " eor v0.16b, v0.16b, v3.16b ;" + " st1 {v0.16b}, %[out] ;" + + : [out] "=Q"(*out), + [key] "=r"(dummy0), + [rounds] "=r"(dummy1) + : [in] "Q"(*in), + "1"(ctx->key_enc), + "2"(num_rounds(ctx) - 2) + : "cc"); + + kernel_neon_end(); +} + +static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + struct aes_block *out = (struct aes_block *)dst; + struct aes_block const *in = (struct aes_block *)src; + void *dummy0; + int dummy1; + + kernel_neon_begin_partial(4); + + __asm__(" ld1 {v0.16b}, %[in] ;" + " ld1 {v1.2d}, [%[key]], #16 ;" + " cmp %w[rounds], #10 ;" + " bmi 0f ;" + " bne 3f ;" + " mov v3.16b, v1.16b ;" + " b 2f ;" + "0: mov v2.16b, v1.16b ;" + " ld1 {v3.2d}, [%[key]], #16 ;" + "1: aesd v0.16b, v2.16b ;" + " aesimc v0.16b, v0.16b ;" + "2: ld1 {v1.2d}, [%[key]], #16 ;" + " aesd v0.16b, v3.16b ;" + " aesimc v0.16b, v0.16b ;" + "3: ld1 {v2.2d}, [%[key]], #16 ;" + " subs %w[rounds], %w[rounds], #3 ;" + " aesd v0.16b, v1.16b ;" + " aesimc v0.16b, v0.16b ;" + " ld1 {v3.2d}, [%[key]], #16 ;" + " bpl 1b ;" + " aesd v0.16b, v2.16b ;" + " eor v0.16b, v0.16b, v3.16b ;" + " st1 {v0.16b}, %[out] ;" + + : [out] "=Q"(*out), + [key] "=r"(dummy0), + [rounds] "=r"(dummy1) + : [in] "Q"(*in), + "1"(ctx->key_dec), + "2"(num_rounds(ctx) - 2) + : "cc"); + + kernel_neon_end(); +} + +static struct crypto_alg aes_alg = { + .cra_name = "aes", + .cra_driver_name = "aes-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_module = THIS_MODULE, + .cra_cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = crypto_aes_set_key, + .cia_encrypt = aes_cipher_encrypt, + .cia_decrypt = aes_cipher_decrypt + } +}; + +static int __init aes_mod_init(void) +{ + return crypto_register_alg(&aes_alg); +} + +static void __exit aes_mod_exit(void) +{ + crypto_unregister_alg(&aes_alg); +} + +module_cpu_feature_match(AES, aes_mod_init); +module_exit(aes_mod_exit); diff --git a/arch/arm64/crypto/aes-ce.S b/arch/arm64/crypto/aes-ce.S new file mode 100644 index 000000000000..685a18f731eb --- /dev/null +++ b/arch/arm64/crypto/aes-ce.S @@ -0,0 +1,133 @@ +/* + * linux/arch/arm64/crypto/aes-ce.S - AES cipher for ARMv8 with + * Crypto Extensions + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#define AES_ENTRY(func) ENTRY(ce_ ## func) +#define AES_ENDPROC(func) ENDPROC(ce_ ## func) + + .arch armv8-a+crypto + + /* preload all round keys */ + .macro load_round_keys, rounds, rk + cmp \rounds, #12 + blo 2222f /* 128 bits */ + beq 1111f /* 192 bits */ + ld1 {v17.16b-v18.16b}, [\rk], #32 +1111: ld1 {v19.16b-v20.16b}, [\rk], #32 +2222: ld1 {v21.16b-v24.16b}, [\rk], #64 + ld1 {v25.16b-v28.16b}, [\rk], #64 + ld1 {v29.16b-v31.16b}, [\rk] + .endm + + /* prepare for encryption with key in rk[] */ + .macro enc_prepare, rounds, rk, ignore + load_round_keys \rounds, \rk + .endm + + /* prepare for encryption (again) but with new key in rk[] */ + .macro enc_switch_key, rounds, rk, ignore + load_round_keys \rounds, \rk + .endm + + /* prepare for decryption with key in rk[] */ + .macro dec_prepare, rounds, rk, ignore + load_round_keys \rounds, \rk + .endm + + .macro do_enc_Nx, de, mc, k, i0, i1, i2, i3 + aes\de \i0\().16b, \k\().16b + .ifnb \i1 + aes\de \i1\().16b, \k\().16b + .ifnb \i3 + aes\de \i2\().16b, \k\().16b + aes\de \i3\().16b, \k\().16b + .endif + .endif + aes\mc \i0\().16b, \i0\().16b + .ifnb \i1 + aes\mc \i1\().16b, \i1\().16b + .ifnb \i3 + aes\mc \i2\().16b, \i2\().16b + aes\mc \i3\().16b, \i3\().16b + .endif + .endif + .endm + + /* up to 4 interleaved encryption rounds with the same round key */ + .macro round_Nx, enc, k, i0, i1, i2, i3 + .ifc \enc, e + do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3 + .else + do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3 + .endif + .endm + + /* up to 4 interleaved final rounds */ + .macro fin_round_Nx, de, k, k2, i0, i1, i2, i3 + aes\de \i0\().16b, \k\().16b + .ifnb \i1 + aes\de \i1\().16b, \k\().16b + .ifnb \i3 + aes\de \i2\().16b, \k\().16b + aes\de \i3\().16b, \k\().16b + .endif + .endif + eor \i0\().16b, \i0\().16b, \k2\().16b + .ifnb \i1 + eor \i1\().16b, \i1\().16b, \k2\().16b + .ifnb \i3 + eor \i2\().16b, \i2\().16b, \k2\().16b + eor \i3\().16b, \i3\().16b, \k2\().16b + .endif + .endif + .endm + + /* up to 4 interleaved blocks */ + .macro do_block_Nx, enc, rounds, i0, i1, i2, i3 + cmp \rounds, #12 + blo 2222f /* 128 bits */ + beq 1111f /* 192 bits */ + round_Nx \enc, v17, \i0, \i1, \i2, \i3 + round_Nx \enc, v18, \i0, \i1, \i2, \i3 +1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3 + round_Nx \enc, v20, \i0, \i1, \i2, \i3 +2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29 + round_Nx \enc, \key, \i0, \i1, \i2, \i3 + .endr + fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3 + .endm + + .macro encrypt_block, in, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \in + .endm + + .macro encrypt_block2x, i0, i1, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \i0, \i1 + .endm + + .macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 + do_block_Nx e, \rounds, \i0, \i1, \i2, \i3 + .endm + + .macro decrypt_block, in, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \in + .endm + + .macro decrypt_block2x, i0, i1, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \i0, \i1 + .endm + + .macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2 + do_block_Nx d, \rounds, \i0, \i1, \i2, \i3 + .endm + +#include "aes-modes.S" diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c new file mode 100644 index 000000000000..60f2f4c12256 --- /dev/null +++ b/arch/arm64/crypto/aes-glue.c @@ -0,0 +1,446 @@ +/* + * linux/arch/arm64/crypto/aes-glue.c - wrapper code for ARMv8 AES + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/hwcap.h> +#include <crypto/aes.h> +#include <crypto/ablk_helper.h> +#include <crypto/algapi.h> +#include <linux/module.h> +#include <linux/cpufeature.h> + +#ifdef USE_V8_CRYPTO_EXTENSIONS +#define MODE "ce" +#define PRIO 300 +#define aes_ecb_encrypt ce_aes_ecb_encrypt +#define aes_ecb_decrypt ce_aes_ecb_decrypt +#define aes_cbc_encrypt ce_aes_cbc_encrypt +#define aes_cbc_decrypt ce_aes_cbc_decrypt +#define aes_ctr_encrypt ce_aes_ctr_encrypt +#define aes_xts_encrypt ce_aes_xts_encrypt +#define aes_xts_decrypt ce_aes_xts_decrypt +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); +#else +#define MODE "neon" +#define PRIO 200 +#define aes_ecb_encrypt neon_aes_ecb_encrypt +#define aes_ecb_decrypt neon_aes_ecb_decrypt +#define aes_cbc_encrypt neon_aes_cbc_encrypt +#define aes_cbc_decrypt neon_aes_cbc_decrypt +#define aes_ctr_encrypt neon_aes_ctr_encrypt +#define aes_xts_encrypt neon_aes_xts_encrypt +#define aes_xts_decrypt neon_aes_xts_decrypt +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON"); +MODULE_ALIAS("ecb(aes)"); +MODULE_ALIAS("cbc(aes)"); +MODULE_ALIAS("ctr(aes)"); +MODULE_ALIAS("xts(aes)"); +#endif + +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +/* defined in aes-modes.S */ +asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, int first); +asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, int first); + +asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[], int first); +asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[], int first); + +asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 ctr[], int first); + +asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], + int rounds, int blocks, u8 const rk2[], u8 iv[], + int first); +asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], + int rounds, int blocks, u8 const rk2[], u8 iv[], + int first); + +struct crypto_aes_xts_ctx { + struct crypto_aes_ctx key1; + struct crypto_aes_ctx __aligned(8) key2; +}; + +static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); + int ret; + + ret = crypto_aes_expand_key(&ctx->key1, in_key, key_len / 2); + if (!ret) + ret = crypto_aes_expand_key(&ctx->key2, &in_key[key_len / 2], + key_len / 2); + if (!ret) + return 0; + + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = 6 + ctx->key_length / 4; + struct blkcipher_walk walk; + unsigned int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { + aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_enc, rounds, blocks, first); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_neon_end(); + return err; +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = 6 + ctx->key_length / 4; + struct blkcipher_walk walk; + unsigned int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { + aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_dec, rounds, blocks, first); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_neon_end(); + return err; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = 6 + ctx->key_length / 4; + struct blkcipher_walk walk; + unsigned int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { + aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_enc, rounds, blocks, walk.iv, + first); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_neon_end(); + return err; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = 6 + ctx->key_length / 4; + struct blkcipher_walk walk; + unsigned int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { + aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_dec, rounds, blocks, walk.iv, + first); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_neon_end(); + return err; +} + +static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = 6 + ctx->key_length / 4; + struct blkcipher_walk walk; + int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + + first = 1; + kernel_neon_begin(); + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { + aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_enc, rounds, blocks, walk.iv, + first); + first = 0; + nbytes -= blocks * AES_BLOCK_SIZE; + if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) + break; + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (nbytes) { + u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; + u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; + u8 __aligned(8) tail[AES_BLOCK_SIZE]; + + /* + * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need + * to tell aes_ctr_encrypt() to only read half a block. + */ + blocks = (nbytes <= 8) ? -1 : 1; + + aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, rounds, + blocks, walk.iv, first); + memcpy(tdst, tail, nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_neon_end(); + + return err; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = 6 + ctx->key1.key_length / 4; + struct blkcipher_walk walk; + unsigned int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { + aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key1.key_enc, rounds, blocks, + (u8 *)ctx->key2.key_enc, walk.iv, first); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_neon_end(); + + return err; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = 6 + ctx->key1.key_length / 4; + struct blkcipher_walk walk; + unsigned int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { + aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key1.key_dec, rounds, blocks, + (u8 *)ctx->key2.key_enc, walk.iv, first); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_neon_end(); + + return err; +} + +static struct crypto_alg aes_algs[] = { { + .cra_name = "__ecb-aes-" MODE, + .cra_driver_name = "__driver-ecb-aes-" MODE, + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = crypto_aes_set_key, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, +}, { + .cra_name = "__cbc-aes-" MODE, + .cra_driver_name = "__driver-cbc-aes-" MODE, + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = crypto_aes_set_key, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, +}, { + .cra_name = "__ctr-aes-" MODE, + .cra_driver_name = "__driver-ctr-aes-" MODE, + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = crypto_aes_set_key, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, + }, +}, { + .cra_name = "__xts-aes-" MODE, + .cra_driver_name = "__driver-xts-aes-" MODE, + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_set_key, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, +}, { + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-" MODE, + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +}, { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-" MODE, + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +}, { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-" MODE, + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +}, { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-" MODE, + .cra_priority = PRIO, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +} }; + +static int __init aes_init(void) +{ + return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); +} + +static void __exit aes_exit(void) +{ + crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); +} + +#ifdef USE_V8_CRYPTO_EXTENSIONS +module_cpu_feature_match(AES, aes_init); +#else +module_init(aes_init); +#endif +module_exit(aes_exit); diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S new file mode 100644 index 000000000000..f6e372c528eb --- /dev/null +++ b/arch/arm64/crypto/aes-modes.S @@ -0,0 +1,532 @@ +/* + * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* included by aes-ce.S and aes-neon.S */ + + .text + .align 4 + +/* + * There are several ways to instantiate this code: + * - no interleave, all inline + * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2) + * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE) + * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4) + * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE) + * + * Macros imported by this code: + * - enc_prepare - setup NEON registers for encryption + * - dec_prepare - setup NEON registers for decryption + * - enc_switch_key - change to new key after having prepared for encryption + * - encrypt_block - encrypt a single block + * - decrypt block - decrypt a single block + * - encrypt_block2x - encrypt 2 blocks in parallel (if INTERLEAVE == 2) + * - decrypt_block2x - decrypt 2 blocks in parallel (if INTERLEAVE == 2) + * - encrypt_block4x - encrypt 4 blocks in parallel (if INTERLEAVE == 4) + * - decrypt_block4x - decrypt 4 blocks in parallel (if INTERLEAVE == 4) + */ + +#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE) +#define FRAME_PUSH stp x29, x30, [sp,#-16]! ; mov x29, sp +#define FRAME_POP ldp x29, x30, [sp],#16 + +#if INTERLEAVE == 2 + +aes_encrypt_block2x: + encrypt_block2x v0, v1, w3, x2, x6, w7 + ret +ENDPROC(aes_encrypt_block2x) + +aes_decrypt_block2x: + decrypt_block2x v0, v1, w3, x2, x6, w7 + ret +ENDPROC(aes_decrypt_block2x) + +#elif INTERLEAVE == 4 + +aes_encrypt_block4x: + encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 + ret +ENDPROC(aes_encrypt_block4x) + +aes_decrypt_block4x: + decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 + ret +ENDPROC(aes_decrypt_block4x) + +#else +#error INTERLEAVE should equal 2 or 4 +#endif + + .macro do_encrypt_block2x + bl aes_encrypt_block2x + .endm + + .macro do_decrypt_block2x + bl aes_decrypt_block2x + .endm + + .macro do_encrypt_block4x + bl aes_encrypt_block4x + .endm + + .macro do_decrypt_block4x + bl aes_decrypt_block4x + .endm + +#else +#define FRAME_PUSH +#define FRAME_POP + + .macro do_encrypt_block2x + encrypt_block2x v0, v1, w3, x2, x6, w7 + .endm + + .macro do_decrypt_block2x + decrypt_block2x v0, v1, w3, x2, x6, w7 + .endm + + .macro do_encrypt_block4x + encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 + .endm + + .macro do_decrypt_block4x + decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7 + .endm + +#endif + + /* + * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, int first) + * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, int first) + */ + +AES_ENTRY(aes_ecb_encrypt) + FRAME_PUSH + cbz w5, .LecbencloopNx + + enc_prepare w3, x2, x5 + +.LecbencloopNx: +#if INTERLEAVE >= 2 + subs w4, w4, #INTERLEAVE + bmi .Lecbenc1x +#if INTERLEAVE == 2 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ + do_encrypt_block2x + st1 {v0.16b-v1.16b}, [x0], #32 +#else + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ + do_encrypt_block4x + st1 {v0.16b-v3.16b}, [x0], #64 +#endif + b .LecbencloopNx +.Lecbenc1x: + adds w4, w4, #INTERLEAVE + beq .Lecbencout +#endif +.Lecbencloop: + ld1 {v0.16b}, [x1], #16 /* get next pt block */ + encrypt_block v0, w3, x2, x5, w6 + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lecbencloop +.Lecbencout: + FRAME_POP + ret +AES_ENDPROC(aes_ecb_encrypt) + + +AES_ENTRY(aes_ecb_decrypt) + FRAME_PUSH + cbz w5, .LecbdecloopNx + + dec_prepare w3, x2, x5 + +.LecbdecloopNx: +#if INTERLEAVE >= 2 + subs w4, w4, #INTERLEAVE + bmi .Lecbdec1x +#if INTERLEAVE == 2 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ + do_decrypt_block2x + st1 {v0.16b-v1.16b}, [x0], #32 +#else + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ + do_decrypt_block4x + st1 {v0.16b-v3.16b}, [x0], #64 +#endif + b .LecbdecloopNx +.Lecbdec1x: + adds w4, w4, #INTERLEAVE + beq .Lecbdecout +#endif +.Lecbdecloop: + ld1 {v0.16b}, [x1], #16 /* get next ct block */ + decrypt_block v0, w3, x2, x5, w6 + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lecbdecloop +.Lecbdecout: + FRAME_POP + ret +AES_ENDPROC(aes_ecb_decrypt) + + + /* + * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[], int first) + * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[], int first) + */ + +AES_ENTRY(aes_cbc_encrypt) + cbz w6, .Lcbcencloop + + ld1 {v0.16b}, [x5] /* get iv */ + enc_prepare w3, x2, x5 + +.Lcbcencloop: + ld1 {v1.16b}, [x1], #16 /* get next pt block */ + eor v0.16b, v0.16b, v1.16b /* ..and xor with iv */ + encrypt_block v0, w3, x2, x5, w6 + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lcbcencloop + ret +AES_ENDPROC(aes_cbc_encrypt) + + +AES_ENTRY(aes_cbc_decrypt) + FRAME_PUSH + cbz w6, .LcbcdecloopNx + + ld1 {v7.16b}, [x5] /* get iv */ + dec_prepare w3, x2, x5 + +.LcbcdecloopNx: +#if INTERLEAVE >= 2 + subs w4, w4, #INTERLEAVE + bmi .Lcbcdec1x +#if INTERLEAVE == 2 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ + mov v2.16b, v0.16b + mov v3.16b, v1.16b + do_decrypt_block2x + eor v0.16b, v0.16b, v7.16b + eor v1.16b, v1.16b, v2.16b + mov v7.16b, v3.16b + st1 {v0.16b-v1.16b}, [x0], #32 +#else + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ + mov v4.16b, v0.16b + mov v5.16b, v1.16b + mov v6.16b, v2.16b + do_decrypt_block4x + sub x1, x1, #16 + eor v0.16b, v0.16b, v7.16b + eor v1.16b, v1.16b, v4.16b + ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */ + eor v2.16b, v2.16b, v5.16b + eor v3.16b, v3.16b, v6.16b + st1 {v0.16b-v3.16b}, [x0], #64 +#endif + b .LcbcdecloopNx +.Lcbcdec1x: + adds w4, w4, #INTERLEAVE + beq .Lcbcdecout +#endif +.Lcbcdecloop: + ld1 {v1.16b}, [x1], #16 /* get next ct block */ + mov v0.16b, v1.16b /* ...and copy to v0 */ + decrypt_block v0, w3, x2, x5, w6 + eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */ + mov v7.16b, v1.16b /* ct is next iv */ + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + bne .Lcbcdecloop +.Lcbcdecout: + FRAME_POP + ret +AES_ENDPROC(aes_cbc_decrypt) + + + /* + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 ctr[], int first) + */ + +AES_ENTRY(aes_ctr_encrypt) + FRAME_PUSH + cbnz w6, .Lctrfirst /* 1st time around? */ + umov x5, v4.d[1] /* keep swabbed ctr in reg */ + rev x5, x5 +#if INTERLEAVE >= 2 + cmn w5, w4 /* 32 bit overflow? */ + bcs .Lctrinc + add x5, x5, #1 /* increment BE ctr */ + b .LctrincNx +#else + b .Lctrinc +#endif +.Lctrfirst: + enc_prepare w3, x2, x6 + ld1 {v4.16b}, [x5] + umov x5, v4.d[1] /* keep swabbed ctr in reg */ + rev x5, x5 +#if INTERLEAVE >= 2 + cmn w5, w4 /* 32 bit overflow? */ + bcs .Lctrloop +.LctrloopNx: + subs w4, w4, #INTERLEAVE + bmi .Lctr1x +#if INTERLEAVE == 2 + mov v0.8b, v4.8b + mov v1.8b, v4.8b + rev x7, x5 + add x5, x5, #1 + ins v0.d[1], x7 + rev x7, x5 + add x5, x5, #1 + ins v1.d[1], x7 + ld1 {v2.16b-v3.16b}, [x1], #32 /* get 2 input blocks */ + do_encrypt_block2x + eor v0.16b, v0.16b, v2.16b + eor v1.16b, v1.16b, v3.16b + st1 {v0.16b-v1.16b}, [x0], #32 +#else + ldr q8, =0x30000000200000001 /* addends 1,2,3[,0] */ + dup v7.4s, w5 + mov v0.16b, v4.16b + add v7.4s, v7.4s, v8.4s + mov v1.16b, v4.16b + rev32 v8.16b, v7.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b + mov v1.s[3], v8.s[0] + mov v2.s[3], v8.s[1] + mov v3.s[3], v8.s[2] + ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */ + do_encrypt_block4x + eor v0.16b, v5.16b, v0.16b + ld1 {v5.16b}, [x1], #16 /* get 1 input block */ + eor v1.16b, v6.16b, v1.16b + eor v2.16b, v7.16b, v2.16b + eor v3.16b, v5.16b, v3.16b + st1 {v0.16b-v3.16b}, [x0], #64 + add x5, x5, #INTERLEAVE +#endif + cbz w4, .LctroutNx +.LctrincNx: + rev x7, x5 + ins v4.d[1], x7 + b .LctrloopNx +.LctroutNx: + sub x5, x5, #1 + rev x7, x5 + ins v4.d[1], x7 + b .Lctrout +.Lctr1x: + adds w4, w4, #INTERLEAVE + beq .Lctrout +#endif +.Lctrloop: + mov v0.16b, v4.16b + encrypt_block v0, w3, x2, x6, w7 + subs w4, w4, #1 + bmi .Lctrhalfblock /* blocks < 0 means 1/2 block */ + ld1 {v3.16b}, [x1], #16 + eor v3.16b, v0.16b, v3.16b + st1 {v3.16b}, [x0], #16 + beq .Lctrout +.Lctrinc: + adds x5, x5, #1 /* increment BE ctr */ + rev x7, x5 + ins v4.d[1], x7 + bcc .Lctrloop /* no overflow? */ + umov x7, v4.d[0] /* load upper word of ctr */ + rev x7, x7 /* ... to handle the carry */ + add x7, x7, #1 + rev x7, x7 + ins v4.d[0], x7 + b .Lctrloop +.Lctrhalfblock: + ld1 {v3.8b}, [x1] + eor v3.8b, v0.8b, v3.8b + st1 {v3.8b}, [x0] +.Lctrout: + FRAME_POP + ret +AES_ENDPROC(aes_ctr_encrypt) + .ltorg + + + /* + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, + * int blocks, u8 const rk2[], u8 iv[], int first) + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, + * int blocks, u8 const rk2[], u8 iv[], int first) + */ + + .macro next_tweak, out, in, const, tmp + sshr \tmp\().2d, \in\().2d, #63 + and \tmp\().16b, \tmp\().16b, \const\().16b + add \out\().2d, \in\().2d, \in\().2d + ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8 + eor \out\().16b, \out\().16b, \tmp\().16b + .endm + +.Lxts_mul_x: + .word 1, 0, 0x87, 0 + +AES_ENTRY(aes_xts_encrypt) + FRAME_PUSH + cbz w7, .LxtsencloopNx + + ld1 {v4.16b}, [x6] + enc_prepare w3, x5, x6 + encrypt_block v4, w3, x5, x6, w7 /* first tweak */ + enc_switch_key w3, x2, x6 + ldr q7, .Lxts_mul_x + b .LxtsencNx + +.LxtsencloopNx: + ldr q7, .Lxts_mul_x + next_tweak v4, v4, v7, v8 +.LxtsencNx: +#if INTERLEAVE >= 2 + subs w4, w4, #INTERLEAVE + bmi .Lxtsenc1x +#if INTERLEAVE == 2 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 pt blocks */ + next_tweak v5, v4, v7, v8 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + do_encrypt_block2x + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + st1 {v0.16b-v1.16b}, [x0], #32 + cbz w4, .LxtsencoutNx + next_tweak v4, v5, v7, v8 + b .LxtsencNx +.LxtsencoutNx: + mov v4.16b, v5.16b + b .Lxtsencout +#else + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */ + next_tweak v5, v4, v7, v8 + eor v0.16b, v0.16b, v4.16b + next_tweak v6, v5, v7, v8 + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + next_tweak v7, v6, v7, v8 + eor v3.16b, v3.16b, v7.16b + do_encrypt_block4x + eor v3.16b, v3.16b, v7.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v7.16b + cbz w4, .Lxtsencout + b .LxtsencloopNx +#endif +.Lxtsenc1x: + adds w4, w4, #INTERLEAVE + beq .Lxtsencout +#endif +.Lxtsencloop: + ld1 {v1.16b}, [x1], #16 + eor v0.16b, v1.16b, v4.16b + encrypt_block v0, w3, x2, x6, w7 + eor v0.16b, v0.16b, v4.16b + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + beq .Lxtsencout + next_tweak v4, v4, v7, v8 + b .Lxtsencloop +.Lxtsencout: + FRAME_POP + ret +AES_ENDPROC(aes_xts_encrypt) + + +AES_ENTRY(aes_xts_decrypt) + FRAME_PUSH + cbz w7, .LxtsdecloopNx + + ld1 {v4.16b}, [x6] + enc_prepare w3, x5, x6 + encrypt_block v4, w3, x5, x6, w7 /* first tweak */ + dec_prepare w3, x2, x6 + ldr q7, .Lxts_mul_x + b .LxtsdecNx + +.LxtsdecloopNx: + ldr q7, .Lxts_mul_x + next_tweak v4, v4, v7, v8 +.LxtsdecNx: +#if INTERLEAVE >= 2 + subs w4, w4, #INTERLEAVE + bmi .Lxtsdec1x +#if INTERLEAVE == 2 + ld1 {v0.16b-v1.16b}, [x1], #32 /* get 2 ct blocks */ + next_tweak v5, v4, v7, v8 + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + do_decrypt_block2x + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + st1 {v0.16b-v1.16b}, [x0], #32 + cbz w4, .LxtsdecoutNx + next_tweak v4, v5, v7, v8 + b .LxtsdecNx +.LxtsdecoutNx: + mov v4.16b, v5.16b + b .Lxtsdecout +#else + ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */ + next_tweak v5, v4, v7, v8 + eor v0.16b, v0.16b, v4.16b + next_tweak v6, v5, v7, v8 + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + next_tweak v7, v6, v7, v8 + eor v3.16b, v3.16b, v7.16b + do_decrypt_block4x + eor v3.16b, v3.16b, v7.16b + eor v0.16b, v0.16b, v4.16b + eor v1.16b, v1.16b, v5.16b + eor v2.16b, v2.16b, v6.16b + st1 {v0.16b-v3.16b}, [x0], #64 + mov v4.16b, v7.16b + cbz w4, .Lxtsdecout + b .LxtsdecloopNx +#endif +.Lxtsdec1x: + adds w4, w4, #INTERLEAVE + beq .Lxtsdecout +#endif +.Lxtsdecloop: + ld1 {v1.16b}, [x1], #16 + eor v0.16b, v1.16b, v4.16b + decrypt_block v0, w3, x2, x6, w7 + eor v0.16b, v0.16b, v4.16b + st1 {v0.16b}, [x0], #16 + subs w4, w4, #1 + beq .Lxtsdecout + next_tweak v4, v4, v7, v8 + b .Lxtsdecloop +.Lxtsdecout: + FRAME_POP + ret +AES_ENDPROC(aes_xts_decrypt) diff --git a/arch/arm64/crypto/aes-neon.S b/arch/arm64/crypto/aes-neon.S new file mode 100644 index 000000000000..b93170e1cc93 --- /dev/null +++ b/arch/arm64/crypto/aes-neon.S @@ -0,0 +1,382 @@ +/* + * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> + +#define AES_ENTRY(func) ENTRY(neon_ ## func) +#define AES_ENDPROC(func) ENDPROC(neon_ ## func) + + /* multiply by polynomial 'x' in GF(2^8) */ + .macro mul_by_x, out, in, temp, const + sshr \temp, \in, #7 + add \out, \in, \in + and \temp, \temp, \const + eor \out, \out, \temp + .endm + + /* preload the entire Sbox */ + .macro prepare, sbox, shiftrows, temp + adr \temp, \sbox + movi v12.16b, #0x40 + ldr q13, \shiftrows + movi v14.16b, #0x1b + ld1 {v16.16b-v19.16b}, [\temp], #64 + ld1 {v20.16b-v23.16b}, [\temp], #64 + ld1 {v24.16b-v27.16b}, [\temp], #64 + ld1 {v28.16b-v31.16b}, [\temp] + .endm + + /* do preload for encryption */ + .macro enc_prepare, ignore0, ignore1, temp + prepare .LForward_Sbox, .LForward_ShiftRows, \temp + .endm + + .macro enc_switch_key, ignore0, ignore1, temp + /* do nothing */ + .endm + + /* do preload for decryption */ + .macro dec_prepare, ignore0, ignore1, temp + prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp + .endm + + /* apply SubBytes transformation using the the preloaded Sbox */ + .macro sub_bytes, in + sub v9.16b, \in\().16b, v12.16b + tbl \in\().16b, {v16.16b-v19.16b}, \in\().16b + sub v10.16b, v9.16b, v12.16b + tbx \in\().16b, {v20.16b-v23.16b}, v9.16b + sub v11.16b, v10.16b, v12.16b + tbx \in\().16b, {v24.16b-v27.16b}, v10.16b + tbx \in\().16b, {v28.16b-v31.16b}, v11.16b + .endm + + /* apply MixColumns transformation */ + .macro mix_columns, in + mul_by_x v10.16b, \in\().16b, v9.16b, v14.16b + rev32 v8.8h, \in\().8h + eor \in\().16b, v10.16b, \in\().16b + shl v9.4s, v8.4s, #24 + shl v11.4s, \in\().4s, #24 + sri v9.4s, v8.4s, #8 + sri v11.4s, \in\().4s, #8 + eor v9.16b, v9.16b, v8.16b + eor v10.16b, v10.16b, v9.16b + eor \in\().16b, v10.16b, v11.16b + .endm + + /* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */ + .macro inv_mix_columns, in + mul_by_x v11.16b, \in\().16b, v10.16b, v14.16b + mul_by_x v11.16b, v11.16b, v10.16b, v14.16b + eor \in\().16b, \in\().16b, v11.16b + rev32 v11.8h, v11.8h + eor \in\().16b, \in\().16b, v11.16b + mix_columns \in + .endm + + .macro do_block, enc, in, rounds, rk, rkp, i + ld1 {v15.16b}, [\rk] + add \rkp, \rk, #16 + mov \i, \rounds +1111: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + tbl \in\().16b, {\in\().16b}, v13.16b /* ShiftRows */ + sub_bytes \in + ld1 {v15.16b}, [\rkp], #16 + subs \i, \i, #1 + beq 2222f + .if \enc == 1 + mix_columns \in + .else + inv_mix_columns \in + .endif + b 1111b +2222: eor \in\().16b, \in\().16b, v15.16b /* ^round key */ + .endm + + .macro encrypt_block, in, rounds, rk, rkp, i + do_block 1, \in, \rounds, \rk, \rkp, \i + .endm + + .macro decrypt_block, in, rounds, rk, rkp, i + do_block 0, \in, \rounds, \rk, \rkp, \i + .endm + + /* + * Interleaved versions: functionally equivalent to the + * ones above, but applied to 2 or 4 AES states in parallel. + */ + + .macro sub_bytes_2x, in0, in1 + sub v8.16b, \in0\().16b, v12.16b + sub v9.16b, \in1\().16b, v12.16b + tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b + tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b + sub v10.16b, v8.16b, v12.16b + sub v11.16b, v9.16b, v12.16b + tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b + tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b + sub v8.16b, v10.16b, v12.16b + sub v9.16b, v11.16b, v12.16b + tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b + tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b + tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b + tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b + .endm + + .macro sub_bytes_4x, in0, in1, in2, in3 + sub v8.16b, \in0\().16b, v12.16b + tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b + sub v9.16b, \in1\().16b, v12.16b + tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b + sub v10.16b, \in2\().16b, v12.16b + tbl \in2\().16b, {v16.16b-v19.16b}, \in2\().16b + sub v11.16b, \in3\().16b, v12.16b + tbl \in3\().16b, {v16.16b-v19.16b}, \in3\().16b + tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b + tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b + sub v8.16b, v8.16b, v12.16b + tbx \in2\().16b, {v20.16b-v23.16b}, v10.16b + sub v9.16b, v9.16b, v12.16b + tbx \in3\().16b, {v20.16b-v23.16b}, v11.16b + sub v10.16b, v10.16b, v12.16b + tbx \in0\().16b, {v24.16b-v27.16b}, v8.16b + sub v11.16b, v11.16b, v12.16b + tbx \in1\().16b, {v24.16b-v27.16b}, v9.16b + sub v8.16b, v8.16b, v12.16b + tbx \in2\().16b, {v24.16b-v27.16b}, v10.16b + sub v9.16b, v9.16b, v12.16b + tbx \in3\().16b, {v24.16b-v27.16b}, v11.16b + sub v10.16b, v10.16b, v12.16b + tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b + sub v11.16b, v11.16b, v12.16b + tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b + tbx \in2\().16b, {v28.16b-v31.16b}, v10.16b + tbx \in3\().16b, {v28.16b-v31.16b}, v11.16b + .endm + + .macro mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const + sshr \tmp0\().16b, \in0\().16b, #7 + add \out0\().16b, \in0\().16b, \in0\().16b + sshr \tmp1\().16b, \in1\().16b, #7 + and \tmp0\().16b, \tmp0\().16b, \const\().16b + add \out1\().16b, \in1\().16b, \in1\().16b + and \tmp1\().16b, \tmp1\().16b, \const\().16b + eor \out0\().16b, \out0\().16b, \tmp0\().16b + eor \out1\().16b, \out1\().16b, \tmp1\().16b + .endm + + .macro mix_columns_2x, in0, in1 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 + rev32 v10.8h, \in0\().8h + rev32 v11.8h, \in1\().8h + eor \in0\().16b, v8.16b, \in0\().16b + eor \in1\().16b, v9.16b, \in1\().16b + shl v12.4s, v10.4s, #24 + shl v13.4s, v11.4s, #24 + eor v8.16b, v8.16b, v10.16b + sri v12.4s, v10.4s, #8 + shl v10.4s, \in0\().4s, #24 + eor v9.16b, v9.16b, v11.16b + sri v13.4s, v11.4s, #8 + shl v11.4s, \in1\().4s, #24 + sri v10.4s, \in0\().4s, #8 + eor \in0\().16b, v8.16b, v12.16b + sri v11.4s, \in1\().4s, #8 + eor \in1\().16b, v9.16b, v13.16b + eor \in0\().16b, v10.16b, \in0\().16b + eor \in1\().16b, v11.16b, \in1\().16b + .endm + + .macro inv_mix_cols_2x, in0, in1 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 + mul_by_x_2x v8, v9, v8, v9, v10, v11, v14 + eor \in0\().16b, \in0\().16b, v8.16b + eor \in1\().16b, \in1\().16b, v9.16b + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + eor \in0\().16b, \in0\().16b, v8.16b + eor \in1\().16b, \in1\().16b, v9.16b + mix_columns_2x \in0, \in1 + .endm + + .macro inv_mix_cols_4x, in0, in1, in2, in3 + mul_by_x_2x v8, v9, \in0, \in1, v10, v11, v14 + mul_by_x_2x v10, v11, \in2, \in3, v12, v13, v14 + mul_by_x_2x v8, v9, v8, v9, v12, v13, v14 + mul_by_x_2x v10, v11, v10, v11, v12, v13, v14 + eor \in0\().16b, \in0\().16b, v8.16b + eor \in1\().16b, \in1\().16b, v9.16b + eor \in2\().16b, \in2\().16b, v10.16b + eor \in3\().16b, \in3\().16b, v11.16b + rev32 v8.8h, v8.8h + rev32 v9.8h, v9.8h + rev32 v10.8h, v10.8h + rev32 v11.8h, v11.8h + eor \in0\().16b, \in0\().16b, v8.16b + eor \in1\().16b, \in1\().16b, v9.16b + eor \in2\().16b, \in2\().16b, v10.16b + eor \in3\().16b, \in3\().16b, v11.16b + mix_columns_2x \in0, \in1 + mix_columns_2x \in2, \in3 + .endm + + .macro do_block_2x, enc, in0, in1 rounds, rk, rkp, i + ld1 {v15.16b}, [\rk] + add \rkp, \rk, #16 + mov \i, \rounds +1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ + sub_bytes_2x \in0, \in1 + tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ + tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ + ld1 {v15.16b}, [\rkp], #16 + subs \i, \i, #1 + beq 2222f + .if \enc == 1 + mix_columns_2x \in0, \in1 + ldr q13, .LForward_ShiftRows + .else + inv_mix_cols_2x \in0, \in1 + ldr q13, .LReverse_ShiftRows + .endif + movi v12.16b, #0x40 + b 1111b +2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ + .endm + + .macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i + ld1 {v15.16b}, [\rk] + add \rkp, \rk, #16 + mov \i, \rounds +1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ + sub_bytes_4x \in0, \in1, \in2, \in3 + tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */ + tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */ + tbl \in2\().16b, {\in2\().16b}, v13.16b /* ShiftRows */ + tbl \in3\().16b, {\in3\().16b}, v13.16b /* ShiftRows */ + ld1 {v15.16b}, [\rkp], #16 + subs \i, \i, #1 + beq 2222f + .if \enc == 1 + mix_columns_2x \in0, \in1 + mix_columns_2x \in2, \in3 + ldr q13, .LForward_ShiftRows + .else + inv_mix_cols_4x \in0, \in1, \in2, \in3 + ldr q13, .LReverse_ShiftRows + .endif + movi v12.16b, #0x40 + b 1111b +2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */ + eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */ + eor \in2\().16b, \in2\().16b, v15.16b /* ^round key */ + eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */ + .endm + + .macro encrypt_block2x, in0, in1, rounds, rk, rkp, i + do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i + .endm + + .macro decrypt_block2x, in0, in1, rounds, rk, rkp, i + do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i + .endm + + .macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i + do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i + .endm + + .macro decrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i + do_block_4x 0, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i + .endm + +#include "aes-modes.S" + + .text + .align 4 +.LForward_ShiftRows: + .byte 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3 + .byte 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb + +.LReverse_ShiftRows: + .byte 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb + .byte 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3 + +.LForward_Sbox: + .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 + .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e + .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.LReverse_Sbox: + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S new file mode 100644 index 000000000000..b9e6eaf41c9b --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -0,0 +1,95 @@ +/* + * Accelerated GHASH implementation with ARMv8 PMULL instructions. + * + * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * Vinodh Gopal + * Erdinc Ozturk + * Deniz Karakoyunlu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + DATA .req v0 + SHASH .req v1 + IN1 .req v2 + T1 .req v2 + T2 .req v3 + T3 .req v4 + VZR .req v5 + + .text + .arch armv8-a+crypto + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +ENTRY(pmull_ghash_update) + ld1 {DATA.16b}, [x1] + ld1 {SHASH.16b}, [x3] + eor VZR.16b, VZR.16b, VZR.16b + + /* do the head block first, if supplied */ + cbz x4, 0f + ld1 {IN1.2d}, [x4] + b 1f + +0: ld1 {IN1.2d}, [x2], #16 + sub w0, w0, #1 +1: ext IN1.16b, IN1.16b, IN1.16b, #8 +CPU_LE( rev64 IN1.16b, IN1.16b ) + eor DATA.16b, DATA.16b, IN1.16b + + /* multiply DATA by SHASH in GF(2^128) */ + ext T2.16b, DATA.16b, DATA.16b, #8 + ext T3.16b, SHASH.16b, SHASH.16b, #8 + eor T2.16b, T2.16b, DATA.16b + eor T3.16b, T3.16b, SHASH.16b + + pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 + pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 + pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) + eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) + eor T2.16b, T2.16b, DATA.16b + + ext T3.16b, VZR.16b, T2.16b, #8 + ext T2.16b, T2.16b, VZR.16b, #8 + eor DATA.16b, DATA.16b, T3.16b + eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of + // carry-less multiplication + + /* first phase of the reduction */ + shl T3.2d, DATA.2d, #1 + eor T3.16b, T3.16b, DATA.16b + shl T3.2d, T3.2d, #5 + eor T3.16b, T3.16b, DATA.16b + shl T3.2d, T3.2d, #57 + ext T2.16b, VZR.16b, T3.16b, #8 + ext T3.16b, T3.16b, VZR.16b, #8 + eor DATA.16b, DATA.16b, T2.16b + eor T1.16b, T1.16b, T3.16b + + /* second phase of the reduction */ + ushr T2.2d, DATA.2d, #5 + eor T2.16b, T2.16b, DATA.16b + ushr T2.2d, T2.2d, #1 + eor T2.16b, T2.16b, DATA.16b + ushr T2.2d, T2.2d, #1 + eor T1.16b, T1.16b, T2.16b + eor DATA.16b, DATA.16b, T1.16b + + cbnz w0, 0b + + st1 {DATA.16b}, [x1] + ret +ENDPROC(pmull_ghash_update) diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c new file mode 100644 index 000000000000..b92baf3f68c7 --- /dev/null +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -0,0 +1,155 @@ +/* + * Accelerated GHASH implementation with ARMv8 PMULL instructions. + * + * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +struct ghash_key { + u64 a; + u64 b; +}; + +struct ghash_desc_ctx { + u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; + u8 buf[GHASH_BLOCK_SIZE]; + u32 count; +}; + +asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, const char *head); + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + + *ctx = (struct ghash_desc_ctx){}; + return 0; +} + +static int ghash_update(struct shash_desc *desc, const u8 *src, + unsigned int len) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + + ctx->count += len; + + if ((partial + len) >= GHASH_BLOCK_SIZE) { + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + int blocks; + + if (partial) { + int p = GHASH_BLOCK_SIZE - partial; + + memcpy(ctx->buf + partial, src, p); + src += p; + len -= p; + } + + blocks = len / GHASH_BLOCK_SIZE; + len %= GHASH_BLOCK_SIZE; + + kernel_neon_begin_partial(6); + pmull_ghash_update(blocks, ctx->digest, src, key, + partial ? ctx->buf : NULL); + kernel_neon_end(); + src += blocks * GHASH_BLOCK_SIZE; + } + if (len) + memcpy(ctx->buf + partial, src, len); + return 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + + if (partial) { + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + + memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); + + kernel_neon_begin_partial(6); + pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); + kernel_neon_end(); + } + put_unaligned_be64(ctx->digest[1], dst); + put_unaligned_be64(ctx->digest[0], dst + 8); + + *ctx = (struct ghash_desc_ctx){}; + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *inkey, unsigned int keylen) +{ + struct ghash_key *key = crypto_shash_ctx(tfm); + u64 a, b; + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + /* perform multiplication by 'x' in GF(2^128) */ + b = get_unaligned_be64(inkey); + a = get_unaligned_be64(inkey + 8); + + key->a = (a << 1) | (b >> 63); + key->b = (b << 1) | (a >> 63); + + if (b >> 63) + key->b ^= 0xc200000000000000UL; + + return 0; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-ce", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_key), + .cra_module = THIS_MODULE, + }, +}; + +static int __init ghash_ce_mod_init(void) +{ + return crypto_register_shash(&ghash_alg); +} + +static void __exit ghash_ce_mod_exit(void) +{ + crypto_unregister_shash(&ghash_alg); +} + +module_cpu_feature_match(PMULL, ghash_ce_mod_init); +module_exit(ghash_ce_mod_exit); diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S new file mode 100644 index 000000000000..09d57d98609c --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -0,0 +1,153 @@ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + k0 .req v0 + k1 .req v1 + k2 .req v2 + k3 .req v3 + + t0 .req v4 + t1 .req v5 + + dga .req q6 + dgav .req v6 + dgb .req s7 + dgbv .req v7 + + dg0q .req q12 + dg0s .req s12 + dg0v .req v12 + dg1s .req s13 + dg1v .req v13 + dg2s .req s14 + + .macro add_only, op, ev, rc, s0, dg1 + .ifc \ev, ev + add t1.4s, v\s0\().4s, \rc\().4s + sha1h dg2s, dg0s + .ifnb \dg1 + sha1\op dg0q, \dg1, t0.4s + .else + sha1\op dg0q, dg1s, t0.4s + .endif + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha1h dg1s, dg0s + sha1\op dg0q, dg2s, t1.4s + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0 v\s0\().4s, v\s1\().4s, v\s2\().4s + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1 v\s0\().4s, v\s3\().4s + .endm + + /* + * The SHA1 round constants + */ + .align 4 +.Lsha1_rcon: + .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 + + /* + * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, + * u8 *head, long bytes) + */ +ENTRY(sha1_ce_transform) + /* load round constants */ + adr x6, .Lsha1_rcon + ld1r {k0.4s}, [x6], #4 + ld1r {k1.4s}, [x6], #4 + ld1r {k2.4s}, [x6], #4 + ld1r {k3.4s}, [x6] + + /* load state */ + ldr dga, [x2] + ldr dgb, [x2, #16] + + /* load partial state (if supplied) */ + cbz x3, 0f + ld1 {v8.4s-v11.4s}, [x3] + b 1f + + /* load input */ +0: ld1 {v8.4s-v11.4s}, [x1], #64 + sub w0, w0, #1 + +1: +CPU_LE( rev32 v8.16b, v8.16b ) +CPU_LE( rev32 v9.16b, v9.16b ) +CPU_LE( rev32 v10.16b, v10.16b ) +CPU_LE( rev32 v11.16b, v11.16b ) + +2: add t0.4s, v8.4s, k0.4s + mov dg0v.16b, dgav.16b + + add_update c, ev, k0, 8, 9, 10, 11, dgb + add_update c, od, k0, 9, 10, 11, 8 + add_update c, ev, k0, 10, 11, 8, 9 + add_update c, od, k0, 11, 8, 9, 10 + add_update c, ev, k1, 8, 9, 10, 11 + + add_update p, od, k1, 9, 10, 11, 8 + add_update p, ev, k1, 10, 11, 8, 9 + add_update p, od, k1, 11, 8, 9, 10 + add_update p, ev, k1, 8, 9, 10, 11 + add_update p, od, k2, 9, 10, 11, 8 + + add_update m, ev, k2, 10, 11, 8, 9 + add_update m, od, k2, 11, 8, 9, 10 + add_update m, ev, k2, 8, 9, 10, 11 + add_update m, od, k2, 9, 10, 11, 8 + add_update m, ev, k3, 10, 11, 8, 9 + + add_update p, od, k3, 11, 8, 9, 10 + add_only p, ev, k3, 9 + add_only p, od, k3, 10 + add_only p, ev, k3, 11 + add_only p, od + + /* update state */ + add dgbv.2s, dgbv.2s, dg1v.2s + add dgav.4s, dgav.4s, dg0v.4s + + cbnz w0, 0b + + /* + * Final block: add padding and total bit count. + * Skip if we have no total byte count in x4. In that case, the input + * size was not a round multiple of the block size, and the padding is + * handled by the C code. + */ + cbz x4, 3f + movi v9.2d, #0 + mov x8, #0x80000000 + movi v10.2d, #0 + ror x7, x4, #29 // ror(lsl(x4, 3), 32) + fmov d8, x8 + mov x4, #0 + mov v11.d[0], xzr + mov v11.d[1], x7 + b 2b + + /* store new state */ +3: str dga, [x2] + str dgb, [x2, #16] + ret +ENDPROC(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c new file mode 100644 index 000000000000..6fe83f37a750 --- /dev/null +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -0,0 +1,174 @@ +/* + * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/sha.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, + u8 *head, long bytes); + +static int sha1_init(struct shash_desc *desc) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + *sctx = (struct sha1_state){ + .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, + }; + return 0; +} + +static int sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; + + sctx->count += len; + + if ((partial + len) >= SHA1_BLOCK_SIZE) { + int blocks; + + if (partial) { + int p = SHA1_BLOCK_SIZE - partial; + + memcpy(sctx->buffer + partial, data, p); + data += p; + len -= p; + } + + blocks = len / SHA1_BLOCK_SIZE; + len %= SHA1_BLOCK_SIZE; + + kernel_neon_begin_partial(16); + sha1_ce_transform(blocks, data, sctx->state, + partial ? sctx->buffer : NULL, 0); + kernel_neon_end(); + + data += blocks * SHA1_BLOCK_SIZE; + partial = 0; + } + if (len) + memcpy(sctx->buffer + partial, data, len); + return 0; +} + +static int sha1_final(struct shash_desc *desc, u8 *out) +{ + static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; + + struct sha1_state *sctx = shash_desc_ctx(desc); + __be64 bits = cpu_to_be64(sctx->count << 3); + __be32 *dst = (__be32 *)out; + int i; + + u32 padlen = SHA1_BLOCK_SIZE + - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); + + sha1_update(desc, padding, padlen); + sha1_update(desc, (const u8 *)&bits, sizeof(bits)); + + for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) + put_unaligned_be32(sctx->state[i], dst++); + + *sctx = (struct sha1_state){}; + return 0; +} + +static int sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + __be32 *dst = (__be32 *)out; + int blocks; + int i; + + if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { + sha1_update(desc, data, len); + return sha1_final(desc, out); + } + + /* + * Use a fast path if the input is a multiple of 64 bytes. In + * this case, there is no need to copy data around, and we can + * perform the entire digest calculation in a single invocation + * of sha1_ce_transform() + */ + blocks = len / SHA1_BLOCK_SIZE; + + kernel_neon_begin_partial(16); + sha1_ce_transform(blocks, data, sctx->state, NULL, len); + kernel_neon_end(); + + for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) + put_unaligned_be32(sctx->state[i], dst++); + + *sctx = (struct sha1_state){}; + return 0; +} + +static int sha1_export(struct shash_desc *desc, void *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + struct sha1_state *dst = out; + + *dst = *sctx; + return 0; +} + +static int sha1_import(struct shash_desc *desc, const void *in) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + struct sha1_state const *src = in; + + *sctx = *src; + return 0; +} + +static struct shash_alg alg = { + .init = sha1_init, + .update = sha1_update, + .final = sha1_final, + .finup = sha1_finup, + .export = sha1_export, + .import = sha1_import, + .descsize = sizeof(struct sha1_state), + .digestsize = SHA1_DIGEST_SIZE, + .statesize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-ce", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sha1_ce_mod_init(void) +{ + return crypto_register_shash(&alg); +} + +static void __exit sha1_ce_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_cpu_feature_match(SHA1, sha1_ce_mod_init); +module_exit(sha1_ce_mod_fini); diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S new file mode 100644 index 000000000000..7f29fc031ea8 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -0,0 +1,156 @@ +/* + * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .arch armv8-a+crypto + + dga .req q20 + dgav .req v20 + dgb .req q21 + dgbv .req v21 + + t0 .req v22 + t1 .req v23 + + dg0q .req q24 + dg0v .req v24 + dg1q .req q25 + dg1v .req v25 + dg2q .req q26 + dg2v .req v26 + + .macro add_only, ev, rc, s0 + mov dg2v.16b, dg0v.16b + .ifeq \ev + add t1.4s, v\s0\().4s, \rc\().4s + sha256h dg0q, dg1q, t0.4s + sha256h2 dg1q, dg2q, t0.4s + .else + .ifnb \s0 + add t0.4s, v\s0\().4s, \rc\().4s + .endif + sha256h dg0q, dg1q, t1.4s + sha256h2 dg1q, dg2q, t1.4s + .endif + .endm + + .macro add_update, ev, rc, s0, s1, s2, s3 + sha256su0 v\s0\().4s, v\s1\().4s + add_only \ev, \rc, \s1 + sha256su1 v\s0\().4s, v\s2\().4s, v\s3\().4s + .endm + + /* + * The SHA-256 round constants + */ + .align 4 +.Lsha2_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, + * u8 *head, long bytes) + */ +ENTRY(sha2_ce_transform) + /* load round constants */ + adr x8, .Lsha2_rcon + ld1 { v0.4s- v3.4s}, [x8], #64 + ld1 { v4.4s- v7.4s}, [x8], #64 + ld1 { v8.4s-v11.4s}, [x8], #64 + ld1 {v12.4s-v15.4s}, [x8] + + /* load state */ + ldp dga, dgb, [x2] + + /* load partial input (if supplied) */ + cbz x3, 0f + ld1 {v16.4s-v19.4s}, [x3] + b 1f + + /* load input */ +0: ld1 {v16.4s-v19.4s}, [x1], #64 + sub w0, w0, #1 + +1: +CPU_LE( rev32 v16.16b, v16.16b ) +CPU_LE( rev32 v17.16b, v17.16b ) +CPU_LE( rev32 v18.16b, v18.16b ) +CPU_LE( rev32 v19.16b, v19.16b ) + +2: add t0.4s, v16.4s, v0.4s + mov dg0v.16b, dgav.16b + mov dg1v.16b, dgbv.16b + + add_update 0, v1, 16, 17, 18, 19 + add_update 1, v2, 17, 18, 19, 16 + add_update 0, v3, 18, 19, 16, 17 + add_update 1, v4, 19, 16, 17, 18 + + add_update 0, v5, 16, 17, 18, 19 + add_update 1, v6, 17, 18, 19, 16 + add_update 0, v7, 18, 19, 16, 17 + add_update 1, v8, 19, 16, 17, 18 + + add_update 0, v9, 16, 17, 18, 19 + add_update 1, v10, 17, 18, 19, 16 + add_update 0, v11, 18, 19, 16, 17 + add_update 1, v12, 19, 16, 17, 18 + + add_only 0, v13, 17 + add_only 1, v14, 18 + add_only 0, v15, 19 + add_only 1 + + /* update state */ + add dgav.4s, dgav.4s, dg0v.4s + add dgbv.4s, dgbv.4s, dg1v.4s + + /* handled all input blocks? */ + cbnz w0, 0b + + /* + * Final block: add padding and total bit count. + * Skip if we have no total byte count in x4. In that case, the input + * size was not a round multiple of the block size, and the padding is + * handled by the C code. + */ + cbz x4, 3f + movi v17.2d, #0 + mov x8, #0x80000000 + movi v18.2d, #0 + ror x7, x4, #29 // ror(lsl(x4, 3), 32) + fmov d16, x8 + mov x4, #0 + mov v19.d[0], xzr + mov v19.d[1], x7 + b 2b + + /* store new state */ +3: stp dga, dgb, [x2] + ret +ENDPROC(sha2_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c new file mode 100644 index 000000000000..c294e67d3925 --- /dev/null +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -0,0 +1,255 @@ +/* + * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions + * + * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/neon.h> +#include <asm/unaligned.h> +#include <crypto/internal/hash.h> +#include <crypto/sha.h> +#include <linux/cpufeature.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state, + u8 *head, long bytes); + +static int sha224_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + *sctx = (struct sha256_state){ + .state = { + SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, + SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, + } + }; + return 0; +} + +static int sha256_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + *sctx = (struct sha256_state){ + .state = { + SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, + SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, + } + }; + return 0; +} + +static int sha2_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; + + sctx->count += len; + + if ((partial + len) >= SHA256_BLOCK_SIZE) { + int blocks; + + if (partial) { + int p = SHA256_BLOCK_SIZE - partial; + + memcpy(sctx->buf + partial, data, p); + data += p; + len -= p; + } + + blocks = len / SHA256_BLOCK_SIZE; + len %= SHA256_BLOCK_SIZE; + + kernel_neon_begin_partial(28); + sha2_ce_transform(blocks, data, sctx->state, + partial ? sctx->buf : NULL, 0); + kernel_neon_end(); + + data += blocks * SHA256_BLOCK_SIZE; + partial = 0; + } + if (len) + memcpy(sctx->buf + partial, data, len); + return 0; +} + +static void sha2_final(struct shash_desc *desc) +{ + static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; + + struct sha256_state *sctx = shash_desc_ctx(desc); + __be64 bits = cpu_to_be64(sctx->count << 3); + u32 padlen = SHA256_BLOCK_SIZE + - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); + + sha2_update(desc, padding, padlen); + sha2_update(desc, (const u8 *)&bits, sizeof(bits)); +} + +static int sha224_final(struct shash_desc *desc, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + __be32 *dst = (__be32 *)out; + int i; + + sha2_final(desc); + + for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) + put_unaligned_be32(sctx->state[i], dst++); + + *sctx = (struct sha256_state){}; + return 0; +} + +static int sha256_final(struct shash_desc *desc, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + __be32 *dst = (__be32 *)out; + int i; + + sha2_final(desc); + + for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) + put_unaligned_be32(sctx->state[i], dst++); + + *sctx = (struct sha256_state){}; + return 0; +} + +static void sha2_finup(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + int blocks; + + if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { + sha2_update(desc, data, len); + sha2_final(desc); + return; + } + + /* + * Use a fast path if the input is a multiple of 64 bytes. In + * this case, there is no need to copy data around, and we can + * perform the entire digest calculation in a single invocation + * of sha2_ce_transform() + */ + blocks = len / SHA256_BLOCK_SIZE; + + kernel_neon_begin_partial(28); + sha2_ce_transform(blocks, data, sctx->state, NULL, len); + kernel_neon_end(); + data += blocks * SHA256_BLOCK_SIZE; +} + +static int sha224_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + __be32 *dst = (__be32 *)out; + int i; + + sha2_finup(desc, data, len); + + for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) + put_unaligned_be32(sctx->state[i], dst++); + + *sctx = (struct sha256_state){}; + return 0; +} + +static int sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + __be32 *dst = (__be32 *)out; + int i; + + sha2_finup(desc, data, len); + + for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) + put_unaligned_be32(sctx->state[i], dst++); + + *sctx = (struct sha256_state){}; + return 0; +} + +static int sha2_export(struct shash_desc *desc, void *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + struct sha256_state *dst = out; + + *dst = *sctx; + return 0; +} + +static int sha2_import(struct shash_desc *desc, const void *in) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + struct sha256_state const *src = in; + + *sctx = *src; + return 0; +} + +static struct shash_alg algs[] = { { + .init = sha224_init, + .update = sha2_update, + .final = sha224_final, + .finup = sha224_finup, + .export = sha2_export, + .import = sha2_import, + .descsize = sizeof(struct sha256_state), + .digestsize = SHA224_DIGEST_SIZE, + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ce", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .init = sha256_init, + .update = sha2_update, + .final = sha256_final, + .finup = sha256_finup, + .export = sha2_export, + .import = sha2_import, + .descsize = sizeof(struct sha256_state), + .digestsize = SHA256_DIGEST_SIZE, + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-ce", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int __init sha2_ce_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha2_ce_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_cpu_feature_match(SHA2, sha2_ce_mod_init); +module_exit(sha2_ce_mod_fini); diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 83f71b3004a8..42c7eecd2bb6 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -40,6 +40,7 @@ generic-y += segment.h generic-y += sembuf.h generic-y += serial.h generic-y += shmbuf.h +generic-y += simd.h generic-y += sizes.h generic-y += socket.h generic-y += sockios.h diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index fd3e3924041b..5901480bfdca 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -21,6 +21,7 @@ #endif #include <asm/ptrace.h> +#include <asm/thread_info.h> /* * Stack pushing/popping (register pairs only). Equivalent to store decrement @@ -68,23 +69,31 @@ msr daifclr, #8 .endm - .macro disable_step, tmp + .macro disable_step_tsk, flgs, tmp + tbz \flgs, #TIF_SINGLESTEP, 9990f mrs \tmp, mdscr_el1 bic \tmp, \tmp, #1 msr mdscr_el1, \tmp + isb // Synchronise with enable_dbg +9990: .endm - .macro enable_step, tmp + .macro enable_step_tsk, flgs, tmp + tbz \flgs, #TIF_SINGLESTEP, 9990f + disable_dbg mrs \tmp, mdscr_el1 orr \tmp, \tmp, #1 msr mdscr_el1, \tmp +9990: .endm - .macro enable_dbg_if_not_stepping, tmp - mrs \tmp, mdscr_el1 - tbnz \tmp, #0, 9990f - enable_dbg -9990: +/* + * Enable both debug exceptions and interrupts. This is likely to be + * faster than two daifclr operations, since writes to this register + * are self-synchronising. + */ + .macro enable_dbg_and_irq + msr daifclr, #(8 | 2) .endm /* diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index 57e8cb49824c..65f1569ac96e 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -157,7 +157,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) */ #define ATOMIC64_INIT(i) { (i) } -#define atomic64_read(v) (*(volatile long long *)&(v)->counter) +#define atomic64_read(v) (*(volatile long *)&(v)->counter) #define atomic64_set(v,i) (((v)->counter) = (i)) static inline void atomic64_add(u64 i, atomic64_t *v) diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 48b9e704af7c..6389d60574d9 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -25,12 +25,12 @@ #define wfi() asm volatile("wfi" : : : "memory") #define isb() asm volatile("isb" : : : "memory") -#define dmb(opt) asm volatile("dmb sy" : : : "memory") -#define dsb(opt) asm volatile("dsb sy" : : : "memory") +#define dmb(opt) asm volatile("dmb " #opt : : : "memory") +#define dsb(opt) asm volatile("dsb " #opt : : : "memory") -#define mb() dsb() -#define rmb() asm volatile("dsb ld" : : : "memory") -#define wmb() asm volatile("dsb st" : : : "memory") +#define mb() dsb(sy) +#define rmb() dsb(ld) +#define wmb() dsb(st) #ifndef CONFIG_SMP #define smp_mb() barrier() @@ -40,7 +40,7 @@ #define smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ - smp_mb(); \ + barrier(); \ ACCESS_ONCE(*p) = (v); \ } while (0) @@ -48,15 +48,15 @@ do { \ ({ \ typeof(*p) ___p1 = ACCESS_ONCE(*p); \ compiletime_assert_atomic_type(*p); \ - smp_mb(); \ + barrier(); \ ___p1; \ }) #else -#define smp_mb() asm volatile("dmb ish" : : : "memory") -#define smp_rmb() asm volatile("dmb ishld" : : : "memory") -#define smp_wmb() asm volatile("dmb ishst" : : : "memory") +#define smp_mb() dmb(ish) +#define smp_rmb() dmb(ishld) +#define smp_wmb() dmb(ishst) #define smp_store_release(p, v) \ do { \ diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 390308a67f0d..88cc05b5f3ac 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -16,6 +16,8 @@ #ifndef __ASM_CACHE_H #define __ASM_CACHE_H +#include <asm/cachetype.h> + #define L1_CACHE_SHIFT 6 #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) @@ -27,6 +29,15 @@ * the CPU. */ #define ARCH_DMA_MINALIGN L1_CACHE_BYTES -#define ARCH_SLAB_MINALIGN 8 + +#ifndef __ASSEMBLY__ + +static inline int cache_line_size(void) +{ + u32 cwg = cache_type_cwg(); + return cwg ? 4 << cwg : L1_CACHE_BYTES; +} + +#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 4c60e64a801c..a5176cf32dad 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -123,7 +123,7 @@ extern void flush_dcache_page(struct page *); static inline void __flush_icache_all(void) { asm("ic ialluis"); - dsb(); + dsb(ish); } #define flush_dcache_mmap_lock(mapping) \ @@ -150,7 +150,7 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) * set_pte_at() called from vmap_pte_range() does not * have a DSB after cleaning the cache line. */ - dsb(); + dsb(ish); } static inline void flush_cache_vunmap(unsigned long start, unsigned long end) diff --git a/arch/arm64/include/asm/cachetype.h b/arch/arm64/include/asm/cachetype.h index 85f5f511352a..4b23e758d5e0 100644 --- a/arch/arm64/include/asm/cachetype.h +++ b/arch/arm64/include/asm/cachetype.h @@ -20,12 +20,16 @@ #define CTR_L1IP_SHIFT 14 #define CTR_L1IP_MASK 3 +#define CTR_CWG_SHIFT 24 +#define CTR_CWG_MASK 15 #define ICACHE_POLICY_RESERVED 0 #define ICACHE_POLICY_AIVIVT 1 #define ICACHE_POLICY_VIPT 2 #define ICACHE_POLICY_PIPT 3 +#ifndef __ASSEMBLY__ + static inline u32 icache_policy(void) { return (read_cpuid_cachetype() >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK; @@ -45,4 +49,11 @@ static inline int icache_is_aivivt(void) return icache_policy() == ICACHE_POLICY_AIVIVT; } +static inline u32 cache_type_cwg(void) +{ + return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK; +} + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_CACHETYPE_H */ diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index 57c0fa7bf711..ddb9d7830558 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -72,7 +72,12 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size } #define xchg(ptr,x) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) +({ \ + __typeof__(*(ptr)) __ret; \ + __ret = (__typeof__(*(ptr))) \ + __xchg((unsigned long)(x), (ptr), sizeof(*(ptr))); \ + __ret; \ +}) static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index e71f81fe127a..253e33bc94fb 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -305,11 +305,6 @@ static inline int is_compat_thread(struct thread_info *thread) #else /* !CONFIG_COMPAT */ -static inline int is_compat_task(void) -{ - return 0; -} - static inline int is_compat_thread(struct thread_info *thread) { return 0; diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h new file mode 100644 index 000000000000..5a46c4e7f539 --- /dev/null +++ b/arch/arm64/include/asm/efi.h @@ -0,0 +1,14 @@ +#ifndef _ASM_EFI_H +#define _ASM_EFI_H + +#include <asm/io.h> + +#ifdef CONFIG_EFI +extern void efi_init(void); +extern void efi_idmap_init(void); +#else +#define efi_init() +#define efi_idmap_init() +#endif + +#endif /* _ASM_EFI_H */ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index c4a7f940b387..72674f4c3871 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -18,9 +18,11 @@ #ifndef __ASM_ESR_H #define __ASM_ESR_H -#define ESR_EL1_EC_SHIFT (26) -#define ESR_EL1_IL (1U << 25) +#define ESR_EL1_WRITE (1 << 6) +#define ESR_EL1_CM (1 << 8) +#define ESR_EL1_IL (1 << 25) +#define ESR_EL1_EC_SHIFT (26) #define ESR_EL1_EC_UNKNOWN (0x00) #define ESR_EL1_EC_WFI (0x01) #define ESR_EL1_EC_CP15_32 (0x03) diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h index c43b4ac13008..50f559f574fe 100644 --- a/arch/arm64/include/asm/fpsimd.h +++ b/arch/arm64/include/asm/fpsimd.h @@ -37,8 +37,21 @@ struct fpsimd_state { u32 fpcr; }; }; + /* the id of the last cpu to have restored this state */ + unsigned int cpu; }; +/* + * Struct for stacking the bottom 'n' FP/SIMD registers. + */ +struct fpsimd_partial_state { + u32 fpsr; + u32 fpcr; + u32 num_regs; + __uint128_t vregs[32]; +}; + + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* Masks for extracting the FPSR and FPCR from the FPSCR */ #define VFP_FPSCR_STAT_MASK 0xf800009f @@ -58,6 +71,16 @@ extern void fpsimd_load_state(struct fpsimd_state *state); extern void fpsimd_thread_switch(struct task_struct *next); extern void fpsimd_flush_thread(void); +extern void fpsimd_preserve_current_state(void); +extern void fpsimd_restore_current_state(void); +extern void fpsimd_update_current_state(struct fpsimd_state *state); + +extern void fpsimd_flush_task_state(struct task_struct *target); + +extern void fpsimd_save_partial_state(struct fpsimd_partial_state *state, + u32 num_regs); +extern void fpsimd_load_partial_state(struct fpsimd_partial_state *state); + #endif #endif diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h index bbec599c96bd..768414d55e64 100644 --- a/arch/arm64/include/asm/fpsimdmacros.h +++ b/arch/arm64/include/asm/fpsimdmacros.h @@ -62,3 +62,38 @@ ldr w\tmpnr, [\state, #16 * 2 + 4] msr fpcr, x\tmpnr .endm + +.altmacro +.macro fpsimd_save_partial state, numnr, tmpnr1, tmpnr2 + mrs x\tmpnr1, fpsr + str w\numnr, [\state, #8] + mrs x\tmpnr2, fpcr + stp w\tmpnr1, w\tmpnr2, [\state] + adr x\tmpnr1, 0f + add \state, \state, x\numnr, lsl #4 + sub x\tmpnr1, x\tmpnr1, x\numnr, lsl #1 + br x\tmpnr1 + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 + .irp qb, %(qa + 1) + stp q\qa, q\qb, [\state, # -16 * \qa - 16] + .endr + .endr +0: +.endm + +.macro fpsimd_restore_partial state, tmpnr1, tmpnr2 + ldp w\tmpnr1, w\tmpnr2, [\state] + msr fpsr, x\tmpnr1 + msr fpcr, x\tmpnr2 + adr x\tmpnr1, 0f + ldr w\tmpnr2, [\state, #8] + add \state, \state, x\tmpnr2, lsl #4 + sub x\tmpnr1, x\tmpnr1, x\tmpnr2, lsl #1 + br x\tmpnr1 + .irp qa, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0 + .irp qb, %(qa + 1) + ldp q\qa, q\qb, [\state, # -16 * \qa - 16] + .endr + .endr +0: +.endm diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h new file mode 100644 index 000000000000..c5534facf941 --- /dev/null +++ b/arch/arm64/include/asm/ftrace.h @@ -0,0 +1,59 @@ +/* + * arch/arm64/include/asm/ftrace.h + * + * Copyright (C) 2013 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#ifndef __ASM_FTRACE_H +#define __ASM_FTRACE_H + +#include <asm/insn.h> + +#define MCOUNT_ADDR ((unsigned long)_mcount) +#define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE + +#ifndef __ASSEMBLY__ +#include <linux/compat.h> + +extern void _mcount(unsigned long); +extern void *return_address(unsigned int); + +struct dyn_arch_ftrace { + /* No extra data needed for arm64 */ +}; + +extern unsigned long ftrace_graph_call; + +static inline unsigned long ftrace_call_adjust(unsigned long addr) +{ + /* + * addr is the address of the mcount call instruction. + * recordmcount does the necessary offset calculation. + */ + return addr; +} + +#define ftrace_return_address(n) return_address(n) + +/* + * Because AArch32 mode does not share the same syscall table with AArch64, + * tracing compat syscalls may result in reporting bogus syscalls or even + * hang-up, so just do not trace them. + * See kernel/trace/trace_syscalls.c + * + * x86 code says: + * If the user realy wants these, then they should use the + * raw syscall tracepoints with filtering. + */ +#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) +{ + return is_compat_task(); +} +#endif /* ifndef __ASSEMBLY__ */ + +#endif /* __ASM_FTRACE_H */ diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index ae4801d77514..0be67821f9ce 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -20,7 +20,7 @@ #include <linux/threads.h> #include <asm/irq.h> -#define NR_IPI 5 +#define NR_IPI 6 typedef struct { unsigned int __softirq_pending; diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index c44ad39ed310..dc1f73b13e74 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -21,6 +21,7 @@ /* A64 instructions are always 32 bits. */ #define AARCH64_INSN_SIZE 4 +#ifndef __ASSEMBLY__ /* * ARM Architecture Reference Manual for ARMv8 Profile-A, Issue A.a * Section C3.1 "A64 instruction index by encoding": @@ -104,5 +105,6 @@ bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn); int aarch64_insn_patch_text_nosync(void *addr, u32 insn); int aarch64_insn_patch_text_sync(void *addrs[], u32 insns[], int cnt); int aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt); +#endif /* __ASSEMBLY__ */ #endif /* __ASM_INSN_H */ diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index a1bef78f0303..e0ecdcf6632d 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -230,19 +230,11 @@ extern void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot extern void __iounmap(volatile void __iomem *addr); extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); -#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_DIRTY) -#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) -#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL_NC)) -#define PROT_NORMAL (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) - #define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) #define ioremap_nocache(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE)) #define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC)) #define iounmap __iounmap -#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF) -#define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PTE_PXN | PTE_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) - #define ARCH_HAS_IOREMAP_WC #include <asm-generic/iomap.h> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index aff0292c8f4d..c2f006c48bdb 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -31,5 +31,7 @@ extern void paging_init(void); extern void setup_mm_for_reboot(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void init_mem_pgprot(void); +/* create an identity mapping for memory (or io if map_io is true) */ +extern void create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io); #endif diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index b0cc58a97780..13ce4cc18e26 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -8,7 +8,11 @@ * published by the Free Software Foundation. */ +#include <linux/types.h> + #define cpu_has_neon() (1) -void kernel_neon_begin(void); +#define kernel_neon_begin() kernel_neon_begin_partial(32) + +void kernel_neon_begin_partial(u32 num_regs); void kernel_neon_end(void); diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 5fc8a66c3924..955e8c5f0afb 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -29,6 +29,8 @@ */ #define PUD_TABLE_BIT (_AT(pgdval_t, 1) << 1) +#define PUD_TYPE_MASK (_AT(pgdval_t, 3) << 0) +#define PUD_TYPE_SECT (_AT(pgdval_t, 1) << 0) /* * Level 2 descriptor (PMD). diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e2f96748859b..598cc384fc1c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -52,66 +52,59 @@ extern void __pgd_error(const char *file, int line, unsigned long val); #endif #define pgd_ERROR(pgd) __pgd_error(__FILE__, __LINE__, pgd_val(pgd)) -/* - * The pgprot_* and protection_map entries will be fixed up at runtime to - * include the cachable and bufferable bits based on memory policy, as well as - * any architecture dependent bits like global/ASID and SMP shared mapping - * bits. - */ -#define _PAGE_DEFAULT PTE_TYPE_PAGE | PTE_AF +#ifdef CONFIG_SMP +#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) +#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) +#else +#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF) +#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF) +#endif -extern pgprot_t pgprot_default; +#define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_DEVICE_nGnRE)) +#define PROT_NORMAL_NC (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL_NC)) +#define PROT_NORMAL (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_ATTRINDX(MT_NORMAL)) -#define __pgprot_modify(prot,mask,bits) \ - __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) +#define PROT_SECT_DEVICE_nGnRE (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE)) +#define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) +#define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) -#define _MOD_PROT(p, b) __pgprot_modify(p, 0, b) +#define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) -#define PAGE_NONE __pgprot_modify(pgprot_default, PTE_TYPE_MASK, PTE_PROT_NONE | PTE_PXN | PTE_UXN) -#define PAGE_SHARED _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) -#define PAGE_SHARED_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) -#define PAGE_COPY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_COPY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_READONLY _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_READONLY_EXEC _MOD_PROT(pgprot_default, PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_KERNEL _MOD_PROT(pgprot_default, PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) -#define PAGE_KERNEL_EXEC _MOD_PROT(pgprot_default, PTE_UXN | PTE_DIRTY | PTE_WRITE) +#define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) +#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) -#define PAGE_HYP _MOD_PROT(pgprot_default, PTE_HYP) +#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) -#define PAGE_S2 __pgprot_modify(pgprot_default, PTE_S2_MEMATTR_MASK, PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) +#define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) #define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDWR | PTE_UXN) -#define __PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) -#define __PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) -#define __PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) -#define __PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define __PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define __PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define __PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) - -#endif /* __ASSEMBLY__ */ - -#define __P000 __PAGE_NONE -#define __P001 __PAGE_READONLY -#define __P010 __PAGE_COPY -#define __P011 __PAGE_COPY -#define __P100 __PAGE_READONLY_EXEC -#define __P101 __PAGE_READONLY_EXEC -#define __P110 __PAGE_COPY_EXEC -#define __P111 __PAGE_COPY_EXEC - -#define __S000 __PAGE_NONE -#define __S001 __PAGE_READONLY -#define __S010 __PAGE_SHARED -#define __S011 __PAGE_SHARED -#define __S100 __PAGE_READONLY_EXEC -#define __S101 __PAGE_READONLY_EXEC -#define __S110 __PAGE_SHARED_EXEC -#define __S111 __PAGE_SHARED_EXEC +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_TYPE_MASK) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) +#define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) +#define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) +#define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) +#define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) +#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) + +#define __P000 PAGE_NONE +#define __P001 PAGE_READONLY +#define __P010 PAGE_COPY +#define __P011 PAGE_COPY +#define __P100 PAGE_READONLY_EXEC +#define __P101 PAGE_READONLY_EXEC +#define __P110 PAGE_COPY_EXEC +#define __P111 PAGE_COPY_EXEC + +#define __S000 PAGE_NONE +#define __S001 PAGE_READONLY +#define __S010 PAGE_SHARED +#define __S011 PAGE_SHARED +#define __S100 PAGE_READONLY_EXEC +#define __S101 PAGE_READONLY_EXEC +#define __S110 PAGE_SHARED_EXEC +#define __S111 PAGE_SHARED_EXEC -#ifndef __ASSEMBLY__ /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -265,6 +258,7 @@ static inline pmd_t pte_pmd(pte_t pte) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK)) +#define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) @@ -273,6 +267,9 @@ static inline int has_transparent_hugepage(void) return 1; } +#define __pgprot_modify(prot,mask,bits) \ + __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) + /* * Mark the prot value as uncacheable and unbufferable. */ @@ -295,11 +292,17 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, #define pmd_sect(pmd) ((pmd_val(pmd) & PMD_TYPE_MASK) == \ PMD_TYPE_SECT) +#ifdef ARM64_64K_PAGES +#define pud_sect(pud) (0) +#else +#define pud_sect(pud) ((pud_val(pud) & PUD_TYPE_MASK) == \ + PUD_TYPE_SECT) +#endif static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { *pmdp = pmd; - dsb(); + dsb(ishst); } static inline void pmd_clear(pmd_t *pmdp) @@ -329,7 +332,7 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd) static inline void set_pud(pud_t *pudp, pud_t pud) { *pudp = pud; - dsb(); + dsb(ishst); } static inline void pud_clear(pud_t *pudp) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 45b20cd6cbca..34de2a8f7d93 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -79,6 +79,7 @@ struct thread_struct { unsigned long tp_value; struct fpsimd_state fpsimd_state; unsigned long fault_address; /* fault info */ + unsigned long fault_code; /* ESR_EL1 value */ struct debug_info debug; /* debugging */ }; diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index c7ba261dd4b3..a429b5940be2 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -135,6 +135,11 @@ struct pt_regs { #define user_stack_pointer(regs) \ (!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp) +static inline unsigned long regs_return_value(struct pt_regs *regs) +{ + return regs->regs[0]; +} + /* * Are the current registers suitable for user mode? (used to maintain * security in signal handlers) diff --git a/arch/arm64/include/asm/sigcontext.h b/arch/arm64/include/asm/sigcontext.h deleted file mode 100644 index dca1094acc74..000000000000 --- a/arch/arm64/include/asm/sigcontext.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ -#ifndef __ASM_SIGCONTEXT_H -#define __ASM_SIGCONTEXT_H - -#include <uapi/asm/sigcontext.h> - -/* - * Auxiliary context saved in the sigcontext.__reserved array. Not exported to - * user space as it will change with the addition of new context. User space - * should check the magic/size information. - */ -struct aux_context { - struct fpsimd_context fpsimd; - /* additional context to be added before "end" */ - struct _aarch64_ctx end; -}; -#endif diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h index 3ee8b303d9a9..64d2d4884a9d 100644 --- a/arch/arm64/include/asm/string.h +++ b/arch/arm64/include/asm/string.h @@ -22,6 +22,18 @@ extern char *strrchr(const char *, int c); #define __HAVE_ARCH_STRCHR extern char *strchr(const char *, int c); +#define __HAVE_ARCH_STRCMP +extern int strcmp(const char *, const char *); + +#define __HAVE_ARCH_STRNCMP +extern int strncmp(const char *, const char *, __kernel_size_t); + +#define __HAVE_ARCH_STRLEN +extern __kernel_size_t strlen(const char *); + +#define __HAVE_ARCH_STRNLEN +extern __kernel_size_t strnlen(const char *, __kernel_size_t); + #define __HAVE_ARCH_MEMCPY extern void *memcpy(void *, const void *, __kernel_size_t); @@ -34,4 +46,7 @@ extern void *memchr(const void *, int, __kernel_size_t); #define __HAVE_ARCH_MEMSET extern void *memset(void *, int, __kernel_size_t); +#define __HAVE_ARCH_MEMCMP +extern int memcmp(const void *, const void *, size_t); + #endif diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index 70ba9d4ee978..383771eb0b87 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -18,6 +18,7 @@ #include <linux/err.h> +extern const void *sys_call_table[]; static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 7b8e3a2a00fb..e40b6d06d515 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -91,6 +91,9 @@ static inline struct thread_info *current_thread_info(void) /* * thread information flags: * TIF_SYSCALL_TRACE - syscall trace active + * TIF_SYSCALL_TRACEPOINT - syscall tracepoint for ftrace + * TIF_SYSCALL_AUDIT - syscall auditing + * TIF_SECOMP - syscall secure computing * TIF_SIGPENDING - signal pending * TIF_NEED_RESCHED - rescheduling necessary * TIF_NOTIFY_RESUME - callback before returning to user @@ -99,7 +102,11 @@ static inline struct thread_info *current_thread_info(void) #define TIF_SIGPENDING 0 #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ +#define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_SYSCALL_TRACE 8 +#define TIF_SYSCALL_AUDIT 9 +#define TIF_SYSCALL_TRACEPOINT 10 +#define TIF_SECCOMP 11 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ #define TIF_FREEZE 19 #define TIF_RESTORE_SIGMASK 20 @@ -110,10 +117,18 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) +#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) +#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME) + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) + +#define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP) #endif /* __KERNEL__ */ #endif /* __ASM_THREAD_INFO_H */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 8b482035cfc2..b9349c4513ea 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -72,9 +72,9 @@ extern struct cpu_tlb_fns cpu_tlb; */ static inline void flush_tlb_all(void) { - dsb(); + dsb(ishst); asm("tlbi vmalle1is"); - dsb(); + dsb(ish); isb(); } @@ -82,9 +82,9 @@ static inline void flush_tlb_mm(struct mm_struct *mm) { unsigned long asid = (unsigned long)ASID(mm) << 48; - dsb(); + dsb(ishst); asm("tlbi aside1is, %0" : : "r" (asid)); - dsb(); + dsb(ish); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -93,16 +93,36 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr = uaddr >> 12 | ((unsigned long)ASID(vma->vm_mm) << 48); - dsb(); + dsb(ishst); asm("tlbi vae1is, %0" : : "r" (addr)); - dsb(); + dsb(ish); } -/* - * Convert calls to our calling convention. - */ -#define flush_tlb_range(vma,start,end) __cpu_flush_user_tlb_range(start,end,vma) -#define flush_tlb_kernel_range(s,e) __cpu_flush_kern_tlb_range(s,e) +static inline void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long asid = (unsigned long)ASID(vma->vm_mm) << 48; + unsigned long addr; + start = asid | (start >> 12); + end = asid | (end >> 12); + + dsb(ishst); + for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) + asm("tlbi vae1is, %0" : : "r"(addr)); + dsb(ish); +} + +static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + unsigned long addr; + start >>= 12; + end >>= 12; + + dsb(ishst); + for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) + asm("tlbi vaae1is, %0" : : "r"(addr)); + dsb(ish); +} /* * On AArch64, the cache coherency is handled via the set_pte_at() function. @@ -114,7 +134,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, * set_pte() does not have a DSB, so make sure that the page table * write is visible. */ - dsb(); + dsb(ishst); } #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0172e6d76bf3..7ebcd31ce51c 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -20,9 +20,6 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_sibling) #define topology_thread_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) -#define mc_capable() (cpu_topology[0].cluster_id != -1) -#define smt_capable() (cpu_topology[0].thread_id != -1) - void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index a4654c656a1e..e5f47df00c24 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -29,3 +29,5 @@ #endif #define __ARCH_WANT_SYS_CLONE #include <uapi/asm/unistd.h> + +#define NR_syscalls (__NR_syscalls) diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h index 690ad51cc901..b72cf405b3fe 100644 --- a/arch/arm64/include/uapi/asm/sigcontext.h +++ b/arch/arm64/include/uapi/asm/sigcontext.h @@ -53,5 +53,12 @@ struct fpsimd_context { __uint128_t vregs[32]; }; +/* ESR_EL1 context */ +#define ESR_MAGIC 0x45535201 + +struct esr_context { + struct _aarch64_ctx head; + u64 esr; +}; #endif /* _UAPI__ASM_SIGCONTEXT_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 7a6fce5167e9..cdaedad3afe5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -4,15 +4,22 @@ CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET) AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) +CFLAGS_efi-stub.o := -DTEXT_OFFSET=$(TEXT_OFFSET) \ + -I$(src)/../../../scripts/dtc/libfdt + +CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_insn.o = -pg +CFLAGS_REMOVE_return_address.o = -pg # Object file lists. arm64-obj-y := cputable.o debug-monitors.o entry.o irq.o fpsimd.o \ entry-fpsimd.o process.o ptrace.o setup.o signal.o \ sys.o stacktrace.o time.o traps.o io.o vdso.o \ - hyp-stub.o psci.o cpu_ops.o insn.o + hyp-stub.o psci.o cpu_ops.o insn.o return_address.o arm64-obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ sys_compat.o +arm64-obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o arm64-obj-$(CONFIG_MODULES) += arm64ksyms.o module.o arm64-obj-$(CONFIG_SMP) += smp.o smp_spin_table.o topology.o arm64-obj-$(CONFIG_PERF_EVENTS) += perf_regs.o @@ -21,6 +28,7 @@ arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o arm64-obj-$(CONFIG_ARM64_CPU_SUSPEND) += sleep.o suspend.o arm64-obj-$(CONFIG_JUMP_LABEL) += jump_label.o arm64-obj-$(CONFIG_KGDB) += kgdb.o +arm64-obj-$(CONFIG_EFI) += efi.o efi-stub.o efi-entry.o obj-y += $(arm64-obj-y) vdso/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index 338b568cd8ae..a85843ddbde8 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -44,10 +44,15 @@ EXPORT_SYMBOL(memstart_addr); /* string / mem functions */ EXPORT_SYMBOL(strchr); EXPORT_SYMBOL(strrchr); +EXPORT_SYMBOL(strcmp); +EXPORT_SYMBOL(strncmp); +EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strnlen); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memchr); +EXPORT_SYMBOL(memcmp); /* atomic bitops */ EXPORT_SYMBOL(set_bit); @@ -56,3 +61,7 @@ EXPORT_SYMBOL(clear_bit); EXPORT_SYMBOL(test_and_clear_bit); EXPORT_SYMBOL(change_bit); EXPORT_SYMBOL(test_and_change_bit); + +#ifdef CONFIG_FUNCTION_TRACER +EXPORT_SYMBOL(_mcount); +#endif diff --git a/arch/arm64/kernel/efi-entry.S b/arch/arm64/kernel/efi-entry.S new file mode 100644 index 000000000000..66716c9b9e5f --- /dev/null +++ b/arch/arm64/kernel/efi-entry.S @@ -0,0 +1,109 @@ +/* + * EFI entry point. + * + * Copyright (C) 2013, 2014 Red Hat, Inc. + * Author: Mark Salter <msalter@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/linkage.h> +#include <linux/init.h> + +#include <asm/assembler.h> + +#define EFI_LOAD_ERROR 0x8000000000000001 + + __INIT + + /* + * We arrive here from the EFI boot manager with: + * + * * CPU in little-endian mode + * * MMU on with identity-mapped RAM + * * Icache and Dcache on + * + * We will most likely be running from some place other than where + * we want to be. The kernel image wants to be placed at TEXT_OFFSET + * from start of RAM. + */ +ENTRY(efi_stub_entry) + /* + * Create a stack frame to save FP/LR with extra space + * for image_addr variable passed to efi_entry(). + */ + stp x29, x30, [sp, #-32]! + + /* + * Call efi_entry to do the real work. + * x0 and x1 are already set up by firmware. Current runtime + * address of image is calculated and passed via *image_addr. + * + * unsigned long efi_entry(void *handle, + * efi_system_table_t *sys_table, + * unsigned long *image_addr) ; + */ + adrp x8, _text + add x8, x8, #:lo12:_text + add x2, sp, 16 + str x8, [x2] + bl efi_entry + cmn x0, #1 + b.eq efi_load_fail + + /* + * efi_entry() will have relocated the kernel image if necessary + * and we return here with device tree address in x0 and the kernel + * entry point stored at *image_addr. Save those values in registers + * which are callee preserved. + */ + mov x20, x0 // DTB address + ldr x0, [sp, #16] // relocated _text address + mov x21, x0 + + /* + * Flush dcache covering current runtime addresses + * of kernel text/data. Then flush all of icache. + */ + adrp x1, _text + add x1, x1, #:lo12:_text + adrp x2, _edata + add x2, x2, #:lo12:_edata + sub x1, x2, x1 + + bl __flush_dcache_area + ic ialluis + + /* Turn off Dcache and MMU */ + mrs x0, CurrentEL + cmp x0, #PSR_MODE_EL2t + ccmp x0, #PSR_MODE_EL2h, #0x4, ne + b.ne 1f + mrs x0, sctlr_el2 + bic x0, x0, #1 << 0 // clear SCTLR.M + bic x0, x0, #1 << 2 // clear SCTLR.C + msr sctlr_el2, x0 + isb + b 2f +1: + mrs x0, sctlr_el1 + bic x0, x0, #1 << 0 // clear SCTLR.M + bic x0, x0, #1 << 2 // clear SCTLR.C + msr sctlr_el1, x0 + isb +2: + /* Jump to kernel entry point */ + mov x0, x20 + mov x1, xzr + mov x2, xzr + mov x3, xzr + br x21 + +efi_load_fail: + mov x0, #EFI_LOAD_ERROR + ldp x29, x30, [sp], #32 + ret + +ENDPROC(efi_stub_entry) diff --git a/arch/arm64/kernel/efi-stub.c b/arch/arm64/kernel/efi-stub.c new file mode 100644 index 000000000000..60e98a639ac5 --- /dev/null +++ b/arch/arm64/kernel/efi-stub.c @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2013, 2014 Linaro Ltd; <roy.franz@linaro.org> + * + * This file implements the EFI boot stub for the arm64 kernel. + * Adapted from ARM version by Mark Salter <msalter@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/efi.h> +#include <linux/libfdt.h> +#include <asm/sections.h> +#include <generated/compile.h> +#include <generated/utsrelease.h> + +/* + * AArch64 requires the DTB to be 8-byte aligned in the first 512MiB from + * start of kernel and may not cross a 2MiB boundary. We set alignment to + * 2MiB so we know it won't cross a 2MiB boundary. + */ +#define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ +#define MAX_FDT_OFFSET SZ_512M + +#define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) + +static void efi_char16_printk(efi_system_table_t *sys_table_arg, + efi_char16_t *str); + +static efi_status_t efi_open_volume(efi_system_table_t *sys_table, + void *__image, void **__fh); +static efi_status_t efi_file_close(void *handle); + +static efi_status_t +efi_file_read(void *handle, unsigned long *size, void *addr); + +static efi_status_t +efi_file_size(efi_system_table_t *sys_table, void *__fh, + efi_char16_t *filename_16, void **handle, u64 *file_sz); + +/* Include shared EFI stub code */ +#include "../../../drivers/firmware/efi/efi-stub-helper.c" +#include "../../../drivers/firmware/efi/fdt.c" +#include "../../../drivers/firmware/efi/arm-stub.c" + + +static efi_status_t handle_kernel_image(efi_system_table_t *sys_table, + unsigned long *image_addr, + unsigned long *image_size, + unsigned long *reserve_addr, + unsigned long *reserve_size, + unsigned long dram_base, + efi_loaded_image_t *image) +{ + efi_status_t status; + unsigned long kernel_size, kernel_memsize = 0; + + /* Relocate the image, if required. */ + kernel_size = _edata - _text; + if (*image_addr != (dram_base + TEXT_OFFSET)) { + kernel_memsize = kernel_size + (_end - _edata); + status = efi_relocate_kernel(sys_table, image_addr, + kernel_size, kernel_memsize, + dram_base + TEXT_OFFSET, + PAGE_SIZE); + if (status != EFI_SUCCESS) { + pr_efi_err(sys_table, "Failed to relocate kernel\n"); + return status; + } + if (*image_addr != (dram_base + TEXT_OFFSET)) { + pr_efi_err(sys_table, "Failed to alloc kernel memory\n"); + efi_free(sys_table, kernel_memsize, *image_addr); + return EFI_ERROR; + } + *image_size = kernel_memsize; + } + + + return EFI_SUCCESS; +} diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c new file mode 100644 index 000000000000..14db1f6e8d7f --- /dev/null +++ b/arch/arm64/kernel/efi.c @@ -0,0 +1,469 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 2.4 + * + * Copyright (C) 2013, 2014 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/efi.h> +#include <linux/export.h> +#include <linux/memblock.h> +#include <linux/bootmem.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/sched.h> +#include <linux/slab.h> + +#include <asm/cacheflush.h> +#include <asm/efi.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> + +struct efi_memory_map memmap; + +static efi_runtime_services_t *runtime; + +static u64 efi_system_table; + +static int uefi_debug __initdata; +static int __init uefi_debug_setup(char *str) +{ + uefi_debug = 1; + + return 0; +} +early_param("uefi_debug", uefi_debug_setup); + +static int __init is_normal_ram(efi_memory_desc_t *md) +{ + if (md->attribute & EFI_MEMORY_WB) + return 1; + return 0; +} + +static void __init efi_setup_idmap(void) +{ + struct memblock_region *r; + efi_memory_desc_t *md; + u64 paddr, npages, size; + + for_each_memblock(memory, r) + create_id_mapping(r->base, r->size, 0); + + /* map runtime io spaces */ + for_each_efi_memory_desc(&memmap, md) { + if (!(md->attribute & EFI_MEMORY_RUNTIME) || is_normal_ram(md)) + continue; + paddr = md->phys_addr; + npages = md->num_pages; + memrange_efi_to_native(&paddr, &npages); + size = npages << PAGE_SHIFT; + create_id_mapping(paddr, size, 1); + } +} + +static int __init uefi_init(void) +{ + efi_char16_t *c16; + char vendor[100] = "unknown"; + int i, retval; + + efi.systab = early_memremap(efi_system_table, + sizeof(efi_system_table_t)); + if (efi.systab == NULL) { + pr_warn("Unable to map EFI system table.\n"); + return -ENOMEM; + } + + set_bit(EFI_BOOT, &efi.flags); + set_bit(EFI_64BIT, &efi.flags); + + /* + * Verify the EFI Table + */ + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { + pr_err("System table signature incorrect\n"); + return -EINVAL; + } + if ((efi.systab->hdr.revision >> 16) < 2) + pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* Show what we know for posterity */ + c16 = early_memremap(efi.systab->fw_vendor, + sizeof(vendor)); + if (c16) { + for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) + vendor[i] = c16[i]; + vendor[i] = '\0'; + } + + pr_info("EFI v%u.%.02u by %s\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + retval = efi_config_init(NULL); + if (retval == 0) + set_bit(EFI_CONFIG_TABLES, &efi.flags); + + early_memunmap(c16, sizeof(vendor)); + early_memunmap(efi.systab, sizeof(efi_system_table_t)); + + return retval; +} + +static __initdata char memory_type_name[][32] = { + {"Reserved"}, + {"Loader Code"}, + {"Loader Data"}, + {"Boot Code"}, + {"Boot Data"}, + {"Runtime Code"}, + {"Runtime Data"}, + {"Conventional Memory"}, + {"Unusable Memory"}, + {"ACPI Reclaim Memory"}, + {"ACPI Memory NVS"}, + {"Memory Mapped I/O"}, + {"MMIO Port Space"}, + {"PAL Code"}, +}; + +/* + * Return true for RAM regions we want to permanently reserve. + */ +static __init int is_reserve_region(efi_memory_desc_t *md) +{ + if (!is_normal_ram(md)) + return 0; + + if (md->attribute & EFI_MEMORY_RUNTIME) + return 1; + + if (md->type == EFI_ACPI_RECLAIM_MEMORY || + md->type == EFI_RESERVED_TYPE) + return 1; + + return 0; +} + +static __init void reserve_regions(void) +{ + efi_memory_desc_t *md; + u64 paddr, npages, size; + + if (uefi_debug) + pr_info("Processing EFI memory map:\n"); + + for_each_efi_memory_desc(&memmap, md) { + paddr = md->phys_addr; + npages = md->num_pages; + + if (uefi_debug) + pr_info(" 0x%012llx-0x%012llx [%s]", + paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, + memory_type_name[md->type]); + + memrange_efi_to_native(&paddr, &npages); + size = npages << PAGE_SHIFT; + + if (is_normal_ram(md)) + early_init_dt_add_memory_arch(paddr, size); + + if (is_reserve_region(md) || + md->type == EFI_BOOT_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_DATA) { + memblock_reserve(paddr, size); + if (uefi_debug) + pr_cont("*"); + } + + if (uefi_debug) + pr_cont("\n"); + } +} + + +static u64 __init free_one_region(u64 start, u64 end) +{ + u64 size = end - start; + + if (uefi_debug) + pr_info(" EFI freeing: 0x%012llx-0x%012llx\n", start, end - 1); + + free_bootmem_late(start, size); + return size; +} + +static u64 __init free_region(u64 start, u64 end) +{ + u64 map_start, map_end, total = 0; + + if (end <= start) + return total; + + map_start = (u64)memmap.phys_map; + map_end = PAGE_ALIGN(map_start + (memmap.map_end - memmap.map)); + map_start &= PAGE_MASK; + + if (start < map_end && end > map_start) { + /* region overlaps UEFI memmap */ + if (start < map_start) + total += free_one_region(start, map_start); + + if (map_end < end) + total += free_one_region(map_end, end); + } else + total += free_one_region(start, end); + + return total; +} + +static void __init free_boot_services(void) +{ + u64 total_freed = 0; + u64 keep_end, free_start, free_end; + efi_memory_desc_t *md; + + /* + * If kernel uses larger pages than UEFI, we have to be careful + * not to inadvertantly free memory we want to keep if there is + * overlap at the kernel page size alignment. We do not want to + * free is_reserve_region() memory nor the UEFI memmap itself. + * + * The memory map is sorted, so we keep track of the end of + * any previous region we want to keep, remember any region + * we want to free and defer freeing it until we encounter + * the next region we want to keep. This way, before freeing + * it, we can clip it as needed to avoid freeing memory we + * want to keep for UEFI. + */ + + keep_end = 0; + free_start = 0; + + for_each_efi_memory_desc(&memmap, md) { + u64 paddr, npages, size; + + if (is_reserve_region(md)) { + /* + * We don't want to free any memory from this region. + */ + if (free_start) { + /* adjust free_end then free region */ + if (free_end > md->phys_addr) + free_end -= PAGE_SIZE; + total_freed += free_region(free_start, free_end); + free_start = 0; + } + keep_end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + continue; + } + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) { + /* no need to free this region */ + continue; + } + + /* + * We want to free memory from this region. + */ + paddr = md->phys_addr; + npages = md->num_pages; + memrange_efi_to_native(&paddr, &npages); + size = npages << PAGE_SHIFT; + + if (free_start) { + if (paddr <= free_end) + free_end = paddr + size; + else { + total_freed += free_region(free_start, free_end); + free_start = paddr; + free_end = paddr + size; + } + } else { + free_start = paddr; + free_end = paddr + size; + } + if (free_start < keep_end) { + free_start += PAGE_SIZE; + if (free_start >= free_end) + free_start = 0; + } + } + if (free_start) + total_freed += free_region(free_start, free_end); + + if (total_freed) + pr_info("Freed 0x%llx bytes of EFI boot services memory", + total_freed); +} + +void __init efi_init(void) +{ + struct efi_fdt_params params; + + /* Grab UEFI information placed in FDT by stub */ + if (!efi_get_fdt_params(¶ms, uefi_debug)) + return; + + efi_system_table = params.system_table; + + memblock_reserve(params.mmap & PAGE_MASK, + PAGE_ALIGN(params.mmap_size + (params.mmap & ~PAGE_MASK))); + memmap.phys_map = (void *)params.mmap; + memmap.map = early_memremap(params.mmap, params.mmap_size); + memmap.map_end = memmap.map + params.mmap_size; + memmap.desc_size = params.desc_size; + memmap.desc_version = params.desc_ver; + + if (uefi_init() < 0) + return; + + reserve_regions(); +} + +void __init efi_idmap_init(void) +{ + if (!efi_enabled(EFI_BOOT)) + return; + + /* boot time idmap_pg_dir is incomplete, so fill in missing parts */ + efi_setup_idmap(); +} + +static int __init remap_region(efi_memory_desc_t *md, void **new) +{ + u64 paddr, vaddr, npages, size; + + paddr = md->phys_addr; + npages = md->num_pages; + memrange_efi_to_native(&paddr, &npages); + size = npages << PAGE_SHIFT; + + if (is_normal_ram(md)) + vaddr = (__force u64)ioremap_cache(paddr, size); + else + vaddr = (__force u64)ioremap(paddr, size); + + if (!vaddr) { + pr_err("Unable to remap 0x%llx pages @ %p\n", + npages, (void *)paddr); + return 0; + } + + /* adjust for any rounding when EFI and system pagesize differs */ + md->virt_addr = vaddr + (md->phys_addr - paddr); + + if (uefi_debug) + pr_info(" EFI remap 0x%012llx => %p\n", + md->phys_addr, (void *)md->virt_addr); + + memcpy(*new, md, memmap.desc_size); + *new += memmap.desc_size; + + return 1; +} + +/* + * Switch UEFI from an identity map to a kernel virtual map + */ +static int __init arm64_enter_virtual_mode(void) +{ + efi_memory_desc_t *md; + phys_addr_t virtmap_phys; + void *virtmap, *virt_md; + efi_status_t status; + u64 mapsize; + int count = 0; + unsigned long flags; + + if (!efi_enabled(EFI_BOOT)) { + pr_info("EFI services will not be available.\n"); + return -1; + } + + pr_info("Remapping and enabling EFI services.\n"); + + /* replace early memmap mapping with permanent mapping */ + mapsize = memmap.map_end - memmap.map; + early_memunmap(memmap.map, mapsize); + memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map, + mapsize); + memmap.map_end = memmap.map + mapsize; + + efi.memmap = &memmap; + + /* Map the runtime regions */ + virtmap = kmalloc(mapsize, GFP_KERNEL); + if (!virtmap) { + pr_err("Failed to allocate EFI virtual memmap\n"); + return -1; + } + virtmap_phys = virt_to_phys(virtmap); + virt_md = virtmap; + + for_each_efi_memory_desc(&memmap, md) { + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + if (remap_region(md, &virt_md)) + ++count; + } + + efi.systab = (__force void *)efi_lookup_mapped_addr(efi_system_table); + if (efi.systab) + set_bit(EFI_SYSTEM_TABLES, &efi.flags); + + local_irq_save(flags); + cpu_switch_mm(idmap_pg_dir, &init_mm); + + /* Call SetVirtualAddressMap with the physical address of the map */ + runtime = efi.systab->runtime; + efi.set_virtual_address_map = runtime->set_virtual_address_map; + + status = efi.set_virtual_address_map(count * memmap.desc_size, + memmap.desc_size, + memmap.desc_version, + (efi_memory_desc_t *)virtmap_phys); + cpu_set_reserved_ttbr0(); + flush_tlb_all(); + local_irq_restore(flags); + + kfree(virtmap); + + free_boot_services(); + + if (status != EFI_SUCCESS) { + pr_err("Failed to set EFI virtual address map! [%lx]\n", + status); + return -1; + } + + /* Set up runtime services function pointers */ + runtime = efi.systab->runtime; + efi.get_time = runtime->get_time; + efi.set_time = runtime->set_time; + efi.get_wakeup_time = runtime->get_wakeup_time; + efi.set_wakeup_time = runtime->set_wakeup_time; + efi.get_variable = runtime->get_variable; + efi.get_next_variable = runtime->get_next_variable; + efi.set_variable = runtime->set_variable; + efi.query_variable_info = runtime->query_variable_info; + efi.update_capsule = runtime->update_capsule; + efi.query_capsule_caps = runtime->query_capsule_caps; + efi.get_next_high_mono_count = runtime->get_next_high_mono_count; + efi.reset_system = runtime->reset_system; + + set_bit(EFI_RUNTIME_SERVICES, &efi.flags); + + return 0; +} +early_initcall(arm64_enter_virtual_mode); diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 6a27cd6dbfa6..d358ccacfc00 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -41,3 +41,27 @@ ENTRY(fpsimd_load_state) fpsimd_restore x0, 8 ret ENDPROC(fpsimd_load_state) + +#ifdef CONFIG_KERNEL_MODE_NEON + +/* + * Save the bottom n FP registers. + * + * x0 - pointer to struct fpsimd_partial_state + */ +ENTRY(fpsimd_save_partial_state) + fpsimd_save_partial x0, 1, 8, 9 + ret +ENDPROC(fpsimd_load_partial_state) + +/* + * Load the bottom n FP registers. + * + * x0 - pointer to struct fpsimd_partial_state + */ +ENTRY(fpsimd_load_partial_state) + fpsimd_restore_partial x0, 8, 9 + ret +ENDPROC(fpsimd_load_partial_state) + +#endif diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S new file mode 100644 index 000000000000..b051871f2965 --- /dev/null +++ b/arch/arm64/kernel/entry-ftrace.S @@ -0,0 +1,218 @@ +/* + * arch/arm64/kernel/entry-ftrace.S + * + * Copyright (C) 2013 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/ftrace.h> +#include <asm/insn.h> + +/* + * Gcc with -pg will put the following code in the beginning of each function: + * mov x0, x30 + * bl _mcount + * [function's body ...] + * "bl _mcount" may be replaced to "bl ftrace_caller" or NOP if dynamic + * ftrace is enabled. + * + * Please note that x0 as an argument will not be used here because we can + * get lr(x30) of instrumented function at any time by winding up call stack + * as long as the kernel is compiled without -fomit-frame-pointer. + * (or CONFIG_FRAME_POINTER, this is forced on arm64) + * + * stack layout after mcount_enter in _mcount(): + * + * current sp/fp => 0:+-----+ + * in _mcount() | x29 | -> instrumented function's fp + * +-----+ + * | x30 | -> _mcount()'s lr (= instrumented function's pc) + * old sp => +16:+-----+ + * when instrumented | | + * function calls | ... | + * _mcount() | | + * | | + * instrumented => +xx:+-----+ + * function's fp | x29 | -> parent's fp + * +-----+ + * | x30 | -> instrumented function's lr (= parent's pc) + * +-----+ + * | ... | + */ + + .macro mcount_enter + stp x29, x30, [sp, #-16]! + mov x29, sp + .endm + + .macro mcount_exit + ldp x29, x30, [sp], #16 + ret + .endm + + .macro mcount_adjust_addr rd, rn + sub \rd, \rn, #AARCH64_INSN_SIZE + .endm + + /* for instrumented function's parent */ + .macro mcount_get_parent_fp reg + ldr \reg, [x29] + ldr \reg, [\reg] + .endm + + /* for instrumented function */ + .macro mcount_get_pc0 reg + mcount_adjust_addr \reg, x30 + .endm + + .macro mcount_get_pc reg + ldr \reg, [x29, #8] + mcount_adjust_addr \reg, \reg + .endm + + .macro mcount_get_lr reg + ldr \reg, [x29] + ldr \reg, [\reg, #8] + mcount_adjust_addr \reg, \reg + .endm + + .macro mcount_get_lr_addr reg + ldr \reg, [x29] + add \reg, \reg, #8 + .endm + +#ifndef CONFIG_DYNAMIC_FTRACE +/* + * void _mcount(unsigned long return_address) + * @return_address: return address to instrumented function + * + * This function makes calls, if enabled, to: + * - tracer function to probe instrumented function's entry, + * - ftrace_graph_caller to set up an exit hook + */ +ENTRY(_mcount) +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ldr x0, =ftrace_trace_stop + ldr x0, [x0] // if ftrace_trace_stop + ret // return; +#endif + mcount_enter + + ldr x0, =ftrace_trace_function + ldr x2, [x0] + adr x0, ftrace_stub + cmp x0, x2 // if (ftrace_trace_function + b.eq skip_ftrace_call // != ftrace_stub) { + + mcount_get_pc x0 // function's pc + mcount_get_lr x1 // function's lr (= parent's pc) + blr x2 // (*ftrace_trace_function)(pc, lr); + +#ifndef CONFIG_FUNCTION_GRAPH_TRACER +skip_ftrace_call: // return; + mcount_exit // } +#else + mcount_exit // return; + // } +skip_ftrace_call: + ldr x1, =ftrace_graph_return + ldr x2, [x1] // if ((ftrace_graph_return + cmp x0, x2 // != ftrace_stub) + b.ne ftrace_graph_caller + + ldr x1, =ftrace_graph_entry // || (ftrace_graph_entry + ldr x2, [x1] // != ftrace_graph_entry_stub)) + ldr x0, =ftrace_graph_entry_stub + cmp x0, x2 + b.ne ftrace_graph_caller // ftrace_graph_caller(); + + mcount_exit +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +ENDPROC(_mcount) + +#else /* CONFIG_DYNAMIC_FTRACE */ +/* + * _mcount() is used to build the kernel with -pg option, but all the branch + * instructions to _mcount() are replaced to NOP initially at kernel start up, + * and later on, NOP to branch to ftrace_caller() when enabled or branch to + * NOP when disabled per-function base. + */ +ENTRY(_mcount) + ret +ENDPROC(_mcount) + +/* + * void ftrace_caller(unsigned long return_address) + * @return_address: return address to instrumented function + * + * This function is a counterpart of _mcount() in 'static' ftrace, and + * makes calls to: + * - tracer function to probe instrumented function's entry, + * - ftrace_graph_caller to set up an exit hook + */ +ENTRY(ftrace_caller) + mcount_enter + + mcount_get_pc0 x0 // function's pc + mcount_get_lr x1 // function's lr + + .global ftrace_call +ftrace_call: // tracer(pc, lr); + nop // This will be replaced with "bl xxx" + // where xxx can be any kind of tracer. + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + .global ftrace_graph_call +ftrace_graph_call: // ftrace_graph_caller(); + nop // If enabled, this will be replaced + // "b ftrace_graph_caller" +#endif + + mcount_exit +ENDPROC(ftrace_caller) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +ENTRY(ftrace_stub) + ret +ENDPROC(ftrace_stub) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * void ftrace_graph_caller(void) + * + * Called from _mcount() or ftrace_caller() when function_graph tracer is + * selected. + * This function w/ prepare_ftrace_return() fakes link register's value on + * the call stack in order to intercept instrumented function's return path + * and run return_to_handler() later on its exit. + */ +ENTRY(ftrace_graph_caller) + mcount_get_lr_addr x0 // pointer to function's saved lr + mcount_get_pc x1 // function's pc + mcount_get_parent_fp x2 // parent's fp + bl prepare_ftrace_return // prepare_ftrace_return(&lr, pc, fp) + + mcount_exit +ENDPROC(ftrace_graph_caller) + +/* + * void return_to_handler(void) + * + * Run ftrace_return_to_handler() before going back to parent. + * @fp is checked against the value passed by ftrace_graph_caller() + * only when CONFIG_FUNCTION_GRAPH_FP_TEST is enabled. + */ +ENTRY(return_to_handler) + str x0, [sp, #-16]! + mov x0, x29 // parent's fp + bl ftrace_return_to_handler// addr = ftrace_return_to_hander(fp); + mov x30, x0 // restore the original return address + ldr x0, [sp], #16 + ret +END(return_to_handler) +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 39ac630d83de..bf017f4ffb4f 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -60,6 +60,9 @@ push x0, x1 .if \el == 0 mrs x21, sp_el0 + get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, + ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug + disable_step_tsk x19, x20 // exceptions when scheduling. .else add x21, sp, #S_FRAME_SIZE .endif @@ -259,7 +262,7 @@ el1_da: * Data abort handling */ mrs x0, far_el1 - enable_dbg_if_not_stepping x2 + enable_dbg // re-enable interrupts if they were enabled in the aborted context tbnz x23, #7, 1f // PSR_I_BIT enable_irq @@ -275,6 +278,7 @@ el1_sp_pc: * Stack or PC alignment exception handling */ mrs x0, far_el1 + enable_dbg mov x1, x25 mov x2, sp b do_sp_pc_abort @@ -282,6 +286,7 @@ el1_undef: /* * Undefined instruction */ + enable_dbg mov x0, sp b do_undefinstr el1_dbg: @@ -294,10 +299,11 @@ el1_dbg: mrs x0, far_el1 mov x2, sp // struct pt_regs bl do_debug_exception - + enable_dbg kernel_exit 1 el1_inv: // TODO: add support for undefined instructions in kernel mode + enable_dbg mov x0, sp mov x1, #BAD_SYNC mrs x2, esr_el1 @@ -307,7 +313,7 @@ ENDPROC(el1_sync) .align 6 el1_irq: kernel_entry 1 - enable_dbg_if_not_stepping x0 + enable_dbg #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off #endif @@ -332,8 +338,7 @@ ENDPROC(el1_irq) #ifdef CONFIG_PREEMPT el1_preempt: mov x24, lr -1: enable_dbg - bl preempt_schedule_irq // irq en/disable is done inside +1: bl preempt_schedule_irq // irq en/disable is done inside ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? ret x24 @@ -349,7 +354,7 @@ el0_sync: lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class cmp x24, #ESR_EL1_EC_SVC64 // SVC in 64-bit state b.eq el0_svc - adr lr, ret_from_exception + adr lr, ret_to_user cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 b.eq el0_da cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 @@ -378,7 +383,7 @@ el0_sync_compat: lsr x24, x25, #ESR_EL1_EC_SHIFT // exception class cmp x24, #ESR_EL1_EC_SVC32 // SVC in 32-bit state b.eq el0_svc_compat - adr lr, ret_from_exception + adr lr, ret_to_user cmp x24, #ESR_EL1_EC_DABT_EL0 // data abort in EL0 b.eq el0_da cmp x24, #ESR_EL1_EC_IABT_EL0 // instruction abort in EL0 @@ -423,11 +428,8 @@ el0_da: */ mrs x0, far_el1 bic x0, x0, #(0xff << 56) - disable_step x1 - isb - enable_dbg // enable interrupts before calling the main handler - enable_irq + enable_dbg_and_irq mov x1, x25 mov x2, sp b do_mem_abort @@ -436,11 +438,8 @@ el0_ia: * Instruction abort handling */ mrs x0, far_el1 - disable_step x1 - isb - enable_dbg // enable interrupts before calling the main handler - enable_irq + enable_dbg_and_irq orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts mov x2, sp b do_mem_abort @@ -448,6 +447,7 @@ el0_fpsimd_acc: /* * Floating Point or Advanced SIMD access */ + enable_dbg mov x0, x25 mov x1, sp b do_fpsimd_acc @@ -455,6 +455,7 @@ el0_fpsimd_exc: /* * Floating Point or Advanced SIMD exception */ + enable_dbg mov x0, x25 mov x1, sp b do_fpsimd_exc @@ -463,11 +464,8 @@ el0_sp_pc: * Stack or PC alignment exception handling */ mrs x0, far_el1 - disable_step x1 - isb - enable_dbg // enable interrupts before calling the main handler - enable_irq + enable_dbg_and_irq mov x1, x25 mov x2, sp b do_sp_pc_abort @@ -475,9 +473,9 @@ el0_undef: /* * Undefined instruction */ - mov x0, sp // enable interrupts before calling the main handler - enable_irq + enable_dbg_and_irq + mov x0, sp b do_undefinstr el0_dbg: /* @@ -485,11 +483,13 @@ el0_dbg: */ tbnz x24, #0, el0_inv // EL0 only mrs x0, far_el1 - disable_step x1 mov x1, x25 mov x2, sp - b do_debug_exception + bl do_debug_exception + enable_dbg + b ret_to_user el0_inv: + enable_dbg mov x0, sp mov x1, #BAD_SYNC mrs x2, esr_el1 @@ -500,15 +500,12 @@ ENDPROC(el0_sync) el0_irq: kernel_entry 0 el0_irq_naked: - disable_step x1 - isb enable_dbg #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_off #endif irq_handler - get_thread_info tsk #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_on @@ -517,14 +514,6 @@ el0_irq_naked: ENDPROC(el0_irq) /* - * This is the return code to user mode for abort handlers - */ -ret_from_exception: - get_thread_info tsk - b ret_to_user -ENDPROC(ret_from_exception) - -/* * Register switch for AArch64. The callee-saved registers need to be saved * and restored. On entry: * x0 = previous task_struct (must be preserved across the switch) @@ -563,10 +552,7 @@ ret_fast_syscall: ldr x1, [tsk, #TI_FLAGS] and x2, x1, #_TIF_WORK_MASK cbnz x2, fast_work_pending - tbz x1, #TIF_SINGLESTEP, fast_exit - disable_dbg - enable_step x2 -fast_exit: + enable_step_tsk x1, x2 kernel_exit 0, ret = 1 /* @@ -576,7 +562,7 @@ fast_work_pending: str x0, [sp, #S_X0] // returned x0 work_pending: tbnz x1, #TIF_NEED_RESCHED, work_resched - /* TIF_SIGPENDING or TIF_NOTIFY_RESUME case */ + /* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */ ldr x2, [sp, #S_PSTATE] mov x0, sp // 'regs' tst x2, #PSR_MODE_MASK // user mode regs? @@ -585,7 +571,6 @@ work_pending: bl do_notify_resume b ret_to_user work_resched: - enable_dbg bl schedule /* @@ -596,9 +581,7 @@ ret_to_user: ldr x1, [tsk, #TI_FLAGS] and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending - tbz x1, #TIF_SINGLESTEP, no_work_pending - disable_dbg - enable_step x2 + enable_step_tsk x1, x2 no_work_pending: kernel_exit 0, ret = 0 ENDPROC(ret_to_user) @@ -625,14 +608,11 @@ el0_svc: mov sc_nr, #__NR_syscalls el0_svc_naked: // compat entry point stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number - disable_step x16 - isb - enable_dbg - enable_irq + enable_dbg_and_irq - get_thread_info tsk - ldr x16, [tsk, #TI_FLAGS] // check for syscall tracing - tbnz x16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls? + ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks + tst x16, #_TIF_SYSCALL_WORK + b.ne __sys_trace adr lr, ret_fast_syscall // return address cmp scno, sc_nr // check upper syscall limit b.hs ni_sys @@ -648,9 +628,8 @@ ENDPROC(el0_svc) * switches, and waiting for our parent to respond. */ __sys_trace: - mov x1, sp - mov w0, #0 // trace entry - bl syscall_trace + mov x0, sp + bl syscall_trace_enter adr lr, __sys_trace_return // return address uxtw scno, w0 // syscall number (possibly new) mov x1, sp // pointer to regs @@ -665,9 +644,8 @@ __sys_trace: __sys_trace_return: str x0, [sp] // save returned x0 - mov x1, sp - mov w0, #1 // trace exit - bl syscall_trace + mov x0, sp + bl syscall_trace_exit b ret_to_user /* diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4aef42a04bdc..ad8aebb1cdef 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -35,6 +35,60 @@ #define FPEXC_IDF (1 << 7) /* + * In order to reduce the number of times the FPSIMD state is needlessly saved + * and restored, we need to keep track of two things: + * (a) for each task, we need to remember which CPU was the last one to have + * the task's FPSIMD state loaded into its FPSIMD registers; + * (b) for each CPU, we need to remember which task's userland FPSIMD state has + * been loaded into its FPSIMD registers most recently, or whether it has + * been used to perform kernel mode NEON in the meantime. + * + * For (a), we add a 'cpu' field to struct fpsimd_state, which gets updated to + * the id of the current CPU everytime the state is loaded onto a CPU. For (b), + * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the + * address of the userland FPSIMD state of the task that was loaded onto the CPU + * the most recently, or NULL if kernel mode NEON has been performed after that. + * + * With this in place, we no longer have to restore the next FPSIMD state right + * when switching between tasks. Instead, we can defer this check to userland + * resume, at which time we verify whether the CPU's fpsimd_last_state and the + * task's fpsimd_state.cpu are still mutually in sync. If this is the case, we + * can omit the FPSIMD restore. + * + * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to + * indicate whether or not the userland FPSIMD state of the current task is + * present in the registers. The flag is set unless the FPSIMD registers of this + * CPU currently contain the most recent userland FPSIMD state of the current + * task. + * + * For a certain task, the sequence may look something like this: + * - the task gets scheduled in; if both the task's fpsimd_state.cpu field + * contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu + * variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is + * cleared, otherwise it is set; + * + * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's + * userland FPSIMD state is copied from memory to the registers, the task's + * fpsimd_state.cpu field is set to the id of the current CPU, the current + * CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the + * TIF_FOREIGN_FPSTATE flag is cleared; + * + * - the task executes an ordinary syscall; upon return to userland, the + * TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is + * restored; + * + * - the task executes a syscall which executes some NEON instructions; this is + * preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD + * register contents to memory, clears the fpsimd_last_state per-cpu variable + * and sets the TIF_FOREIGN_FPSTATE flag; + * + * - the task gets preempted after kernel_neon_end() is called; as we have not + * returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so + * whatever is in the FPSIMD registers is not saved to memory, but discarded. + */ +static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_last_state); + +/* * Trapped FP/ASIMD access. */ void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) @@ -72,43 +126,137 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) void fpsimd_thread_switch(struct task_struct *next) { - /* check if not kernel threads */ - if (current->mm) + /* + * Save the current FPSIMD state to memory, but only if whatever is in + * the registers is in fact the most recent userland FPSIMD state of + * 'current'. + */ + if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) fpsimd_save_state(¤t->thread.fpsimd_state); - if (next->mm) - fpsimd_load_state(&next->thread.fpsimd_state); + + if (next->mm) { + /* + * If we are switching to a task whose most recent userland + * FPSIMD state is already in the registers of *this* cpu, + * we can skip loading the state from memory. Otherwise, set + * the TIF_FOREIGN_FPSTATE flag so the state will be loaded + * upon the next return to userland. + */ + struct fpsimd_state *st = &next->thread.fpsimd_state; + + if (__this_cpu_read(fpsimd_last_state) == st + && st->cpu == smp_processor_id()) + clear_ti_thread_flag(task_thread_info(next), + TIF_FOREIGN_FPSTATE); + else + set_ti_thread_flag(task_thread_info(next), + TIF_FOREIGN_FPSTATE); + } } void fpsimd_flush_thread(void) { - preempt_disable(); memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); - fpsimd_load_state(¤t->thread.fpsimd_state); + set_thread_flag(TIF_FOREIGN_FPSTATE); +} + +/* + * Save the userland FPSIMD state of 'current' to memory, but only if the state + * currently held in the registers does in fact belong to 'current' + */ +void fpsimd_preserve_current_state(void) +{ + preempt_disable(); + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) + fpsimd_save_state(¤t->thread.fpsimd_state); + preempt_enable(); +} + +/* + * Load the userland FPSIMD state of 'current' from memory, but only if the + * FPSIMD state already held in the registers is /not/ the most recent FPSIMD + * state of 'current' + */ +void fpsimd_restore_current_state(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { + struct fpsimd_state *st = ¤t->thread.fpsimd_state; + + fpsimd_load_state(st); + this_cpu_write(fpsimd_last_state, st); + st->cpu = smp_processor_id(); + } + preempt_enable(); +} + +/* + * Load an updated userland FPSIMD state for 'current' from memory and set the + * flag that indicates that the FPSIMD register contents are the most recent + * FPSIMD state of 'current' + */ +void fpsimd_update_current_state(struct fpsimd_state *state) +{ + preempt_disable(); + fpsimd_load_state(state); + if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { + struct fpsimd_state *st = ¤t->thread.fpsimd_state; + + this_cpu_write(fpsimd_last_state, st); + st->cpu = smp_processor_id(); + } preempt_enable(); } +/* + * Invalidate live CPU copies of task t's FPSIMD state + */ +void fpsimd_flush_task_state(struct task_struct *t) +{ + t->thread.fpsimd_state.cpu = NR_CPUS; +} + #ifdef CONFIG_KERNEL_MODE_NEON +static DEFINE_PER_CPU(struct fpsimd_partial_state, hardirq_fpsimdstate); +static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); + /* * Kernel-side NEON support functions */ -void kernel_neon_begin(void) +void kernel_neon_begin_partial(u32 num_regs) { - /* Avoid using the NEON in interrupt context */ - BUG_ON(in_interrupt()); - preempt_disable(); + if (in_interrupt()) { + struct fpsimd_partial_state *s = this_cpu_ptr( + in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); - if (current->mm) - fpsimd_save_state(¤t->thread.fpsimd_state); + BUG_ON(num_regs > 32); + fpsimd_save_partial_state(s, roundup(num_regs, 2)); + } else { + /* + * Save the userland FPSIMD state if we have one and if we + * haven't done so already. Clear fpsimd_last_state to indicate + * that there is no longer userland FPSIMD state in the + * registers. + */ + preempt_disable(); + if (current->mm && + !test_and_set_thread_flag(TIF_FOREIGN_FPSTATE)) + fpsimd_save_state(¤t->thread.fpsimd_state); + this_cpu_write(fpsimd_last_state, NULL); + } } -EXPORT_SYMBOL(kernel_neon_begin); +EXPORT_SYMBOL(kernel_neon_begin_partial); void kernel_neon_end(void) { - if (current->mm) - fpsimd_load_state(¤t->thread.fpsimd_state); - - preempt_enable(); + if (in_interrupt()) { + struct fpsimd_partial_state *s = this_cpu_ptr( + in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); + fpsimd_load_partial_state(s); + } else { + preempt_enable(); + } } EXPORT_SYMBOL(kernel_neon_end); @@ -120,12 +268,12 @@ static int fpsimd_cpu_pm_notifier(struct notifier_block *self, { switch (cmd) { case CPU_PM_ENTER: - if (current->mm) + if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) fpsimd_save_state(¤t->thread.fpsimd_state); break; case CPU_PM_EXIT: if (current->mm) - fpsimd_load_state(¤t->thread.fpsimd_state); + set_thread_flag(TIF_FOREIGN_FPSTATE); break; case CPU_PM_ENTER_FAILED: default: diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c new file mode 100644 index 000000000000..7924d73b6476 --- /dev/null +++ b/arch/arm64/kernel/ftrace.c @@ -0,0 +1,176 @@ +/* + * arch/arm64/kernel/ftrace.c + * + * Copyright (C) 2013 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/ftrace.h> +#include <linux/swab.h> +#include <linux/uaccess.h> + +#include <asm/cacheflush.h> +#include <asm/ftrace.h> +#include <asm/insn.h> + +#ifdef CONFIG_DYNAMIC_FTRACE +/* + * Replace a single instruction, which may be a branch or NOP. + * If @validate == true, a replaced instruction is checked against 'old'. + */ +static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, + bool validate) +{ + u32 replaced; + + /* + * Note: + * Due to modules and __init, code can disappear and change, + * we need to protect against faulting as well as code changing. + * We do this by aarch64_insn_*() which use the probe_kernel_*(). + * + * No lock is held here because all the modifications are run + * through stop_machine(). + */ + if (validate) { + if (aarch64_insn_read((void *)pc, &replaced)) + return -EFAULT; + + if (replaced != old) + return -EINVAL; + } + if (aarch64_insn_patch_text_nosync((void *)pc, new)) + return -EPERM; + + return 0; +} + +/* + * Replace tracer function in ftrace_caller() + */ +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long pc; + u32 new; + + pc = (unsigned long)&ftrace_call; + new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, true); + + return ftrace_modify_code(pc, 0, new, false); +} + +/* + * Turn on the call to ftrace_caller() in instrumented function + */ +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long pc = rec->ip; + u32 old, new; + + old = aarch64_insn_gen_nop(); + new = aarch64_insn_gen_branch_imm(pc, addr, true); + + return ftrace_modify_code(pc, old, new, true); +} + +/* + * Turn off the call to ftrace_caller() in instrumented function + */ +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, + unsigned long addr) +{ + unsigned long pc = rec->ip; + u32 old, new; + + old = aarch64_insn_gen_branch_imm(pc, addr, true); + new = aarch64_insn_gen_nop(); + + return ftrace_modify_code(pc, old, new, true); +} + +int __init ftrace_dyn_arch_init(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* + * function_graph tracer expects ftrace_return_to_handler() to be called + * on the way back to parent. For this purpose, this function is called + * in _mcount() or ftrace_caller() to replace return address (*parent) on + * the call stack to return_to_handler. + * + * Note that @frame_pointer is used only for sanity check later. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, + unsigned long frame_pointer) +{ + unsigned long return_hooker = (unsigned long)&return_to_handler; + unsigned long old; + struct ftrace_graph_ent trace; + int err; + + if (unlikely(atomic_read(¤t->tracing_graph_pause))) + return; + + /* + * Note: + * No protection against faulting at *parent, which may be seen + * on other archs. It's unlikely on AArch64. + */ + old = *parent; + *parent = return_hooker; + + trace.func = self_addr; + trace.depth = current->curr_ret_stack + 1; + + /* Only trace if the calling function expects to */ + if (!ftrace_graph_entry(&trace)) { + *parent = old; + return; + } + + err = ftrace_push_return_trace(old, self_addr, &trace.depth, + frame_pointer); + if (err == -EBUSY) { + *parent = old; + return; + } +} + +#ifdef CONFIG_DYNAMIC_FTRACE +/* + * Turn on/off the call to ftrace_graph_caller() in ftrace_caller() + * depending on @enable. + */ +static int ftrace_modify_graph_caller(bool enable) +{ + unsigned long pc = (unsigned long)&ftrace_graph_call; + u32 branch, nop; + + branch = aarch64_insn_gen_branch_imm(pc, + (unsigned long)ftrace_graph_caller, false); + nop = aarch64_insn_gen_nop(); + + if (enable) + return ftrace_modify_code(pc, nop, branch, true); + else + return ftrace_modify_code(pc, branch, nop, true); +} + +int ftrace_enable_ftrace_graph_caller(void) +{ + return ftrace_modify_graph_caller(true); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ + return ftrace_modify_graph_caller(false); +} +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 0fd565000772..a96d3a6a63f6 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -108,8 +108,18 @@ /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ +#ifdef CONFIG_EFI +efi_head: + /* + * This add instruction has no meaningful effect except that + * its opcode forms the magic "MZ" signature required by UEFI. + */ + add x13, x18, #0x16 + b stext +#else b stext // branch to kernel start, magic .long 0 // reserved +#endif .quad TEXT_OFFSET // Image load offset from start of RAM .quad 0 // reserved .quad 0 // reserved @@ -120,7 +130,109 @@ .byte 0x52 .byte 0x4d .byte 0x64 +#ifdef CONFIG_EFI + .long pe_header - efi_head // Offset to the PE header. +#else .word 0 // reserved +#endif + +#ifdef CONFIG_EFI + .align 3 +pe_header: + .ascii "PE" + .short 0 +coff_header: + .short 0xaa64 // AArch64 + .short 2 // nr_sections + .long 0 // TimeDateStamp + .long 0 // PointerToSymbolTable + .long 1 // NumberOfSymbols + .short section_table - optional_header // SizeOfOptionalHeader + .short 0x206 // Characteristics. + // IMAGE_FILE_DEBUG_STRIPPED | + // IMAGE_FILE_EXECUTABLE_IMAGE | + // IMAGE_FILE_LINE_NUMS_STRIPPED +optional_header: + .short 0x20b // PE32+ format + .byte 0x02 // MajorLinkerVersion + .byte 0x14 // MinorLinkerVersion + .long _edata - stext // SizeOfCode + .long 0 // SizeOfInitializedData + .long 0 // SizeOfUninitializedData + .long efi_stub_entry - efi_head // AddressOfEntryPoint + .long stext - efi_head // BaseOfCode + +extra_header_fields: + .quad 0 // ImageBase + .long 0x20 // SectionAlignment + .long 0x8 // FileAlignment + .short 0 // MajorOperatingSystemVersion + .short 0 // MinorOperatingSystemVersion + .short 0 // MajorImageVersion + .short 0 // MinorImageVersion + .short 0 // MajorSubsystemVersion + .short 0 // MinorSubsystemVersion + .long 0 // Win32VersionValue + + .long _edata - efi_head // SizeOfImage + + // Everything before the kernel image is considered part of the header + .long stext - efi_head // SizeOfHeaders + .long 0 // CheckSum + .short 0xa // Subsystem (EFI application) + .short 0 // DllCharacteristics + .quad 0 // SizeOfStackReserve + .quad 0 // SizeOfStackCommit + .quad 0 // SizeOfHeapReserve + .quad 0 // SizeOfHeapCommit + .long 0 // LoaderFlags + .long 0x6 // NumberOfRvaAndSizes + + .quad 0 // ExportTable + .quad 0 // ImportTable + .quad 0 // ResourceTable + .quad 0 // ExceptionTable + .quad 0 // CertificationTable + .quad 0 // BaseRelocationTable + + // Section table +section_table: + + /* + * The EFI application loader requires a relocation section + * because EFI applications must be relocatable. This is a + * dummy section as far as we are concerned. + */ + .ascii ".reloc" + .byte 0 + .byte 0 // end of 0 padding of section name + .long 0 + .long 0 + .long 0 // SizeOfRawData + .long 0 // PointerToRawData + .long 0 // PointerToRelocations + .long 0 // PointerToLineNumbers + .short 0 // NumberOfRelocations + .short 0 // NumberOfLineNumbers + .long 0x42100040 // Characteristics (section flags) + + + .ascii ".text" + .byte 0 + .byte 0 + .byte 0 // end of 0 padding of section name + .long _edata - stext // VirtualSize + .long stext - efi_head // VirtualAddress + .long _edata - stext // SizeOfRawData + .long stext - efi_head // PointerToRawData + + .long 0 // PointerToRelocations (0 for executables) + .long 0 // PointerToLineNumbers (0 for executables) + .short 0 // NumberOfRelocations (0 for executables) + .short 0 // NumberOfLineNumbers (0 for executables) + .long 0xe0500020 // Characteristics (section flags) + .align 5 +#endif ENTRY(stext) mov x21, x0 // x21=FDT @@ -230,11 +342,9 @@ ENTRY(set_cpu_boot_mode_flag) cmp w20, #BOOT_CPU_MODE_EL2 b.ne 1f add x1, x1, #4 -1: dc cvac, x1 // Clean potentially dirty cache line - dsb sy - str w20, [x1] // This CPU has booted in EL1 - dc civac, x1 // Clean&invalidate potentially stale cache line - dsb sy +1: str w20, [x1] // This CPU has booted in EL1 + dmb sy + dc ivac, x1 // Invalidate potentially stale cache line ret ENDPROC(set_cpu_boot_mode_flag) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index bee789757806..df1cf15377b4 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -20,6 +20,7 @@ #define pr_fmt(fmt) "hw-breakpoint: " fmt +#include <linux/compat.h> #include <linux/cpu_pm.h> #include <linux/errno.h> #include <linux/hw_breakpoint.h> @@ -27,7 +28,6 @@ #include <linux/ptrace.h> #include <linux/smp.h> -#include <asm/compat.h> #include <asm/current.h> #include <asm/debug-monitors.h> #include <asm/hw_breakpoint.h> diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6391485f342d..43b7c34f92cb 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -20,6 +20,7 @@ #include <stdarg.h> +#include <linux/compat.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/kernel.h> @@ -113,32 +114,62 @@ void arch_cpu_idle_dead(void) } #endif +/* + * Called by kexec, immediately prior to machine_kexec(). + * + * This must completely disable all secondary CPUs; simply causing those CPUs + * to execute e.g. a RAM-based pin loop is not sufficient. This allows the + * kexec'd kernel to use any and all RAM as it sees fit, without having to + * avoid any code or data used by any SW CPU pin loop. The CPU hotplug + * functionality embodied in disable_nonboot_cpus() to achieve this. + */ void machine_shutdown(void) { -#ifdef CONFIG_SMP - smp_send_stop(); -#endif + disable_nonboot_cpus(); } +/* + * Halting simply requires that the secondary CPUs stop performing any + * activity (executing tasks, handling interrupts). smp_send_stop() + * achieves this. + */ void machine_halt(void) { - machine_shutdown(); + local_irq_disable(); + smp_send_stop(); while (1); } +/* + * Power-off simply requires that the secondary CPUs stop performing any + * activity (executing tasks, handling interrupts). smp_send_stop() + * achieves this. When the system power is turned off, it will take all CPUs + * with it. + */ void machine_power_off(void) { - machine_shutdown(); + local_irq_disable(); + smp_send_stop(); if (pm_power_off) pm_power_off(); } +/* + * Restart requires that the secondary CPUs stop performing any activity + * while the primary CPU resets the system. Systems with a single CPU can + * use soft_restart() as their machine descriptor's .restart hook, since that + * will cause the only available CPU to reset. Systems with multiple CPUs must + * provide a HW restart implementation, to ensure that all CPUs reset at once. + * This is required so that any code running after reset on the primary CPU + * doesn't have to co-ordinate with other CPUs to ensure they aren't still + * executing pre-reset code, and using RAM that the primary CPU's code wishes + * to use. Implementing such co-ordination would be essentially impossible. + */ void machine_restart(char *cmd) { - machine_shutdown(); - /* Disable interrupts first */ local_irq_disable(); + smp_send_stop(); /* Now call the architecture specific reboot code. */ if (arm_pm_restart) @@ -205,7 +236,7 @@ void release_thread(struct task_struct *dead_task) int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - fpsimd_save_state(¤t->thread.fpsimd_state); + fpsimd_preserve_current_state(); *dst = *src; return 0; } @@ -300,7 +331,7 @@ struct task_struct *__switch_to(struct task_struct *prev, * Complete any pending TLB or cache maintenance on this CPU in case * the thread migrates to a different CPU. */ - dsb(); + dsb(ish); /* the actual thread switch */ last = cpu_switch_to(prev, next); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 6a8928bba03c..3e926b9c0641 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -19,6 +19,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/compat.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/mm.h> @@ -41,6 +42,9 @@ #include <asm/traps.h> #include <asm/system_misc.h> +#define CREATE_TRACE_POINTS +#include <trace/events/syscalls.h> + /* * TODO: does not yet catch signals sent when the child dies. * in exit.c or in signal.c. @@ -517,6 +521,7 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset, return ret; target->thread.fpsimd_state.user_fpsimd = newstate; + fpsimd_flush_task_state(target); return ret; } @@ -764,6 +769,7 @@ static int compat_vfp_set(struct task_struct *target, uregs->fpcr = fpscr & VFP_FPSCR_CTRL_MASK; } + fpsimd_flush_task_state(target); return ret; } @@ -1058,35 +1064,49 @@ long arch_ptrace(struct task_struct *child, long request, return ptrace_request(child, request, addr, data); } -asmlinkage int syscall_trace(int dir, struct pt_regs *regs) +enum ptrace_syscall_dir { + PTRACE_SYSCALL_ENTER = 0, + PTRACE_SYSCALL_EXIT, +}; + +static void tracehook_report_syscall(struct pt_regs *regs, + enum ptrace_syscall_dir dir) { + int regno; unsigned long saved_reg; - if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return regs->syscallno; - - if (is_compat_task()) { - /* AArch32 uses ip (r12) for scratch */ - saved_reg = regs->regs[12]; - regs->regs[12] = dir; - } else { - /* - * Save X7. X7 is used to denote syscall entry/exit: - * X7 = 0 -> entry, = 1 -> exit - */ - saved_reg = regs->regs[7]; - regs->regs[7] = dir; - } + /* + * A scratch register (ip(r12) on AArch32, x7 on AArch64) is + * used to denote syscall entry/exit: + */ + regno = (is_compat_task() ? 12 : 7); + saved_reg = regs->regs[regno]; + regs->regs[regno] = dir; - if (dir) + if (dir == PTRACE_SYSCALL_EXIT) tracehook_report_syscall_exit(regs, 0); else if (tracehook_report_syscall_entry(regs)) regs->syscallno = ~0UL; - if (is_compat_task()) - regs->regs[12] = saved_reg; - else - regs->regs[7] = saved_reg; + regs->regs[regno] = saved_reg; +} + +asmlinkage int syscall_trace_enter(struct pt_regs *regs) +{ + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER); + + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_enter(regs, regs->syscallno); return regs->syscallno; } + +asmlinkage void syscall_trace_exit(struct pt_regs *regs) +{ + if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) + trace_sys_exit(regs, regs_return_value(regs)); + + if (test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT); +} diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c new file mode 100644 index 000000000000..89102a6ffad5 --- /dev/null +++ b/arch/arm64/kernel/return_address.c @@ -0,0 +1,55 @@ +/* + * arch/arm64/kernel/return_address.c + * + * Copyright (C) 2013 Linaro Limited + * Author: AKASHI Takahiro <takahiro.akashi@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/export.h> +#include <linux/ftrace.h> + +#include <asm/stacktrace.h> + +struct return_address_data { + unsigned int level; + void *addr; +}; + +static int save_return_addr(struct stackframe *frame, void *d) +{ + struct return_address_data *data = d; + + if (!data->level) { + data->addr = (void *)frame->pc; + return 1; + } else { + --data->level; + return 0; + } +} + +void *return_address(unsigned int level) +{ + struct return_address_data data; + struct stackframe frame; + register unsigned long current_sp asm ("sp"); + + data.level = level + 2; + data.addr = NULL; + + frame.fp = (unsigned long)__builtin_frame_address(0); + frame.sp = current_sp; + frame.pc = (unsigned long)return_address; /* dummy */ + + walk_stackframe(&frame, save_return_addr, &data); + + if (!data.level) + return data.addr; + else + return NULL; +} +EXPORT_SYMBOL_GPL(return_address); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 7ec784653b29..46d1125571f6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -25,6 +25,7 @@ #include <linux/utsname.h> #include <linux/initrd.h> #include <linux/console.h> +#include <linux/cache.h> #include <linux/bootmem.h> #include <linux/seq_file.h> #include <linux/screen_info.h> @@ -41,6 +42,7 @@ #include <linux/memblock.h> #include <linux/of_fdt.h> #include <linux/of_platform.h> +#include <linux/efi.h> #include <asm/fixmap.h> #include <asm/cputype.h> @@ -55,6 +57,7 @@ #include <asm/traps.h> #include <asm/memblock.h> #include <asm/psci.h> +#include <asm/efi.h> unsigned int processor_id; EXPORT_SYMBOL(processor_id); @@ -198,6 +201,8 @@ static void __init setup_processor(void) { struct cpu_info *cpu_info; u64 features, block; + u32 cwg; + int cls; cpu_info = lookup_processor_type(read_cpuid_id()); if (!cpu_info) { @@ -215,6 +220,18 @@ static void __init setup_processor(void) elf_hwcap = 0; /* + * Check for sane CTR_EL0.CWG value. + */ + cwg = cache_type_cwg(); + cls = cache_line_size(); + if (!cwg) + pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n", + cls); + if (L1_CACHE_BYTES < cls) + pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n", + L1_CACHE_BYTES, cls); + + /* * ID_AA64ISAR0_EL1 contains 4-bit wide signed feature blocks. * The blocks we test below represent incremental functionality * for non-negative values. Negative values are reserved. @@ -361,16 +378,18 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; - init_mem_pgprot(); early_ioremap_init(); parse_early_param(); + efi_init(); arm64_memblock_init(); paging_init(); request_standard_resources(); + efi_idmap_init(); + unflatten_device_tree(); psci_init(); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 890a591f75dd..6357b9c6c90e 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -17,6 +17,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/compat.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/personality.h> @@ -25,7 +26,6 @@ #include <linux/tracehook.h> #include <linux/ratelimit.h> -#include <asm/compat.h> #include <asm/debug-monitors.h> #include <asm/elf.h> #include <asm/cacheflush.h> @@ -51,7 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) int err; /* dump the hardware registers to the fpsimd_state structure */ - fpsimd_save_state(fpsimd); + fpsimd_preserve_current_state(); /* copy the FP and status/control registers */ err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs)); @@ -86,11 +86,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx) __get_user_error(fpsimd.fpcr, &ctx->fpcr, err); /* load the hardware registers from the fpsimd_state structure */ - if (!err) { - preempt_disable(); - fpsimd_load_state(&fpsimd); - preempt_enable(); - } + if (!err) + fpsimd_update_current_state(&fpsimd); return err ? -EFAULT : 0; } @@ -100,8 +97,7 @@ static int restore_sigframe(struct pt_regs *regs, { sigset_t set; int i, err; - struct aux_context __user *aux = - (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; + void *aux = sf->uc.uc_mcontext.__reserved; err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set)); if (err == 0) @@ -121,8 +117,11 @@ static int restore_sigframe(struct pt_regs *regs, err |= !valid_user_regs(®s->user_regs); - if (err == 0) - err |= restore_fpsimd_context(&aux->fpsimd); + if (err == 0) { + struct fpsimd_context *fpsimd_ctx = + container_of(aux, struct fpsimd_context, head); + err |= restore_fpsimd_context(fpsimd_ctx); + } return err; } @@ -167,8 +166,8 @@ static int setup_sigframe(struct rt_sigframe __user *sf, struct pt_regs *regs, sigset_t *set) { int i, err = 0; - struct aux_context __user *aux = - (struct aux_context __user *)sf->uc.uc_mcontext.__reserved; + void *aux = sf->uc.uc_mcontext.__reserved; + struct _aarch64_ctx *end; /* set up the stack frame for unwinding */ __put_user_error(regs->regs[29], &sf->fp, err); @@ -185,12 +184,27 @@ static int setup_sigframe(struct rt_sigframe __user *sf, err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set)); - if (err == 0) - err |= preserve_fpsimd_context(&aux->fpsimd); + if (err == 0) { + struct fpsimd_context *fpsimd_ctx = + container_of(aux, struct fpsimd_context, head); + err |= preserve_fpsimd_context(fpsimd_ctx); + aux += sizeof(*fpsimd_ctx); + } + + /* fault information, if valid */ + if (current->thread.fault_code) { + struct esr_context *esr_ctx = + container_of(aux, struct esr_context, head); + __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err); + __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err); + __put_user_error(current->thread.fault_code, &esr_ctx->esr, err); + aux += sizeof(*esr_ctx); + } /* set the "end" magic */ - __put_user_error(0, &aux->end.magic, err); - __put_user_error(0, &aux->end.size, err); + end = aux; + __put_user_error(0, &end->magic, err); + __put_user_error(0, &end->size, err); return err; } @@ -416,4 +430,8 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } + + if (thread_flags & _TIF_FOREIGN_FPSTATE) + fpsimd_restore_current_state(); + } diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index b3fc9f5ec6d3..3491c638f172 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -23,6 +23,7 @@ #include <linux/syscalls.h> #include <linux/ratelimit.h> +#include <asm/esr.h> #include <asm/fpsimd.h> #include <asm/signal32.h> #include <asm/uaccess.h> @@ -81,6 +82,8 @@ struct compat_vfp_sigframe { #define VFP_MAGIC 0x56465001 #define VFP_STORAGE_SIZE sizeof(struct compat_vfp_sigframe) +#define FSR_WRITE_SHIFT (11) + struct compat_aux_sigframe { struct compat_vfp_sigframe vfp; @@ -219,7 +222,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) * Note that this also saves V16-31, which aren't visible * in AArch32. */ - fpsimd_save_state(fpsimd); + fpsimd_preserve_current_state(); /* Place structure header on the stack */ __put_user_error(magic, &frame->magic, err); @@ -282,11 +285,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) * We don't need to touch the exception register, so * reload the hardware state. */ - if (!err) { - preempt_disable(); - fpsimd_load_state(&fpsimd); - preempt_enable(); - } + if (!err) + fpsimd_update_current_state(&fpsimd); return err ? -EFAULT : 0; } @@ -500,7 +500,9 @@ static int compat_setup_sigframe(struct compat_sigframe __user *sf, __put_user_error(regs->pstate, &sf->uc.uc_mcontext.arm_cpsr, err); __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.trap_no, err); - __put_user_error((compat_ulong_t)0, &sf->uc.uc_mcontext.error_code, err); + /* set the compat FSR WnR */ + __put_user_error(!!(current->thread.fault_code & ESR_EL1_WRITE) << + FSR_WRITE_SHIFT, &sf->uc.uc_mcontext.error_code, err); __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err); __put_user_error(set->sig[0], &sf->uc.uc_mcontext.oldmask, err); diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index c3cb160edc69..40f38f46c8e0 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -35,6 +35,7 @@ #include <linux/clockchips.h> #include <linux/completion.h> #include <linux/of.h> +#include <linux/irq_work.h> #include <asm/atomic.h> #include <asm/cacheflush.h> @@ -62,6 +63,7 @@ enum ipi_msg_type { IPI_CALL_FUNC_SINGLE, IPI_CPU_STOP, IPI_TIMER, + IPI_IRQ_WORK, }; /* @@ -477,6 +479,14 @@ void arch_send_call_function_single_ipi(int cpu) smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE); } +#ifdef CONFIG_IRQ_WORK +void arch_irq_work_raise(void) +{ + if (smp_cross_call) + smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK); +} +#endif + static const char *ipi_types[NR_IPI] = { #define S(x,s) [x - IPI_RESCHEDULE] = s S(IPI_RESCHEDULE, "Rescheduling interrupts"), @@ -484,6 +494,7 @@ static const char *ipi_types[NR_IPI] = { S(IPI_CALL_FUNC_SINGLE, "Single function call interrupts"), S(IPI_CPU_STOP, "CPU stop interrupts"), S(IPI_TIMER, "Timer broadcast interrupts"), + S(IPI_IRQ_WORK, "IRQ work interrupts"), }; void show_ipi_list(struct seq_file *p, int prec) @@ -576,6 +587,14 @@ void handle_IPI(int ipinr, struct pt_regs *regs) break; #endif +#ifdef CONFIG_IRQ_WORK + case IPI_IRQ_WORK: + irq_enter(); + irq_work_run(); + irq_exit(); + break; +#endif + default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); break; diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index 7a530d2cc807..0347d38eea29 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -30,7 +30,6 @@ extern void secondary_holding_pen(void); volatile unsigned long secondary_holding_pen_release = INVALID_HWID; static phys_addr_t cpu_release_addr[NR_CPUS]; -static DEFINE_RAW_SPINLOCK(boot_lock); /* * Write secondary_holding_pen_release in a way that is guaranteed to be @@ -94,14 +93,6 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) static int smp_spin_table_cpu_boot(unsigned int cpu) { - unsigned long timeout; - - /* - * Set synchronisation state between this boot processor - * and the secondary one - */ - raw_spin_lock(&boot_lock); - /* * Update the pen release flag. */ @@ -112,34 +103,7 @@ static int smp_spin_table_cpu_boot(unsigned int cpu) */ sev(); - timeout = jiffies + (1 * HZ); - while (time_before(jiffies, timeout)) { - if (secondary_holding_pen_release == INVALID_HWID) - break; - udelay(10); - } - - /* - * Now the secondary core is starting up let it run its - * calibrations, then wait for it to finish - */ - raw_spin_unlock(&boot_lock); - - return secondary_holding_pen_release != INVALID_HWID ? -ENOSYS : 0; -} - -static void smp_spin_table_cpu_postboot(void) -{ - /* - * Let the primary processor know we're out of the pen. - */ - write_pen_release(INVALID_HWID); - - /* - * Synchronise with the boot thread. - */ - raw_spin_lock(&boot_lock); - raw_spin_unlock(&boot_lock); + return 0; } const struct cpu_operations smp_spin_table_ops = { @@ -147,5 +111,4 @@ const struct cpu_operations smp_spin_table_ops = { .cpu_init = smp_spin_table_cpu_init, .cpu_prepare = smp_spin_table_cpu_prepare, .cpu_boot = smp_spin_table_cpu_boot, - .cpu_postboot = smp_spin_table_cpu_postboot, }; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 38f0558f0c0a..55437ba1f5a4 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -35,7 +35,7 @@ * ldp x29, x30, [sp] * add sp, sp, #0x10 */ -int unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 6815987b50f8..1a7125c3099b 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -18,6 +18,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/clockchips.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/interrupt.h> @@ -69,6 +70,8 @@ void __init time_init(void) of_clk_init(NULL); clocksource_of_init(); + tick_setup_hrtimer_broadcast(); + arch_timer_rate = arch_timer_get_rate(); if (!arch_timer_rate) panic("Unable to initialise architected timer.\n"); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3e06b0be4ec8..43514f905916 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -17,10 +17,192 @@ #include <linux/percpu.h> #include <linux/node.h> #include <linux/nodemask.h> +#include <linux/of.h> #include <linux/sched.h> #include <asm/topology.h> +static int __init get_cpu_for_node(struct device_node *node) +{ + struct device_node *cpu_node; + int cpu; + + cpu_node = of_parse_phandle(node, "cpu", 0); + if (!cpu_node) + return -1; + + for_each_possible_cpu(cpu) { + if (of_get_cpu_node(cpu, NULL) == cpu_node) { + of_node_put(cpu_node); + return cpu; + } + } + + pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name); + + of_node_put(cpu_node); + return -1; +} + +static int __init parse_core(struct device_node *core, int cluster_id, + int core_id) +{ + char name[10]; + bool leaf = true; + int i = 0; + int cpu; + struct device_node *t; + + do { + snprintf(name, sizeof(name), "thread%d", i); + t = of_get_child_by_name(core, name); + if (t) { + leaf = false; + cpu = get_cpu_for_node(t); + if (cpu >= 0) { + cpu_topology[cpu].cluster_id = cluster_id; + cpu_topology[cpu].core_id = core_id; + cpu_topology[cpu].thread_id = i; + } else { + pr_err("%s: Can't get CPU for thread\n", + t->full_name); + of_node_put(t); + return -EINVAL; + } + of_node_put(t); + } + i++; + } while (t); + + cpu = get_cpu_for_node(core); + if (cpu >= 0) { + if (!leaf) { + pr_err("%s: Core has both threads and CPU\n", + core->full_name); + return -EINVAL; + } + + cpu_topology[cpu].cluster_id = cluster_id; + cpu_topology[cpu].core_id = core_id; + } else if (leaf) { + pr_err("%s: Can't get CPU for leaf core\n", core->full_name); + return -EINVAL; + } + + return 0; +} + +static int __init parse_cluster(struct device_node *cluster, int depth) +{ + char name[10]; + bool leaf = true; + bool has_cores = false; + struct device_node *c; + static int cluster_id __initdata; + int core_id = 0; + int i, ret; + + /* + * First check for child clusters; we currently ignore any + * information about the nesting of clusters and present the + * scheduler with a flat list of them. + */ + i = 0; + do { + snprintf(name, sizeof(name), "cluster%d", i); + c = of_get_child_by_name(cluster, name); + if (c) { + leaf = false; + ret = parse_cluster(c, depth + 1); + of_node_put(c); + if (ret != 0) + return ret; + } + i++; + } while (c); + + /* Now check for cores */ + i = 0; + do { + snprintf(name, sizeof(name), "core%d", i); + c = of_get_child_by_name(cluster, name); + if (c) { + has_cores = true; + + if (depth == 0) { + pr_err("%s: cpu-map children should be clusters\n", + c->full_name); + of_node_put(c); + return -EINVAL; + } + + if (leaf) { + ret = parse_core(c, cluster_id, core_id++); + } else { + pr_err("%s: Non-leaf cluster with core %s\n", + cluster->full_name, name); + ret = -EINVAL; + } + + of_node_put(c); + if (ret != 0) + return ret; + } + i++; + } while (c); + + if (leaf && !has_cores) + pr_warn("%s: empty cluster\n", cluster->full_name); + + if (leaf) + cluster_id++; + + return 0; +} + +static int __init parse_dt_topology(void) +{ + struct device_node *cn, *map; + int ret = 0; + int cpu; + + cn = of_find_node_by_path("/cpus"); + if (!cn) { + pr_err("No CPU information found in DT\n"); + return 0; + } + + /* + * When topology is provided cpu-map is essentially a root + * cluster with restricted subnodes. + */ + map = of_get_child_by_name(cn, "cpu-map"); + if (!map) + goto out; + + ret = parse_cluster(map, 0); + if (ret != 0) + goto out_map; + + /* + * Check that all cores are in the topology; the SMP code will + * only mark cores described in the DT as possible. + */ + for_each_possible_cpu(cpu) { + if (cpu_topology[cpu].cluster_id == -1) { + pr_err("CPU%d: No topology information specified\n", + cpu); + ret = -EINVAL; + } + } + +out_map: + of_node_put(map); +out: + of_node_put(cn); + return ret; +} + /* * cpu topology table */ @@ -39,13 +221,9 @@ static void update_siblings_masks(unsigned int cpuid) if (cpuid_topo->cluster_id == -1) { /* - * DT does not contain topology information for this cpu - * reset it to default behaviour + * DT does not contain topology information for this cpu. */ pr_debug("CPU%u: No topology information configured\n", cpuid); - cpuid_topo->core_id = 0; - cpumask_set_cpu(cpuid, &cpuid_topo->core_sibling); - cpumask_set_cpu(cpuid, &cpuid_topo->thread_sibling); return; } @@ -74,22 +252,32 @@ void store_cpu_topology(unsigned int cpuid) update_siblings_masks(cpuid); } -/* - * init_cpu_topology is called at boot when only one cpu is running - * which prevent simultaneous write access to cpu_topology array - */ -void __init init_cpu_topology(void) +static void __init reset_cpu_topology(void) { unsigned int cpu; - /* init core mask and power*/ for_each_possible_cpu(cpu) { struct cpu_topology *cpu_topo = &cpu_topology[cpu]; cpu_topo->thread_id = -1; - cpu_topo->core_id = -1; + cpu_topo->core_id = 0; cpu_topo->cluster_id = -1; + cpumask_clear(&cpu_topo->core_sibling); + cpumask_set_cpu(cpu, &cpu_topo->core_sibling); cpumask_clear(&cpu_topo->thread_sibling); + cpumask_set_cpu(cpu, &cpu_topo->thread_sibling); } } + +void __init init_cpu_topology(void) +{ + reset_cpu_topology(); + + /* + * Discard anything that was parsed if we hit an error so we + * don't use partial information. + */ + if (parse_dt_topology()) + reset_cpu_topology(); +} diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 7ffadddb645d..c43cfa9b8304 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -251,10 +251,13 @@ void die(const char *str, struct pt_regs *regs, int err) void arm64_notify_die(const char *str, struct pt_regs *regs, struct siginfo *info, int err) { - if (user_mode(regs)) + if (user_mode(regs)) { + current->thread.fault_address = 0; + current->thread.fault_code = err; force_sig_info(info->si_signo, info, current); - else + } else { die(str, regs, err); + } } asmlinkage void __exception do_undefinstr(struct pt_regs *regs) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 4ba7a55b49c7..f1e6d5c032e1 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -13,7 +13,7 @@ #define ARM_EXIT_DISCARD(x) x OUTPUT_ARCH(aarch64) -ENTRY(stext) +ENTRY(_text) jiffies = jiffies_64; diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 2c56012cb2d2..b0d1512acf08 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -630,9 +630,15 @@ ENTRY(__kvm_tlb_flush_vmid_ipa) * whole of Stage-1. Weep... */ tlbi ipas2e1is, x1 - dsb sy + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb ish tlbi vmalle1is - dsb sy + dsb ish isb msr vttbr_el2, xzr @@ -643,7 +649,7 @@ ENTRY(__kvm_flush_vm_context) dsb ishst tlbi alle1is ic ialluis - dsb sy + dsb ish ret ENDPROC(__kvm_flush_vm_context) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 03244582bc55..c59a1bdab5eb 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -71,13 +71,13 @@ static u32 get_ccsidr(u32 csselr) static void do_dc_cisw(u32 val) { asm volatile("dc cisw, %x0" : : "r" (val)); - dsb(); + dsb(ish); } static void do_dc_csw(u32 val) { asm volatile("dc csw, %x0" : : "r" (val)); - dsb(); + dsb(ish); } /* See note at ARM ARM B1.14.4 */ diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 328ce1a99daa..d98d3e39879e 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,4 +1,5 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_in_user.o copy_page.o \ clear_page.o memchr.o memcpy.o memmove.o memset.o \ + memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \ strchr.o strrchr.o diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S new file mode 100644 index 000000000000..6ea0776ba6de --- /dev/null +++ b/arch/arm64/lib/memcmp.S @@ -0,0 +1,258 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* +* compare memory areas(when two memory areas' offset are different, +* alignment handled by the hardware) +* +* Parameters: +* x0 - const memory area 1 pointer +* x1 - const memory area 2 pointer +* x2 - the maximal compare byte length +* Returns: +* x0 - a compare result, maybe less than, equal to, or greater than ZERO +*/ + +/* Parameters and result. */ +src1 .req x0 +src2 .req x1 +limit .req x2 +result .req x0 + +/* Internal variables. */ +data1 .req x3 +data1w .req w3 +data2 .req x4 +data2w .req w4 +has_nul .req x5 +diff .req x6 +endloop .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +pos .req x11 +limit_wd .req x12 +mask .req x13 + +ENTRY(memcmp) + cbz limit, .Lret0 + eor tmp1, src1, src2 + tst tmp1, #7 + b.ne .Lmisaligned8 + ands tmp1, src1, #7 + b.ne .Lmutual_align + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + /* + * The input source addresses are at alignment boundary. + * Directly compare eight bytes each time. + */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + subs limit_wd, limit_wd, #1 + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, cs /* Last Dword or differences. */ + cbz endloop, .Lloop_aligned + + /* Not reached the limit, must have found a diff. */ + tbz limit_wd, #63, .Lnot_limit + + /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ + ands limit, limit, #7 + b.eq .Lnot_limit + /* + * The remained bytes less than 8. It is needed to extract valid data + * from last eight bytes of the intended memory range. + */ + lsl limit, limit, #3 /* bytes-> bits. */ + mov mask, #~0 +CPU_BE( lsr mask, mask, limit ) +CPU_LE( lsl mask, mask, limit ) + bic data1, data1, mask + bic data2, data2, mask + + orr diff, diff, mask + b .Lnot_limit + +.Lmutual_align: + /* + * Sources are mutually aligned, but are not currently at an + * alignment boundary. Round down the addresses and then mask off + * the bytes that precede the start point. + */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + ldr data2, [src2], #8 + /* + * We can not add limit with alignment offset(tmp1) here. Since the + * addition probably make the limit overflown. + */ + sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ + and tmp3, limit_wd, #7 + lsr limit_wd, limit_wd, #3 + add tmp3, tmp3, tmp1 + add limit_wd, limit_wd, tmp3, lsr #3 + add limit, limit, tmp1/* Adjust the limit for the extra. */ + + lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ + neg tmp1, tmp1/* Bits to alignment -64. */ + mov tmp2, #~0 + /*mask off the non-intended bytes before the start address.*/ +CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp1 ) + + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b .Lstart_realigned + + /*src1 and src2 have different alignment offset.*/ +.Lmisaligned8: + cmp limit, #8 + b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ + + and tmp1, src1, #7 + neg tmp1, tmp1 + add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ + and tmp2, src2, #7 + neg tmp2, tmp2 + add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ + subs tmp3, tmp1, tmp2 + csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ + + sub limit, limit, pos + /*compare the proceeding bytes in the first 8 byte segment.*/ +.Ltinycmp: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs pos, pos, #1 + ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ + b.eq .Ltinycmp + cbnz pos, 1f /*diff occurred before the last byte.*/ + cmp data1w, data2w + b.eq .Lstart_align +1: + sub result, data1, data2 + ret + +.Lstart_align: + lsr limit_wd, limit, #3 + cbz limit_wd, .Lremain8 + + ands xzr, src1, #7 + b.eq .Lrecal_offset + /*process more leading bytes to make src1 aligned...*/ + add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ + add src2, src2, tmp3 + sub limit, limit, tmp3 + lsr limit_wd, limit, #3 + cbz limit_wd, .Lremain8 + /*load 8 bytes from aligned SRC1..*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + subs limit_wd, limit_wd, #1 + eor diff, data1, data2 /*Non-zero if differences found.*/ + csinv endloop, diff, xzr, ne + cbnz endloop, .Lunequal_proc + /*How far is the current SRC2 from the alignment boundary...*/ + and tmp3, tmp3, #7 + +.Lrecal_offset:/*src1 is aligned now..*/ + neg pos, tmp3 +.Lloopcmp_proc: + /* + * Divide the eight bytes into two parts. First,backwards the src2 + * to an alignment boundary,load eight bytes and compare from + * the SRC2 alignment boundary. If all 8 bytes are equal,then start + * the second part's comparison. Otherwise finish the comparison. + * This special handle can garantee all the accesses are in the + * thread/task space in avoid to overrange access. + */ + ldr data1, [src1,pos] + ldr data2, [src2,pos] + eor diff, data1, data2 /* Non-zero if differences found. */ + cbnz diff, .Lnot_limit + + /*The second part process*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + eor diff, data1, data2 /* Non-zero if differences found. */ + subs limit_wd, limit_wd, #1 + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ + cbz endloop, .Lloopcmp_proc +.Lunequal_proc: + cbz diff, .Lremain8 + +/*There is differnence occured in the latest comparison.*/ +.Lnot_limit: +/* +* For little endian,reverse the low significant equal bits into MSB,then +* following CLZ can find how many equal bits exist. +*/ +CPU_LE( rev diff, diff ) +CPU_LE( rev data1, data1 ) +CPU_LE( rev data2, data2 ) + + /* + * The MS-non-zero bit of DIFF marks either the first bit + * that is different, or the end of the significant data. + * Shifting left now will bring the critical information into the + * top bits. + */ + clz pos, diff + lsl data1, data1, pos + lsl data2, data2, pos + /* + * We need to zero-extend (char is unsigned) the value and then + * perform a signed subtraction. + */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret + +.Lremain8: + /* Limit % 8 == 0 =>. all data are equal.*/ + ands limit, limit, #7 + b.eq .Lret0 + +.Ltiny8proc: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + + ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ + b.eq .Ltiny8proc + sub result, data1, data2 + ret +.Lret0: + mov result, #0 + ret +ENDPROC(memcmp) diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 27b5003609b6..8a9a96d3ddae 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -1,5 +1,13 @@ /* * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,6 +24,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/cache.h> /* * Copy a buffer from src to dest (alignment handled by the hardware) @@ -27,27 +36,166 @@ * Returns: * x0 - dest */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +tmp3 .req x5 +tmp3w .req w5 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + ENTRY(memcpy) - mov x4, x0 - subs x2, x2, #8 - b.mi 2f -1: ldr x3, [x1], #8 - subs x2, x2, #8 - str x3, [x4], #8 - b.pl 1b -2: adds x2, x2, #4 - b.mi 3f - ldr w3, [x1], #4 - sub x2, x2, #4 - str w3, [x4], #4 -3: adds x2, x2, #2 - b.mi 4f - ldrh w3, [x1], #2 - sub x2, x2, #2 - strh w3, [x4], #2 -4: adds x2, x2, #1 - b.mi 5f - ldrb w3, [x1] - strb w3, [x4] -5: ret + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15 + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwritting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb tmp1w, [src], #1 + strb tmp1w, [dst], #1 +1: + tbz tmp2, #1, 2f + ldrh tmp1w, [src], #2 + strh tmp1w, [dst], #2 +2: + tbz tmp2, #2, 3f + ldr tmp1w, [src], #4 + str tmp1w, [dst], #4 +3: + tbz tmp2, #3, .LSrcAligned + ldr tmp1, [src],#8 + str tmp1, [dst],#8 + +.LSrcAligned: + cmp count, #64 + b.ge .Lcpy_over64 + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp A_l, A_h, [src], #16 + stp A_l, A_h, [dst], #16 +1: + ldp A_l, A_h, [src], #16 + stp A_l, A_h, [dst], #16 +2: + ldp A_l, A_h, [src], #16 + stp A_l, A_h, [dst], #16 +.Ltiny15: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. + */ + tbz count, #3, 1f + ldr tmp1, [src], #8 + str tmp1, [dst], #8 +1: + tbz count, #2, 2f + ldr tmp1w, [src], #4 + str tmp1w, [dst], #4 +2: + tbz count, #1, 3f + ldrh tmp1w, [src], #2 + strh tmp1w, [dst], #2 +3: + tbz count, #0, .Lexitfunc + ldrb tmp1w, [src] + strb tmp1w, [dst] + +.Lexitfunc: + ret + +.Lcpy_over64: + subs count, count, #128 + b.ge .Lcpy_body_large + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp A_l, A_h, [src],#16 + stp A_l, A_h, [dst],#16 + ldp B_l, B_h, [src],#16 + ldp C_l, C_h, [src],#16 + stp B_l, B_h, [dst],#16 + stp C_l, C_h, [dst],#16 + ldp D_l, D_h, [src],#16 + stp D_l, D_h, [dst],#16 + + tst count, #0x3f + b.ne .Ltail63 + ret + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large: + /* pre-get 64 bytes data. */ + ldp A_l, A_h, [src],#16 + ldp B_l, B_h, [src],#16 + ldp C_l, C_h, [src],#16 + ldp D_l, D_h, [src],#16 +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp A_l, A_h, [dst],#16 + ldp A_l, A_h, [src],#16 + stp B_l, B_h, [dst],#16 + ldp B_l, B_h, [src],#16 + stp C_l, C_h, [dst],#16 + ldp C_l, C_h, [src],#16 + stp D_l, D_h, [dst],#16 + ldp D_l, D_h, [src],#16 + subs count, count, #64 + b.ge 1b + stp A_l, A_h, [dst],#16 + stp B_l, B_h, [dst],#16 + stp C_l, C_h, [dst],#16 + stp D_l, D_h, [dst],#16 + + tst count, #0x3f + b.ne .Ltail63 + ret ENDPROC(memcpy) diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S index b79fdfa42d39..57b19ea2dad4 100644 --- a/arch/arm64/lib/memmove.S +++ b/arch/arm64/lib/memmove.S @@ -1,5 +1,13 @@ /* * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,6 +24,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/cache.h> /* * Move a buffer from src to test (alignment handled by the hardware). @@ -28,30 +37,161 @@ * Returns: * x0 - dest */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +tmp3 .req x5 +tmp3w .req w5 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + ENTRY(memmove) - cmp x0, x1 - b.ls memcpy - add x4, x0, x2 - add x1, x1, x2 - subs x2, x2, #8 - b.mi 2f -1: ldr x3, [x1, #-8]! - subs x2, x2, #8 - str x3, [x4, #-8]! - b.pl 1b -2: adds x2, x2, #4 - b.mi 3f - ldr w3, [x1, #-4]! - sub x2, x2, #4 - str w3, [x4, #-4]! -3: adds x2, x2, #2 - b.mi 4f - ldrh w3, [x1, #-2]! - sub x2, x2, #2 - strh w3, [x4, #-2]! -4: adds x2, x2, #1 - b.mi 5f - ldrb w3, [x1, #-1] - strb w3, [x4, #-1] -5: ret + cmp dstin, src + b.lo memcpy + add tmp1, src, count + cmp dstin, tmp1 + b.hs memcpy /* No overlap. */ + + add dst, dstin, count + add src, src, count + cmp count, #16 + b.lo .Ltail15 /*probably non-alignment accesses.*/ + + ands tmp2, src, #15 /* Bytes to reach alignment. */ + b.eq .LSrcAligned + sub count, count, tmp2 + /* + * process the aligned offset length to make the src aligned firstly. + * those extra instructions' cost is acceptable. It also make the + * coming accesses are based on aligned address. + */ + tbz tmp2, #0, 1f + ldrb tmp1w, [src, #-1]! + strb tmp1w, [dst, #-1]! +1: + tbz tmp2, #1, 2f + ldrh tmp1w, [src, #-2]! + strh tmp1w, [dst, #-2]! +2: + tbz tmp2, #2, 3f + ldr tmp1w, [src, #-4]! + str tmp1w, [dst, #-4]! +3: + tbz tmp2, #3, .LSrcAligned + ldr tmp1, [src, #-8]! + str tmp1, [dst, #-8]! + +.LSrcAligned: + cmp count, #64 + b.ge .Lcpy_over64 + + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltail15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp A_l, A_h, [src, #-16]! + stp A_l, A_h, [dst, #-16]! +1: + ldp A_l, A_h, [src, #-16]! + stp A_l, A_h, [dst, #-16]! +2: + ldp A_l, A_h, [src, #-16]! + stp A_l, A_h, [dst, #-16]! + +.Ltail15: + tbz count, #3, 1f + ldr tmp1, [src, #-8]! + str tmp1, [dst, #-8]! +1: + tbz count, #2, 2f + ldr tmp1w, [src, #-4]! + str tmp1w, [dst, #-4]! +2: + tbz count, #1, 3f + ldrh tmp1w, [src, #-2]! + strh tmp1w, [dst, #-2]! +3: + tbz count, #0, .Lexitfunc + ldrb tmp1w, [src, #-1] + strb tmp1w, [dst, #-1] + +.Lexitfunc: + ret + +.Lcpy_over64: + subs count, count, #128 + b.ge .Lcpy_body_large + /* + * Less than 128 bytes to copy, so handle 64 bytes here and then jump + * to the tail. + */ + ldp A_l, A_h, [src, #-16] + stp A_l, A_h, [dst, #-16] + ldp B_l, B_h, [src, #-32] + ldp C_l, C_h, [src, #-48] + stp B_l, B_h, [dst, #-32] + stp C_l, C_h, [dst, #-48] + ldp D_l, D_h, [src, #-64]! + stp D_l, D_h, [dst, #-64]! + + tst count, #0x3f + b.ne .Ltail63 + ret + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large: + /* pre-load 64 bytes data. */ + ldp A_l, A_h, [src, #-16] + ldp B_l, B_h, [src, #-32] + ldp C_l, C_h, [src, #-48] + ldp D_l, D_h, [src, #-64]! +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp A_l, A_h, [dst, #-16] + ldp A_l, A_h, [src, #-16] + stp B_l, B_h, [dst, #-32] + ldp B_l, B_h, [src, #-32] + stp C_l, C_h, [dst, #-48] + ldp C_l, C_h, [src, #-48] + stp D_l, D_h, [dst, #-64]! + ldp D_l, D_h, [src, #-64]! + subs count, count, #64 + b.ge 1b + stp A_l, A_h, [dst, #-16] + stp B_l, B_h, [dst, #-32] + stp C_l, C_h, [dst, #-48] + stp D_l, D_h, [dst, #-64]! + + tst count, #0x3f + b.ne .Ltail63 + ret ENDPROC(memmove) diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S index 87e4a68fbbbc..7c72dfd36b63 100644 --- a/arch/arm64/lib/memset.S +++ b/arch/arm64/lib/memset.S @@ -1,5 +1,13 @@ /* * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,6 +24,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/cache.h> /* * Fill in the buffer with character c (alignment handled by the hardware) @@ -27,27 +36,181 @@ * Returns: * x0 - buf */ + +dstin .req x0 +val .req w1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +zva_len_x .req x5 +zva_len .req w5 +zva_bits_x .req x6 + +A_l .req x7 +A_lw .req w7 +dst .req x8 +tmp3w .req w9 +tmp3 .req x9 + ENTRY(memset) - mov x4, x0 - and w1, w1, #0xff - orr w1, w1, w1, lsl #8 - orr w1, w1, w1, lsl #16 - orr x1, x1, x1, lsl #32 - subs x2, x2, #8 - b.mi 2f -1: str x1, [x4], #8 - subs x2, x2, #8 - b.pl 1b -2: adds x2, x2, #4 - b.mi 3f - sub x2, x2, #4 - str w1, [x4], #4 -3: adds x2, x2, #2 - b.mi 4f - sub x2, x2, #2 - strh w1, [x4], #2 -4: adds x2, x2, #1 - b.mi 5f - strb w1, [x4] -5: ret + mov dst, dstin /* Preserve return value. */ + and A_lw, val, #255 + orr A_lw, A_lw, A_lw, lsl #8 + orr A_lw, A_lw, A_lw, lsl #16 + orr A_l, A_l, A_l, lsl #32 + + cmp count, #15 + b.hi .Lover16_proc + /*All store maybe are non-aligned..*/ + tbz count, #3, 1f + str A_l, [dst], #8 +1: + tbz count, #2, 2f + str A_lw, [dst], #4 +2: + tbz count, #1, 3f + strh A_lw, [dst], #2 +3: + tbz count, #0, 4f + strb A_lw, [dst] +4: + ret + +.Lover16_proc: + /*Whether the start address is aligned with 16.*/ + neg tmp2, dst + ands tmp2, tmp2, #15 + b.eq .Laligned +/* +* The count is not less than 16, we can use stp to store the start 16 bytes, +* then adjust the dst aligned with 16.This process will make the current +* memory address at alignment boundary. +*/ + stp A_l, A_l, [dst] /*non-aligned store..*/ + /*make the dst aligned..*/ + sub count, count, tmp2 + add dst, dst, tmp2 + +.Laligned: + cbz A_l, .Lzero_mem + +.Ltail_maybe_long: + cmp count, #64 + b.ge .Lnot_short +.Ltail63: + ands tmp1, count, #0x30 + b.eq 3f + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + stp A_l, A_l, [dst], #16 +1: + stp A_l, A_l, [dst], #16 +2: + stp A_l, A_l, [dst], #16 +/* +* The last store length is less than 16,use stp to write last 16 bytes. +* It will lead some bytes written twice and the access is non-aligned. +*/ +3: + ands count, count, #15 + cbz count, 4f + add dst, dst, count + stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */ +4: + ret + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line, this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lnot_short: + sub dst, dst, #16/* Pre-bias. */ + sub count, count, #64 +1: + stp A_l, A_l, [dst, #16] + stp A_l, A_l, [dst, #32] + stp A_l, A_l, [dst, #48] + stp A_l, A_l, [dst, #64]! + subs count, count, #64 + b.ge 1b + tst count, #0x3f + add dst, dst, #16 + b.ne .Ltail63 +.Lexitfunc: + ret + + /* + * For zeroing memory, check to see if we can use the ZVA feature to + * zero entire 'cache' lines. + */ +.Lzero_mem: + cmp count, #63 + b.le .Ltail63 + /* + * For zeroing small amounts of memory, it's not worth setting up + * the line-clear code. + */ + cmp count, #128 + b.lt .Lnot_short /*count is at least 128 bytes*/ + + mrs tmp1, dczid_el0 + tbnz tmp1, #4, .Lnot_short + mov tmp3w, #4 + and zva_len, tmp1w, #15 /* Safety: other bits reserved. */ + lsl zva_len, tmp3w, zva_len + + ands tmp3w, zva_len, #63 + /* + * ensure the zva_len is not less than 64. + * It is not meaningful to use ZVA if the block size is less than 64. + */ + b.ne .Lnot_short +.Lzero_by_line: + /* + * Compute how far we need to go to become suitably aligned. We're + * already at quad-word alignment. + */ + cmp count, zva_len_x + b.lt .Lnot_short /* Not enough to reach alignment. */ + sub zva_bits_x, zva_len_x, #1 + neg tmp2, dst + ands tmp2, tmp2, zva_bits_x + b.eq 2f /* Already aligned. */ + /* Not aligned, check that there's enough to copy after alignment.*/ + sub tmp1, count, tmp2 + /* + * grantee the remain length to be ZVA is bigger than 64, + * avoid to make the 2f's process over mem range.*/ + cmp tmp1, #64 + ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */ + b.lt .Lnot_short + /* + * We know that there's at least 64 bytes to zero and that it's safe + * to overrun by 64 bytes. + */ + mov count, tmp1 +1: + stp A_l, A_l, [dst] + stp A_l, A_l, [dst, #16] + stp A_l, A_l, [dst, #32] + subs tmp2, tmp2, #64 + stp A_l, A_l, [dst, #48] + add dst, dst, #64 + b.ge 1b + /* We've overrun a bit, so adjust dst downwards.*/ + add dst, dst, tmp2 +2: + sub count, count, zva_len_x +3: + dc zva, dst + add dst, dst, zva_len_x + subs count, count, zva_len_x + b.ge 3b + ands count, count, zva_bits_x + b.ne .Ltail_maybe_long + ret ENDPROC(memset) diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S new file mode 100644 index 000000000000..42f828b06c59 --- /dev/null +++ b/arch/arm64/lib/strcmp.S @@ -0,0 +1,234 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * compare two strings + * + * Parameters: + * x0 - const string 1 pointer + * x1 - const string 2 pointer + * Returns: + * x0 - an integer less than, equal to, or greater than zero + * if s1 is found, respectively, to be less than, to match, + * or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result. */ +src1 .req x0 +src2 .req x1 +result .req x0 + +/* Internal variables. */ +data1 .req x2 +data1w .req w2 +data2 .req x3 +data2w .req w3 +has_nul .req x4 +diff .req x5 +syndrome .req x6 +tmp1 .req x7 +tmp2 .req x8 +tmp3 .req x9 +zeroones .req x10 +pos .req x11 + +ENTRY(strcmp) + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + b.ne .Lmisaligned8 + ands tmp1, src1, #7 + b.ne .Lmutual_align + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + orr syndrome, diff, has_nul + cbz syndrome, .Lloop_aligned + b .Lcal_cmpresult + +.Lmutual_align: + /* + * Sources are mutually aligned, but are not currently at an + * alignment boundary. Round down the addresses and then mask off + * the bytes that preceed the start point. + */ + bic src1, src1, #7 + bic src2, src2, #7 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + ldr data1, [src1], #8 + neg tmp1, tmp1 /* Bits to alignment -64. */ + ldr data2, [src2], #8 + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + + orr data1, data1, tmp2 + orr data2, data2, tmp2 + b .Lstart_realigned + +.Lmisaligned8: + /* + * Get the align offset length to compare per byte first. + * After this process, one string's address will be aligned. + */ + and tmp1, src1, #7 + neg tmp1, tmp1 + add tmp1, tmp1, #8 + and tmp2, src2, #7 + neg tmp2, tmp2 + add tmp2, tmp2, #8 + subs tmp3, tmp1, tmp2 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ +.Ltinycmp: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs pos, pos, #1 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltinycmp + cbnz pos, 1f /*find the null or unequal...*/ + cmp data1w, #1 + ccmp data1w, data2w, #0, cs + b.eq .Lstart_align /*the last bytes are equal....*/ +1: + sub result, data1, data2 + ret + +.Lstart_align: + ands xzr, src1, #7 + b.eq .Lrecal_offset + /*process more leading bytes to make str1 aligned...*/ + add src1, src1, tmp3 + add src2, src2, tmp3 + /*load 8 bytes from aligned str1 and non-aligned str2..*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbnz syndrome, .Lcal_cmpresult + /*How far is the current str2 from the alignment boundary...*/ + and tmp3, tmp3, #7 +.Lrecal_offset: + neg pos, tmp3 +.Lloopcmp_proc: + /* + * Divide the eight bytes into two parts. First,backwards the src2 + * to an alignment boundary,load eight bytes from the SRC2 alignment + * boundary,then compare with the relative bytes from SRC1. + * If all 8 bytes are equal,then start the second part's comparison. + * Otherwise finish the comparison. + * This special handle can garantee all the accesses are in the + * thread/task space in avoid to overrange access. + */ + ldr data1, [src1,pos] + ldr data2, [src2,pos] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbnz syndrome, .Lcal_cmpresult + + /*The second part process*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bic has_nul, tmp1, tmp2 + eor diff, data1, data2 /* Non-zero if differences found. */ + orr syndrome, diff, has_nul + cbz syndrome, .Lloopcmp_proc + +.Lcal_cmpresult: + /* + * reversed the byte-order as big-endian,then CLZ can find the most + * significant zero bits. + */ +CPU_LE( rev syndrome, syndrome ) +CPU_LE( rev data1, data1 ) +CPU_LE( rev data2, data2 ) + + /* + * For big-endian we cannot use the trick with the syndrome value + * as carry-propagation can corrupt the upper bits if the trailing + * bytes in the string contain 0x01. + * However, if there is no NUL byte in the dword, we can generate + * the result directly. We ca not just subtract the bytes as the + * MSB might be significant. + */ +CPU_BE( cbnz has_nul, 1f ) +CPU_BE( cmp data1, data2 ) +CPU_BE( cset result, ne ) +CPU_BE( cneg result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) + /*Re-compute the NUL-byte detection, using a byte-reversed value. */ +CPU_BE( rev tmp3, data1 ) +CPU_BE( sub tmp1, tmp3, zeroones ) +CPU_BE( orr tmp2, tmp3, #REP8_7f ) +CPU_BE( bic has_nul, tmp1, tmp2 ) +CPU_BE( rev has_nul, has_nul ) +CPU_BE( orr syndrome, diff, has_nul ) + + clz pos, syndrome + /* + * The MS-non-zero bit of the syndrome marks either the first bit + * that is different, or the top bit of the first zero byte. + * Shifting left now will bring the critical information into the + * top bits. + */ + lsl data1, data1, pos + lsl data2, data2, pos + /* + * But we need to zero-extend (char is unsigned) the value and then + * perform a signed 32-bit subtraction. + */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +ENDPROC(strcmp) diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S new file mode 100644 index 000000000000..987b68b9ce44 --- /dev/null +++ b/arch/arm64/lib/strlen.S @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * calculate the length of a string + * + * Parameters: + * x0 - const string pointer + * Returns: + * x0 - the return length of specific string + */ + +/* Arguments and results. */ +srcin .req x0 +len .req x0 + +/* Locals and temporaries. */ +src .req x1 +data1 .req x2 +data2 .req x3 +data2a .req x4 +has_nul1 .req x5 +has_nul2 .req x6 +tmp1 .req x7 +tmp2 .req x8 +tmp3 .req x9 +tmp4 .req x10 +zeroones .req x11 +pos .req x12 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +ENTRY(strlen) + mov zeroones, #REP8_01 + bic src, srcin, #15 + ands tmp1, srcin, #15 + b.ne .Lmisaligned + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ + /* + * The inner loop deals with two Dwords at a time. This has a + * slightly higher start-up cost, but we should win quite quickly, + * especially on cores with a high number of issue slots per + * cycle, as we get much better parallelism out of the operations. + */ +.Lloop: + ldp data1, data2, [src], #16 +.Lrealigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bics has_nul2, tmp3, tmp4 + ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ + b.eq .Lloop + + sub len, src, srcin + cbz has_nul1, .Lnul_in_data2 +CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ + sub len, len, #8 + mov has_nul2, has_nul1 +.Lnul_in_data2: + /* + * For big-endian, carry propagation (if the final byte in the + * string is 0x01) means we cannot use has_nul directly. The + * easiest way to get the correct byte is to byte-swap the data + * and calculate the syndrome a second time. + */ +CPU_BE( rev data2, data2 ) +CPU_BE( sub tmp1, data2, zeroones ) +CPU_BE( orr tmp2, data2, #REP8_7f ) +CPU_BE( bic has_nul2, tmp1, tmp2 ) + + sub len, len, #8 + rev has_nul2, has_nul2 + clz pos, has_nul2 + add len, len, pos, lsr #3 /* Bits to bytes. */ + ret + +.Lmisaligned: + cmp tmp1, #8 + neg tmp1, tmp1 + ldp data1, data2, [src], #16 + lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + csinv data1, data1, xzr, le + csel data2, data2, data2a, le + b .Lrealigned +ENDPROC(strlen) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S new file mode 100644 index 000000000000..0224cf5a5533 --- /dev/null +++ b/arch/arm64/lib/strncmp.S @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * compare two strings + * + * Parameters: + * x0 - const string 1 pointer + * x1 - const string 2 pointer + * x2 - the maximal length to be compared + * Returns: + * x0 - an integer less than, equal to, or greater than zero if s1 is found, + * respectively, to be less than, to match, or be greater than s2. + */ + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +/* Parameters and result. */ +src1 .req x0 +src2 .req x1 +limit .req x2 +result .req x0 + +/* Internal variables. */ +data1 .req x3 +data1w .req w3 +data2 .req x4 +data2w .req w4 +has_nul .req x5 +diff .req x6 +syndrome .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +zeroones .req x11 +pos .req x12 +limit_wd .req x13 +mask .req x14 +endloop .req x15 + +ENTRY(strncmp) + cbz limit, .Lret0 + eor tmp1, src1, src2 + mov zeroones, #REP8_01 + tst tmp1, #7 + b.ne .Lmisaligned8 + ands tmp1, src1, #7 + b.ne .Lmutual_align + /* Calculate the number of full and partial words -1. */ + /* + * when limit is mulitply of 8, if not sub 1, + * the judgement of last dword will wrong. + */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ +.Lloop_aligned: + ldr data1, [src1], #8 + ldr data2, [src2], #8 +.Lstart_realigned: + subs limit_wd, limit_wd, #1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp endloop, #0, #0, eq + b.eq .Lloop_aligned + + /*Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, .Lnot_limit + + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq .Lnot_limit + + lsl limit, limit, #3 /* Bits -> bytes. */ + mov mask, #~0 +CPU_BE( lsr mask, mask, limit ) +CPU_LE( lsl mask, mask, limit ) + bic data1, data1, mask + bic data2, data2, mask + + /* Make sure that the NUL byte is marked in the syndrome. */ + orr has_nul, has_nul, mask + +.Lnot_limit: + orr syndrome, diff, has_nul + b .Lcal_cmpresult + +.Lmutual_align: + /* + * Sources are mutually aligned, but are not currently at an + * alignment boundary. Round down the addresses and then mask off + * the bytes that precede the start point. + * We also need to adjust the limit calculations, but without + * overflowing if the limit is near ULONG_MAX. + */ + bic src1, src1, #7 + bic src2, src2, #7 + ldr data1, [src1], #8 + neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + ldr data2, [src2], #8 + mov tmp2, #~0 + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ + + and tmp3, limit_wd, #7 + lsr limit_wd, limit_wd, #3 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ + add limit, limit, tmp1 + add tmp3, tmp3, tmp1 + orr data1, data1, tmp2 + orr data2, data2, tmp2 + add limit_wd, limit_wd, tmp3, lsr #3 + b .Lstart_realigned + +/*when src1 offset is not equal to src2 offset...*/ +.Lmisaligned8: + cmp limit, #8 + b.lo .Ltiny8proc /*limit < 8... */ + /* + * Get the align offset length to compare per byte first. + * After this process, one string's address will be aligned.*/ + and tmp1, src1, #7 + neg tmp1, tmp1 + add tmp1, tmp1, #8 + and tmp2, src2, #7 + neg tmp2, tmp2 + add tmp2, tmp2, #8 + subs tmp3, tmp1, tmp2 + csel pos, tmp1, tmp2, hi /*Choose the maximum. */ + /* + * Here, limit is not less than 8, so directly run .Ltinycmp + * without checking the limit.*/ + sub limit, limit, pos +.Ltinycmp: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs pos, pos, #1 + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltinycmp + cbnz pos, 1f /*find the null or unequal...*/ + cmp data1w, #1 + ccmp data1w, data2w, #0, cs + b.eq .Lstart_align /*the last bytes are equal....*/ +1: + sub result, data1, data2 + ret + +.Lstart_align: + lsr limit_wd, limit, #3 + cbz limit_wd, .Lremain8 + /*process more leading bytes to make str1 aligned...*/ + ands xzr, src1, #7 + b.eq .Lrecal_offset + add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ + add src2, src2, tmp3 + ldr data1, [src1], #8 + ldr data2, [src2], #8 + + sub limit, limit, tmp3 + lsr limit_wd, limit, #3 + subs limit_wd, limit_wd, #1 + + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ + bics has_nul, tmp1, tmp2 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ + b.ne .Lunequal_proc + /*How far is the current str2 from the alignment boundary...*/ + and tmp3, tmp3, #7 +.Lrecal_offset: + neg pos, tmp3 +.Lloopcmp_proc: + /* + * Divide the eight bytes into two parts. First,backwards the src2 + * to an alignment boundary,load eight bytes from the SRC2 alignment + * boundary,then compare with the relative bytes from SRC1. + * If all 8 bytes are equal,then start the second part's comparison. + * Otherwise finish the comparison. + * This special handle can garantee all the accesses are in the + * thread/task space in avoid to overrange access. + */ + ldr data1, [src1,pos] + ldr data2, [src2,pos] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, eq + cbnz endloop, .Lunequal_proc + + /*The second part process*/ + ldr data1, [src1], #8 + ldr data2, [src2], #8 + subs limit_wd, limit_wd, #1 + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ + bics has_nul, tmp1, tmp2 + ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ + b.eq .Lloopcmp_proc + +.Lunequal_proc: + orr syndrome, diff, has_nul + cbz syndrome, .Lremain8 +.Lcal_cmpresult: + /* + * reversed the byte-order as big-endian,then CLZ can find the most + * significant zero bits. + */ +CPU_LE( rev syndrome, syndrome ) +CPU_LE( rev data1, data1 ) +CPU_LE( rev data2, data2 ) + /* + * For big-endian we cannot use the trick with the syndrome value + * as carry-propagation can corrupt the upper bits if the trailing + * bytes in the string contain 0x01. + * However, if there is no NUL byte in the dword, we can generate + * the result directly. We can't just subtract the bytes as the + * MSB might be significant. + */ +CPU_BE( cbnz has_nul, 1f ) +CPU_BE( cmp data1, data2 ) +CPU_BE( cset result, ne ) +CPU_BE( cneg result, result, lo ) +CPU_BE( ret ) +CPU_BE( 1: ) + /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ +CPU_BE( rev tmp3, data1 ) +CPU_BE( sub tmp1, tmp3, zeroones ) +CPU_BE( orr tmp2, tmp3, #REP8_7f ) +CPU_BE( bic has_nul, tmp1, tmp2 ) +CPU_BE( rev has_nul, has_nul ) +CPU_BE( orr syndrome, diff, has_nul ) + /* + * The MS-non-zero bit of the syndrome marks either the first bit + * that is different, or the top bit of the first zero byte. + * Shifting left now will bring the critical information into the + * top bits. + */ + clz pos, syndrome + lsl data1, data1, pos + lsl data2, data2, pos + /* + * But we need to zero-extend (char is unsigned) the value and then + * perform a signed 32-bit subtraction. + */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret + +.Lremain8: + /* Limit % 8 == 0 => all bytes significant. */ + ands limit, limit, #7 + b.eq .Lret0 +.Ltiny8proc: + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + subs limit, limit, #1 + + ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq .Ltiny8proc + sub result, data1, data2 + ret + +.Lret0: + mov result, #0 + ret +ENDPROC(strncmp) diff --git a/arch/arm64/lib/strnlen.S b/arch/arm64/lib/strnlen.S new file mode 100644 index 000000000000..2ca665711bf2 --- /dev/null +++ b/arch/arm64/lib/strnlen.S @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + +/* + * determine the length of a fixed-size string + * + * Parameters: + * x0 - const string pointer + * x1 - maximal string length + * Returns: + * x0 - the return length of specific string + */ + +/* Arguments and results. */ +srcin .req x0 +len .req x0 +limit .req x1 + +/* Locals and temporaries. */ +src .req x2 +data1 .req x3 +data2 .req x4 +data2a .req x5 +has_nul1 .req x6 +has_nul2 .req x7 +tmp1 .req x8 +tmp2 .req x9 +tmp3 .req x10 +tmp4 .req x11 +zeroones .req x12 +pos .req x13 +limit_wd .req x14 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f +#define REP8_80 0x8080808080808080 + +ENTRY(strnlen) + cbz limit, .Lhit_limit + mov zeroones, #REP8_01 + bic src, srcin, #15 + ands tmp1, srcin, #15 + b.ne .Lmisaligned + /* Calculate the number of full and partial words -1. */ + sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */ + + /* + * NUL detection works on the principle that (X - 1) & (~X) & 0x80 + * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + * can be done in parallel across the entire word. + */ + /* + * The inner loop deals with two Dwords at a time. This has a + * slightly higher start-up cost, but we should win quite quickly, + * especially on cores with a high number of issue slots per + * cycle, as we get much better parallelism out of the operations. + */ +.Lloop: + ldp data1, data2, [src], #16 +.Lrealigned: + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, #REP8_7f + bic has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + subs limit_wd, limit_wd, #1 + orr tmp1, has_nul1, has_nul2 + ccmp tmp1, #0, #0, pl /* NZCV = 0000 */ + b.eq .Lloop + + cbz tmp1, .Lhit_limit /* No null in final Qword. */ + + /* + * We know there's a null in the final Qword. The easiest thing + * to do now is work out the length of the string and return + * MIN (len, limit). + */ + sub len, src, srcin + cbz has_nul1, .Lnul_in_data2 +CPU_BE( mov data2, data1 ) /*perpare data to re-calculate the syndrome*/ + + sub len, len, #8 + mov has_nul2, has_nul1 +.Lnul_in_data2: + /* + * For big-endian, carry propagation (if the final byte in the + * string is 0x01) means we cannot use has_nul directly. The + * easiest way to get the correct byte is to byte-swap the data + * and calculate the syndrome a second time. + */ +CPU_BE( rev data2, data2 ) +CPU_BE( sub tmp1, data2, zeroones ) +CPU_BE( orr tmp2, data2, #REP8_7f ) +CPU_BE( bic has_nul2, tmp1, tmp2 ) + + sub len, len, #8 + rev has_nul2, has_nul2 + clz pos, has_nul2 + add len, len, pos, lsr #3 /* Bits to bytes. */ + cmp len, limit + csel len, len, limit, ls /* Return the lower value. */ + ret + +.Lmisaligned: + /* + * Deal with a partial first word. + * We're doing two things in parallel here; + * 1) Calculate the number of words (but avoiding overflow if + * limit is near ULONG_MAX) - to do this we need to work out + * limit + tmp1 - 1 as a 65-bit value before shifting it; + * 2) Load and mask the initial data words - we force the bytes + * before the ones we are interested in to 0xff - this ensures + * early bytes will not hit any zero detection. + */ + ldp data1, data2, [src], #16 + + sub limit_wd, limit, #1 + and tmp3, limit_wd, #15 + lsr limit_wd, limit_wd, #4 + + add tmp3, tmp3, tmp1 + add limit_wd, limit_wd, tmp3, lsr #4 + + neg tmp4, tmp1 + lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */ + + mov tmp2, #~0 + /* Big-endian. Early bytes are at MSB. */ +CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ + /* Little-endian. Early bytes are at LSB. */ +CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp1 & 63). */ + + cmp tmp1, #8 + + orr data1, data1, tmp2 + orr data2a, data2, tmp2 + + csinv data1, data1, xzr, le + csel data2, data2, data2a, le + b .Lrealigned + +.Lhit_limit: + mov len, limit + ret +ENDPROC(strnlen) diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index b51d36401d83..3ecb56c624d3 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -1,5 +1,5 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mmap.o pgd.o mmu.o \ - context.o tlb.o proc.o + context.o proc.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index fda756875fa6..23663837acff 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -31,7 +31,7 @@ * Corrupted registers: x0-x7, x9-x11 */ __flush_dcache_all: - dsb sy // ensure ordering with previous memory accesses + dmb sy // ensure ordering with previous memory accesses mrs x0, clidr_el1 // read clidr and x3, x0, #0x7000000 // extract loc from clidr lsr x3, x3, #23 // left align loc bit field @@ -128,7 +128,7 @@ USER(9f, dc cvau, x4 ) // clean D line to PoU add x4, x4, x2 cmp x4, x1 b.lo 1b - dsb sy + dsb ish icache_line_size x2, x3 sub x3, x2, #1 @@ -139,7 +139,7 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU cmp x4, x1 b.lo 1b 9: // ignore any faulting cache operation - dsb sy + dsb ish isb ret ENDPROC(flush_icache_range) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index c851eb44dc50..4164c5ace9f8 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -115,7 +115,7 @@ static void *__dma_alloc_noncoherent(struct device *dev, size_t size, for (i = 0; i < (size >> PAGE_SHIFT); i++) map[i] = page + i; coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP, - __get_dma_pgprot(attrs, pgprot_default, false)); + __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false)); kfree(map); if (!coherent_ptr) goto no_map; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c23751b06120..bcc965e2cce1 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -32,6 +32,7 @@ #include <asm/exception.h> #include <asm/debug-monitors.h> +#include <asm/esr.h> #include <asm/system_misc.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -123,6 +124,7 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr, } tsk->thread.fault_address = addr; + tsk->thread.fault_code = esr; si.si_signo = sig; si.si_errno = 0; si.si_code = code; @@ -148,8 +150,6 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re #define VM_FAULT_BADMAP 0x010000 #define VM_FAULT_BADACCESS 0x020000 -#define ESR_WRITE (1 << 6) -#define ESR_CM (1 << 8) #define ESR_LNX_EXEC (1 << 24) static int __do_page_fault(struct mm_struct *mm, unsigned long addr, @@ -218,7 +218,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, if (esr & ESR_LNX_EXEC) { vm_flags = VM_EXEC; - } else if ((esr & ESR_WRITE) && !(esr & ESR_CM)) { + } else if ((esr & ESR_EL1_WRITE) && !(esr & ESR_EL1_CM)) { vm_flags = VM_WRITE; mm_flags |= FAULT_FLAG_WRITE; } @@ -525,7 +525,7 @@ asmlinkage int __exception do_debug_exception(unsigned long addr, info.si_errno = 0; info.si_code = inf->code; info.si_addr = (void __user *)addr; - arm64_notify_die("", regs, &info, esr); + arm64_notify_die("", regs, &info, 0); return 0; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 0a472c41a67f..c43f1dd19489 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -43,11 +43,6 @@ struct page *empty_zero_page; EXPORT_SYMBOL(empty_zero_page); -pgprot_t pgprot_default; -EXPORT_SYMBOL(pgprot_default); - -static pmdval_t prot_sect_kernel; - struct cachepolicy { const char policy[16]; u64 mair; @@ -122,33 +117,6 @@ static int __init early_cachepolicy(char *p) } early_param("cachepolicy", early_cachepolicy); -/* - * Adjust the PMD section entries according to the CPU in use. - */ -void __init init_mem_pgprot(void) -{ - pteval_t default_pgprot; - int i; - - default_pgprot = PTE_ATTRINDX(MT_NORMAL); - prot_sect_kernel = PMD_TYPE_SECT | PMD_SECT_AF | PMD_ATTRINDX(MT_NORMAL); - -#ifdef CONFIG_SMP - /* - * Mark memory with the "shared" attribute for SMP systems - */ - default_pgprot |= PTE_SHARED; - prot_sect_kernel |= PMD_SECT_S; -#endif - - for (i = 0; i < 16; i++) { - unsigned long v = pgprot_val(protection_map[i]); - protection_map[i] = __pgprot(v | default_pgprot); - } - - pgprot_default = __pgprot(PTE_TYPE_PAGE | PTE_AF | default_pgprot); -} - pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -168,7 +136,8 @@ static void __init *early_alloc(unsigned long sz) } static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long pfn) + unsigned long end, unsigned long pfn, + pgprot_t prot) { pte_t *pte; @@ -180,16 +149,27 @@ static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr, pte = pte_offset_kernel(pmd, addr); do { - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); + set_pte(pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); } static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, - unsigned long end, phys_addr_t phys) + unsigned long end, phys_addr_t phys, + int map_io) { pmd_t *pmd; unsigned long next; + pmdval_t prot_sect; + pgprot_t prot_pte; + + if (map_io) { + prot_sect = PROT_SECT_DEVICE_nGnRE; + prot_pte = __pgprot(PROT_DEVICE_nGnRE); + } else { + prot_sect = PROT_SECT_NORMAL_EXEC; + prot_pte = PAGE_KERNEL_EXEC; + } /* * Check for initial section mappings in the pgd/pud and remove them. @@ -205,7 +185,7 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0) { pmd_t old_pmd =*pmd; - set_pmd(pmd, __pmd(phys | prot_sect_kernel)); + set_pmd(pmd, __pmd(phys | prot_sect)); /* * Check for previous table entries created during * boot (__create_page_tables) and flush them. @@ -213,21 +193,46 @@ static void __init alloc_init_pmd(pud_t *pud, unsigned long addr, if (!pmd_none(old_pmd)) flush_tlb_all(); } else { - alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys)); + alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), + prot_pte); } phys += next - addr; } while (pmd++, addr = next, addr != end); } static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, - unsigned long end, unsigned long phys) + unsigned long end, unsigned long phys, + int map_io) { pud_t *pud = pud_offset(pgd, addr); unsigned long next; do { next = pud_addr_end(addr, end); - alloc_init_pmd(pud, addr, next, phys); + + /* + * For 4K granule only, attempt to put down a 1GB block + */ + if (!map_io && (PAGE_SHIFT == 12) && + ((addr | next | phys) & ~PUD_MASK) == 0) { + pud_t old_pud = *pud; + set_pud(pud, __pud(phys | PROT_SECT_NORMAL_EXEC)); + + /* + * If we have an old value for a pud, it will + * be pointing to a pmd table that we no longer + * need (from swapper_pg_dir). + * + * Look up the old pmd table and free it. + */ + if (!pud_none(old_pud)) { + phys_addr_t table = __pa(pmd_offset(&old_pud, 0)); + memblock_free(table, PAGE_SIZE); + flush_tlb_all(); + } + } else { + alloc_init_pmd(pud, addr, next, phys, map_io); + } phys += next - addr; } while (pud++, addr = next, addr != end); } @@ -236,30 +241,44 @@ static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr, * Create the page directory entries and any necessary page tables for the * mapping specified by 'md'. */ -static void __init create_mapping(phys_addr_t phys, unsigned long virt, - phys_addr_t size) +static void __init __create_mapping(pgd_t *pgd, phys_addr_t phys, + unsigned long virt, phys_addr_t size, + int map_io) { unsigned long addr, length, end, next; - pgd_t *pgd; - - if (virt < VMALLOC_START) { - pr_warning("BUG: not creating mapping for 0x%016llx at 0x%016lx - outside kernel range\n", - phys, virt); - return; - } addr = virt & PAGE_MASK; length = PAGE_ALIGN(size + (virt & ~PAGE_MASK)); - pgd = pgd_offset_k(addr); end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(pgd, addr, next, phys); + alloc_init_pud(pgd, addr, next, phys, map_io); phys += next - addr; } while (pgd++, addr = next, addr != end); } +static void __init create_mapping(phys_addr_t phys, unsigned long virt, + phys_addr_t size) +{ + if (virt < VMALLOC_START) { + pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", + &phys, virt); + return; + } + __create_mapping(pgd_offset_k(virt & PAGE_MASK), phys, virt, size, 0); +} + +void __init create_id_mapping(phys_addr_t addr, phys_addr_t size, int map_io) +{ + if ((addr >> PGDIR_SHIFT) >= ARRAY_SIZE(idmap_pg_dir)) { + pr_warn("BUG: not creating id mapping for %pa\n", &addr); + return; + } + __create_mapping(&idmap_pg_dir[pgd_index(addr)], + addr, addr, size, map_io); +} + static void __init map_mem(void) { struct memblock_region *reg; @@ -370,6 +389,9 @@ int kern_addr_valid(unsigned long addr) if (pud_none(*pud)) return 0; + if (pud_sect(*pud)) + return pfn_valid(pud_pfn(*pud)); + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; @@ -417,7 +439,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) if (!p) return -ENOMEM; - set_pmd(pmd, __pmd(__pa(p) | prot_sect_kernel)); + set_pmd(pmd, __pmd(__pa(p) | PROT_SECT_NORMAL)); } else vmemmap_verify((pte_t *)pmd, node, addr, next); } while (addr = next, addr != end); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 9042aff5e9e3..7736779c9809 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -182,7 +182,7 @@ ENDPROC(cpu_do_switch_mm) ENTRY(__cpu_setup) ic iallu // I+BTB cache invalidate tlbi vmalle1is // invalidate I + D TLBs - dsb sy + dsb ish mov x0, #3 << 20 msr cpacr_el1, x0 // Enable FP/ASIMD diff --git a/arch/arm64/mm/tlb.S b/arch/arm64/mm/tlb.S deleted file mode 100644 index 19da91e0cd27..000000000000 --- a/arch/arm64/mm/tlb.S +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Based on arch/arm/mm/tlb.S - * - * Copyright (C) 1997-2002 Russell King - * Copyright (C) 2012 ARM Ltd. - * Written by Catalin Marinas <catalin.marinas@arm.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/asm-offsets.h> -#include <asm/page.h> -#include <asm/tlbflush.h> -#include "proc-macros.S" - -/* - * __cpu_flush_user_tlb_range(start, end, vma) - * - * Invalidate a range of TLB entries in the specified address space. - * - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - * - vma - vma_struct describing address range - */ -ENTRY(__cpu_flush_user_tlb_range) - vma_vm_mm x3, x2 // get vma->vm_mm - mmid w3, x3 // get vm_mm->context.id - dsb sy - lsr x0, x0, #12 // align address - lsr x1, x1, #12 - bfi x0, x3, #48, #16 // start VA and ASID - bfi x1, x3, #48, #16 // end VA and ASID -1: tlbi vae1is, x0 // TLB invalidate by address and ASID - add x0, x0, #1 - cmp x0, x1 - b.lo 1b - dsb sy - ret -ENDPROC(__cpu_flush_user_tlb_range) - -/* - * __cpu_flush_kern_tlb_range(start,end) - * - * Invalidate a range of kernel TLB entries. - * - * - start - start address (may not be aligned) - * - end - end address (exclusive, may not be aligned) - */ -ENTRY(__cpu_flush_kern_tlb_range) - dsb sy - lsr x0, x0, #12 // align address - lsr x1, x1, #12 -1: tlbi vaae1is, x0 // TLB invalidate by address - add x0, x0, #1 - cmp x0, x1 - b.lo 1b - dsb sy - isb - ret -ENDPROC(__cpu_flush_kern_tlb_range) diff --git a/arch/blackfin/include/asm/ftrace.h b/arch/blackfin/include/asm/ftrace.h index 8a029505d7b7..2f1c3c2657ad 100644 --- a/arch/blackfin/include/asm/ftrace.h +++ b/arch/blackfin/include/asm/ftrace.h @@ -66,16 +66,7 @@ extern inline void *return_address(unsigned int level) #endif /* CONFIG_FRAME_POINTER */ -#define HAVE_ARCH_CALLER_ADDR - -/* inline function or macro may lead to unexpected result */ -#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -#define CALLER_ADDR1 ((unsigned long)return_address(1)) -#define CALLER_ADDR2 ((unsigned long)return_address(2)) -#define CALLER_ADDR3 ((unsigned long)return_address(3)) -#define CALLER_ADDR4 ((unsigned long)return_address(4)) -#define CALLER_ADDR5 ((unsigned long)return_address(5)) -#define CALLER_ADDR6 ((unsigned long)return_address(6)) +#define ftrace_return_address(n) return_address(n) #endif /* __ASSEMBLY__ */ diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index e1f88e028cfe..8b8fe671b1a6 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -117,6 +117,7 @@ put_reg(struct task_struct *task, unsigned long regno, unsigned long data) int is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long len) { + bool valid; struct vm_area_struct *vma; struct sram_list_struct *sraml; @@ -124,9 +125,12 @@ is_user_addr_valid(struct task_struct *child, unsigned long start, unsigned long if (start + len < start) return -EIO; + down_read(&child->mm->mmap_sem); vma = find_vma(child->mm, start); - if (vma && start >= vma->vm_start && start + len <= vma->vm_end) - return 0; + valid = vma && start >= vma->vm_start && start + len <= vma->vm_end; + up_read(&child->mm->mmap_sem); + if (valid) + return 0; for (sraml = child->mm->context.sram_list; sraml; sraml = sraml->next) if (start >= (unsigned long)sraml->addr diff --git a/arch/cris/arch-v10/drivers/gpio.c b/arch/cris/arch-v10/drivers/gpio.c index f4374bae4fb4..64285e0d3481 100644 --- a/arch/cris/arch-v10/drivers/gpio.c +++ b/arch/cris/arch-v10/drivers/gpio.c @@ -833,8 +833,8 @@ static int __init gpio_init(void) printk(KERN_INFO "ETRAX 100LX GPIO driver v2.5, (c) 2001-2008 " "Axis Communications AB\n"); /* We call etrax_gpio_wake_up_check() from timer interrupt and - * from cpu_idle() in kernel/process.c - * The check in cpu_idle() reduces latency from ~15 ms to ~6 ms + * from default_idle() in kernel/process.c + * The check in default_idle() reduces latency from ~15 ms to ~6 ms * in some tests. */ res = request_irq(TIMER0_IRQ_NBR, gpio_poll_timer_interrupt, diff --git a/arch/cris/arch-v32/drivers/mach-fs/gpio.c b/arch/cris/arch-v32/drivers/mach-fs/gpio.c index 9e54273af0ca..009f4ee1bd09 100644 --- a/arch/cris/arch-v32/drivers/mach-fs/gpio.c +++ b/arch/cris/arch-v32/drivers/mach-fs/gpio.c @@ -958,11 +958,7 @@ gpio_init(void) printk(KERN_INFO "ETRAX FS GPIO driver v2.5, (c) 2003-2007 " "Axis Communications AB\n"); - /* We call etrax_gpio_wake_up_check() from timer interrupt and - * from cpu_idle() in kernel/process.c - * The check in cpu_idle() reduces latency from ~15 ms to ~6 ms - * in some tests. - */ + /* We call etrax_gpio_wake_up_check() from timer interrupt */ if (request_irq(TIMER0_INTR_VECT, gpio_poll_timer_interrupt, IRQF_SHARED, "gpio poll", &alarmlist)) printk(KERN_ERR "timer0 irq for gpio\n"); diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index b942f4032d7a..2955f359e2a7 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -237,7 +237,7 @@ kdump_init_notifier(struct notifier_block *self, unsigned long val, void *data) } #ifdef CONFIG_SYSCTL -static ctl_table kdump_ctl_table[] = { +static struct ctl_table kdump_ctl_table[] = { { .procname = "kdump_on_init", .data = &kdump_on_init, @@ -255,7 +255,7 @@ static ctl_table kdump_ctl_table[] = { { } }; -static ctl_table sys_table[] = { +static struct ctl_table sys_table[] = { { .procname = "kernel", .mode = 0555, diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d841c4bd6864..5845ffea67c3 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -521,7 +521,7 @@ static pmu_config_t *pmu_conf; pfm_sysctl_t pfm_sysctl; EXPORT_SYMBOL(pfm_sysctl); -static ctl_table pfm_ctl_table[]={ +static struct ctl_table pfm_ctl_table[] = { { .procname = "debug", .data = &pfm_sysctl.debug, @@ -552,7 +552,7 @@ static ctl_table pfm_ctl_table[]={ }, {} }; -static ctl_table pfm_sysctl_dir[] = { +static struct ctl_table pfm_sysctl_dir[] = { { .procname = "perfmon", .mode = 0555, @@ -560,7 +560,7 @@ static ctl_table pfm_sysctl_dir[] = { }, {} }; -static ctl_table pfm_sysctl_root[] = { +static struct ctl_table pfm_sysctl_root[] = { { .procname = "kernel", .mode = 0555, diff --git a/arch/m68k/include/asm/signal.h b/arch/m68k/include/asm/signal.h index 214320b50384..8c8ce5e1ee0e 100644 --- a/arch/m68k/include/asm/signal.h +++ b/arch/m68k/include/asm/signal.h @@ -60,15 +60,6 @@ static inline int __gen_sigismember(sigset_t *set, int _sig) __const_sigismember(set,sig) : \ __gen_sigismember(set,sig)) -static inline int sigfindinword(unsigned long word) -{ - asm ("bfffo %1{#0,#0},%0" - : "=d" (word) - : "d" (word & -word) - : "cc"); - return word ^ 31; -} - #endif /* !CONFIG_CPU_HAS_NO_BITFIELDS */ #ifndef __uClinux__ diff --git a/arch/microblaze/configs/mmu_defconfig b/arch/microblaze/configs/mmu_defconfig index deaf45ab6429..e2f6543b91e7 100644 --- a/arch/microblaze/configs/mmu_defconfig +++ b/arch/microblaze/configs/mmu_defconfig @@ -49,6 +49,7 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_UARTLITE=y CONFIG_SERIAL_UARTLITE_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_HW_RANDOM is not set CONFIG_XILINX_HWICAP=y CONFIG_I2C=y diff --git a/arch/microblaze/configs/nommu_defconfig b/arch/microblaze/configs/nommu_defconfig index 10b5172283d7..a29ebd4a9fcb 100644 --- a/arch/microblaze/configs/nommu_defconfig +++ b/arch/microblaze/configs/nommu_defconfig @@ -58,6 +58,7 @@ CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_UARTLITE=y CONFIG_SERIAL_UARTLITE_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_HW_RANDOM is not set CONFIG_XILINX_HWICAP=y CONFIG_I2C=y diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index c98ed95c0541..35b3ecaf25d5 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -2,6 +2,7 @@ generic-y += barrier.h generic-y += clkdev.h generic-y += cputime.h +generic-y += device.h generic-y += exec.h generic-y += hash.h generic-y += mcs_spinlock.h diff --git a/arch/microblaze/include/asm/device.h b/arch/microblaze/include/asm/device.h deleted file mode 100644 index 123b2fe72d01..000000000000 --- a/arch/microblaze/include/asm/device.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Arch specific extensions to struct device - * - * This file is subject to the terms and conditions of the GNU General Public - * License v2. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -#ifndef _ASM_MICROBLAZE_DEVICE_H -#define _ASM_MICROBLAZE_DEVICE_H - -struct device_node; - -struct dev_archdata { - /* DMA operations on that device */ - struct dma_map_ops *dma_ops; - void *dma_data; -}; - -struct pdev_archdata { - u64 dma_mask; -}; - -#endif /* _ASM_MICROBLAZE_DEVICE_H */ - - diff --git a/arch/microblaze/include/asm/dma-mapping.h b/arch/microblaze/include/asm/dma-mapping.h index 46460f1c49c4..ab353723076a 100644 --- a/arch/microblaze/include/asm/dma-mapping.h +++ b/arch/microblaze/include/asm/dma-mapping.h @@ -35,16 +35,6 @@ #define __dma_alloc_coherent(dev, gfp, size, handle) NULL #define __dma_free_coherent(size, addr) ((void)0) -static inline unsigned long device_to_mask(struct device *dev) -{ - if (dev->dma_mask && *dev->dma_mask) - return *dev->dma_mask; - /* Assume devices without mask can take 32 bit addresses */ - return 0xfffffffful; -} - -extern struct dma_map_ops *dma_ops; - /* * Available generic sets of operations */ @@ -52,20 +42,7 @@ extern struct dma_map_ops dma_direct_ops; static inline struct dma_map_ops *get_dma_ops(struct device *dev) { - /* We don't handle the NULL dev case for ISA for now. We could - * do it via an out of line call but it is not needed for now. The - * only ISA DMA device we support is the floppy and we have a hack - * in the floppy driver directly to get a device for us. - */ - if (unlikely(!dev) || !dev->archdata.dma_ops) - return NULL; - - return dev->archdata.dma_ops; -} - -static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) -{ - dev->archdata.dma_ops = ops; + return &dma_direct_ops; } static inline int dma_supported(struct device *dev, u64 mask) diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h index 1e4c3329f62e..433751b2a003 100644 --- a/arch/microblaze/include/asm/io.h +++ b/arch/microblaze/include/asm/io.h @@ -19,17 +19,14 @@ #ifndef CONFIG_PCI #define _IO_BASE 0 #define _ISA_MEM_BASE 0 -#define PCI_DRAM_OFFSET 0 #else #define _IO_BASE isa_io_base #define _ISA_MEM_BASE isa_mem_base -#define PCI_DRAM_OFFSET pci_dram_offset struct pci_dev; extern void pci_iounmap(struct pci_dev *dev, void __iomem *); #define pci_iounmap pci_iounmap extern unsigned long isa_io_base; -extern unsigned long pci_dram_offset; extern resource_size_t isa_mem_base; #endif diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h index 335524040fff..468aca8cec0d 100644 --- a/arch/microblaze/include/asm/pci.h +++ b/arch/microblaze/include/asm/pci.h @@ -45,14 +45,6 @@ struct pci_dev; #define pcibios_assign_all_busses() 0 #ifdef CONFIG_PCI -extern void set_pci_dma_ops(struct dma_map_ops *dma_ops); -extern struct dma_map_ops *get_pci_dma_ops(void); -#else /* CONFIG_PCI */ -#define set_pci_dma_ops(d) -#define get_pci_dma_ops() NULL -#endif - -#ifdef CONFIG_PCI static inline void pci_dma_burst_advice(struct pci_dev *pdev, enum pci_dma_burst_strategy *strat, unsigned long *strategy_parameter) diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c index da68d00fd087..4633c36c1b32 100644 --- a/arch/microblaze/kernel/dma.c +++ b/arch/microblaze/kernel/dma.c @@ -13,23 +13,6 @@ #include <linux/export.h> #include <linux/bug.h> -/* - * Generic direct DMA implementation - * - * This implementation supports a per-device offset that can be applied if - * the address at which memory is visible to devices is not 0. Platform code - * can set archdata.dma_data to an unsigned long holding the offset. By - * default the offset is PCI_DRAM_OFFSET. - */ - -static unsigned long get_dma_direct_offset(struct device *dev) -{ - if (likely(dev)) - return (unsigned long)dev->archdata.dma_data; - - return PCI_DRAM_OFFSET; /* FIXME Not sure if is correct */ -} - #define NOT_COHERENT_CACHE static void *dma_direct_alloc_coherent(struct device *dev, size_t size, @@ -51,7 +34,7 @@ static void *dma_direct_alloc_coherent(struct device *dev, size_t size, return NULL; ret = page_address(page); memset(ret, 0, size); - *dma_handle = virt_to_phys(ret) + get_dma_direct_offset(dev); + *dma_handle = virt_to_phys(ret); return ret; #endif @@ -77,7 +60,7 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, /* FIXME this part of code is untested */ for_each_sg(sgl, sg, nents, i) { - sg->dma_address = sg_phys(sg) + get_dma_direct_offset(dev); + sg->dma_address = sg_phys(sg); __dma_sync(page_to_phys(sg_page(sg)) + sg->offset, sg->length, direction); } @@ -85,12 +68,6 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, return nents; } -static void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) -{ -} - static int dma_direct_dma_supported(struct device *dev, u64 mask) { return 1; @@ -104,7 +81,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, struct dma_attrs *attrs) { __dma_sync(page_to_phys(page) + offset, size, direction); - return page_to_phys(page) + offset + get_dma_direct_offset(dev); + return page_to_phys(page) + offset; } static inline void dma_direct_unmap_page(struct device *dev, @@ -181,7 +158,6 @@ struct dma_map_ops dma_direct_ops = { .alloc = dma_direct_alloc_coherent, .free = dma_direct_free_coherent, .map_sg = dma_direct_map_sg, - .unmap_sg = dma_direct_unmap_sg, .dma_supported = dma_direct_dma_supported, .map_page = dma_direct_map_page, .unmap_page = dma_direct_unmap_page, diff --git a/arch/microblaze/kernel/head.S b/arch/microblaze/kernel/head.S index 17645b2e2f07..4655ff342c64 100644 --- a/arch/microblaze/kernel/head.S +++ b/arch/microblaze/kernel/head.S @@ -205,7 +205,7 @@ GT4: /* r11 contains the rest - will be either 1 or 4 */ GT16: /* TLB0 is 16MB */ addik r9, r0, 0x1000000 /* means TLB0 is 16MB */ TLB1: - /* must be used r2 because of substract if failed */ + /* must be used r2 because of subtract if failed */ addik r2, r11, -0x0400000 bgei r2, GT20 /* size is greater than 16MB */ /* size is >16MB and <20MB */ diff --git a/arch/microblaze/kernel/setup.c b/arch/microblaze/kernel/setup.c index 67cc4b282cc1..ab5b488e1fde 100644 --- a/arch/microblaze/kernel/setup.c +++ b/arch/microblaze/kernel/setup.c @@ -71,13 +71,9 @@ void __init setup_arch(char **cmdline_p) xilinx_pci_init(); -#ifdef CONFIG_VT -#if defined(CONFIG_XILINX_CONSOLE) - conswitchp = &xil_con; -#elif defined(CONFIG_DUMMY_CONSOLE) +#if defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif -#endif } #ifdef CONFIG_MTD_UCLINUX @@ -229,31 +225,3 @@ static int __init debugfs_tlb(void) device_initcall(debugfs_tlb); # endif #endif - -static int dflt_bus_notify(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - - /* We are only intereted in device addition */ - if (action != BUS_NOTIFY_ADD_DEVICE) - return 0; - - set_dma_ops(dev, &dma_direct_ops); - - return NOTIFY_DONE; -} - -static struct notifier_block dflt_plat_bus_notifier = { - .notifier_call = dflt_bus_notify, - .priority = INT_MAX, -}; - -static int __init setup_bus_notifier(void) -{ - bus_register_notifier(&platform_bus_type, &dflt_plat_bus_notifier); - - return 0; -} - -arch_initcall(setup_bus_notifier); diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c index a59de1bc1ce0..9037914f6985 100644 --- a/arch/microblaze/pci/pci-common.c +++ b/arch/microblaze/pci/pci-common.c @@ -47,24 +47,9 @@ static int global_phb_number; /* Global phb counter */ /* ISA Memory physical address */ resource_size_t isa_mem_base; -static struct dma_map_ops *pci_dma_ops = &dma_direct_ops; - unsigned long isa_io_base; -unsigned long pci_dram_offset; static int pci_bus_count; - -void set_pci_dma_ops(struct dma_map_ops *dma_ops) -{ - pci_dma_ops = dma_ops; -} - -struct dma_map_ops *get_pci_dma_ops(void) -{ - return pci_dma_ops; -} -EXPORT_SYMBOL(get_pci_dma_ops); - struct pci_controller *pcibios_alloc_controller(struct device_node *dev) { struct pci_controller *phb; @@ -866,10 +851,6 @@ void pcibios_setup_bus_devices(struct pci_bus *bus) */ set_dev_node(&dev->dev, pcibus_to_node(dev->bus)); - /* Hook up default DMA ops */ - set_dma_ops(&dev->dev, pci_dma_ops); - dev->dev.archdata.dma_data = (void *)PCI_DRAM_OFFSET; - /* Read default IRQs and fixup if necessary */ dev->irq = of_irq_parse_and_map_pci(dev, 0, 0); } diff --git a/arch/mips/dec/Makefile b/arch/mips/dec/Makefile index 3d5d2c56de8d..bd74e05c90b0 100644 --- a/arch/mips/dec/Makefile +++ b/arch/mips/dec/Makefile @@ -3,7 +3,7 @@ # obj-y := ecc-berr.o int-handler.o ioasic-irq.o kn01-berr.o \ - kn02-irq.o kn02xa-berr.o reset.o setup.o time.o + kn02-irq.o kn02xa-berr.o platform.o reset.o setup.o time.o obj-$(CONFIG_TC) += tc.o obj-$(CONFIG_CPU_HAS_WB) += wbflush.o diff --git a/arch/mips/dec/platform.c b/arch/mips/dec/platform.c new file mode 100644 index 000000000000..c7ac86af847a --- /dev/null +++ b/arch/mips/dec/platform.c @@ -0,0 +1,44 @@ +/* + * DEC platform devices. + * + * Copyright (c) 2014 Maciej W. Rozycki + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/ioport.h> +#include <linux/kernel.h> +#include <linux/mc146818rtc.h> +#include <linux/platform_device.h> + +static struct resource dec_rtc_resources[] = { + { + .name = "rtc", + .flags = IORESOURCE_MEM, + }, +}; + +static struct cmos_rtc_board_info dec_rtc_info = { + .flags = CMOS_RTC_FLAGS_NOFREQ, + .address_space = 64, +}; + +static struct platform_device dec_rtc_device = { + .name = "rtc_cmos", + .id = PLATFORM_DEVID_NONE, + .dev.platform_data = &dec_rtc_info, + .resource = dec_rtc_resources, + .num_resources = ARRAY_SIZE(dec_rtc_resources), +}; + +static int __init dec_add_devices(void) +{ + dec_rtc_resources[0].start = RTC_PORT(0); + dec_rtc_resources[0].end = RTC_PORT(0) + dec_kn_slot_size - 1; + return platform_device_register(&dec_rtc_device); +} + +device_initcall(dec_add_devices); diff --git a/arch/parisc/include/asm/ftrace.h b/arch/parisc/include/asm/ftrace.h index 72c0fafaa039..544ed8ef87eb 100644 --- a/arch/parisc/include/asm/ftrace.h +++ b/arch/parisc/include/asm/ftrace.h @@ -24,15 +24,7 @@ extern void return_to_handler(void); extern unsigned long return_address(unsigned int); -#define HAVE_ARCH_CALLER_ADDR - -#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -#define CALLER_ADDR1 return_address(1) -#define CALLER_ADDR2 return_address(2) -#define CALLER_ADDR3 return_address(3) -#define CALLER_ADDR4 return_address(4) -#define CALLER_ADDR5 return_address(5) -#define CALLER_ADDR6 return_address(6) +#define ftrace_return_address(n) return_address(n) #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ca1cd7459c4a..248ee7e5bebd 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -304,7 +304,7 @@ void notrace restore_interrupts(void) * being re-enabled and generally sanitized the lazy irq state, * and in the latter case it will leave with interrupts hard * disabled and marked as such, so the local_irq_enable() call - * in cpu_idle() will properly re-enable everything. + * in arch_cpu_idle() will properly re-enable everything. */ bool prep_irq_for_idle(void) { diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index 13e9966464c2..e79fb6ebaa42 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -40,15 +40,7 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) /* arch/sh/kernel/return_address.c */ extern void *return_address(unsigned int); -#define HAVE_ARCH_CALLER_ADDR - -#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -#define CALLER_ADDR1 ((unsigned long)return_address(1)) -#define CALLER_ADDR2 ((unsigned long)return_address(2)) -#define CALLER_ADDR3 ((unsigned long)return_address(3)) -#define CALLER_ADDR4 ((unsigned long)return_address(4)) -#define CALLER_ADDR5 ((unsigned long)return_address(5)) -#define CALLER_ADDR6 ((unsigned long)return_address(6)) +#define ftrace_return_address(n) return_address(n) #endif /* __ASSEMBLY__ */ diff --git a/arch/tile/kernel/proc.c b/arch/tile/kernel/proc.c index 681100c59fda..6829a9508649 100644 --- a/arch/tile/kernel/proc.c +++ b/arch/tile/kernel/proc.c @@ -113,7 +113,7 @@ arch_initcall(proc_tile_init); * Support /proc/sys/tile directory */ -static ctl_table unaligned_subtable[] = { +static struct ctl_table unaligned_subtable[] = { { .procname = "enabled", .data = &unaligned_fixup, @@ -138,7 +138,7 @@ static ctl_table unaligned_subtable[] = { {} }; -static ctl_table unaligned_table[] = { +static struct ctl_table unaligned_table[] = { { .procname = "unaligned_fixup", .mode = 0555, diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 4703a6c4b8e3..0331d765c2bb 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1087,8 +1087,7 @@ struct boot_params *make_boot_params(struct efi_config *c) hdr->type_of_loader = 0x21; /* Convert unicode cmdline to ascii */ - cmdline_ptr = efi_convert_cmdline_to_ascii(sys_table, image, - &options_size); + cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size); if (!cmdline_ptr) goto fail; hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 0d558ee899ae..2884e0c3e8a5 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -452,7 +452,7 @@ efi32_config: .global efi64_config efi64_config: .fill 11,8,0 - .quad efi_call6 + .quad efi_call .byte 1 #endif /* CONFIG_EFI_STUB */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 0ca9a5c362bc..84c223479e3c 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -375,8 +375,7 @@ xloadflags: # define XLF0 0 #endif -#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) && \ - !defined(CONFIG_EFI_MIXED) +#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_X86_64) /* kernel/boot_param/ramdisk could be loaded above 4g */ # define XLF1 XLF_CAN_BE_LOADED_ABOVE_4G #else diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 185fad49d86f..5d1e0075ac24 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -92,7 +92,7 @@ __clmul_gf128mul_ble: ret ENDPROC(__clmul_gf128mul_ble) -/* void clmul_ghash_mul(char *dst, const be128 *shash) */ +/* void clmul_ghash_mul(char *dst, const u128 *shash) */ ENTRY(clmul_ghash_mul) movups (%rdi), DATA movups (%rsi), SHASH @@ -106,7 +106,7 @@ ENDPROC(clmul_ghash_mul) /* * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - * const be128 *shash); + * const u128 *shash); */ ENTRY(clmul_ghash_update) cmp $16, %rdx diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index d785cf2c529c..88bb7ba8b175 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -25,17 +25,17 @@ #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 -void clmul_ghash_mul(char *dst, const be128 *shash); +void clmul_ghash_mul(char *dst, const u128 *shash); void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, - const be128 *shash); + const u128 *shash); struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; struct ghash_ctx { - be128 shash; + u128 shash; }; struct ghash_desc_ctx { @@ -68,11 +68,11 @@ static int ghash_setkey(struct crypto_shash *tfm, a = be64_to_cpu(x->a); b = be64_to_cpu(x->b); - ctx->shash.a = (__be64)((b << 1) | (a >> 63)); - ctx->shash.b = (__be64)((a << 1) | (b >> 63)); + ctx->shash.a = (b << 1) | (a >> 63); + ctx->shash.b = (a << 1) | (b >> 63); if (a >> 63) - ctx->shash.b ^= cpu_to_be64(0xc2); + ctx->shash.b ^= ((u64)0xc2) << 56; return 0; } diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 0869434eaf72..1eb5f6433ad8 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_EFI_H #define _ASM_X86_EFI_H +#include <asm/i387.h> /* * We map the EFI regions needed for runtime services non-contiguously, * with preserved alignment on virtual addresses starting from -4G down @@ -27,91 +28,58 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); -#define efi_call_phys0(f) efi_call_phys(f) -#define efi_call_phys1(f, a1) efi_call_phys(f, a1) -#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2) -#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3) -#define efi_call_phys4(f, a1, a2, a3, a4) \ - efi_call_phys(f, a1, a2, a3, a4) -#define efi_call_phys5(f, a1, a2, a3, a4, a5) \ - efi_call_phys(f, a1, a2, a3, a4, a5) -#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \ - efi_call_phys(f, a1, a2, a3, a4, a5, a6) /* * Wrap all the virtual calls in a way that forces the parameters on the stack. */ +/* Use this macro if your virtual returns a non-void value */ #define efi_call_virt(f, args...) \ - ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) - -#define efi_call_virt0(f) efi_call_virt(f) -#define efi_call_virt1(f, a1) efi_call_virt(f, a1) -#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2) -#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3) -#define efi_call_virt4(f, a1, a2, a3, a4) \ - efi_call_virt(f, a1, a2, a3, a4) -#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - efi_call_virt(f, a1, a2, a3, a4, a5) -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - efi_call_virt(f, a1, a2, a3, a4, a5, a6) +({ \ + efi_status_t __s; \ + kernel_fpu_begin(); \ + __s = ((efi_##f##_t __attribute__((regparm(0)))*) \ + efi.systab->runtime->f)(args); \ + kernel_fpu_end(); \ + __s; \ +}) + +/* Use this macro if your virtual call does not return any value */ +#define __efi_call_virt(f, args...) \ +({ \ + kernel_fpu_begin(); \ + ((efi_##f##_t __attribute__((regparm(0)))*) \ + efi.systab->runtime->f)(args); \ + kernel_fpu_end(); \ +}) #define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ -extern u64 efi_call0(void *fp); -extern u64 efi_call1(void *fp, u64 arg1); -extern u64 efi_call2(void *fp, u64 arg1, u64 arg2); -extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3); -extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4); -extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3, - u64 arg4, u64 arg5); -extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, - u64 arg4, u64 arg5, u64 arg6); - -#define efi_call_phys0(f) \ - efi_call0((f)) -#define efi_call_phys1(f, a1) \ - efi_call1((f), (u64)(a1)) -#define efi_call_phys2(f, a1, a2) \ - efi_call2((f), (u64)(a1), (u64)(a2)) -#define efi_call_phys3(f, a1, a2, a3) \ - efi_call3((f), (u64)(a1), (u64)(a2), (u64)(a3)) -#define efi_call_phys4(f, a1, a2, a3, a4) \ - efi_call4((f), (u64)(a1), (u64)(a2), (u64)(a3), \ - (u64)(a4)) -#define efi_call_phys5(f, a1, a2, a3, a4, a5) \ - efi_call5((f), (u64)(a1), (u64)(a2), (u64)(a3), \ - (u64)(a4), (u64)(a5)) -#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \ - efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3), \ - (u64)(a4), (u64)(a5), (u64)(a6)) - -#define _efi_call_virtX(x, f, ...) \ +#define EFI_LOADER_SIGNATURE "EL64" + +extern u64 asmlinkage efi_call(void *fp, ...); + +#define efi_call_phys(f, args...) efi_call((f), args) + +#define efi_call_virt(f, ...) \ ({ \ efi_status_t __s; \ \ efi_sync_low_kernel_mappings(); \ preempt_disable(); \ - __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \ + __kernel_fpu_begin(); \ + __s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \ + __kernel_fpu_end(); \ preempt_enable(); \ __s; \ }) -#define efi_call_virt0(f) \ - _efi_call_virtX(0, f) -#define efi_call_virt1(f, a1) \ - _efi_call_virtX(1, f, (u64)(a1)) -#define efi_call_virt2(f, a1, a2) \ - _efi_call_virtX(2, f, (u64)(a1), (u64)(a2)) -#define efi_call_virt3(f, a1, a2, a3) \ - _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3)) -#define efi_call_virt4(f, a1, a2, a3, a4) \ - _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4)) -#define efi_call_virt5(f, a1, a2, a3, a4, a5) \ - _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5)) -#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ - _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) +/* + * All X86_64 virt calls return non-void values. Thus, use non-void call for + * virt calls that would be void on X86_32. + */ +#define __efi_call_virt(f, args...) efi_call_virt(f, args) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index cea1c76d49bf..115e3689cd53 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -87,22 +87,22 @@ static inline int is_x32_frame(void) static __always_inline __pure bool use_eager_fpu(void) { - return static_cpu_has(X86_FEATURE_EAGER_FPU); + return static_cpu_has_safe(X86_FEATURE_EAGER_FPU); } static __always_inline __pure bool use_xsaveopt(void) { - return static_cpu_has(X86_FEATURE_XSAVEOPT); + return static_cpu_has_safe(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { - return static_cpu_has(X86_FEATURE_XSAVE); + return static_cpu_has_safe(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { - return static_cpu_has(X86_FEATURE_FXSR); + return static_cpu_has_safe(X86_FEATURE_FXSR); } static inline void fx_finit(struct i387_fxsave_struct *fx) @@ -293,7 +293,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. "m" is a random variable that should be in L1 */ - if (unlikely(static_cpu_has(X86_FEATURE_FXSAVE_LEAK))) { + if (unlikely(static_cpu_has_safe(X86_FEATURE_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 35e67a457182..31eab867e6d3 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -92,12 +92,6 @@ static inline int __gen_sigismember(sigset_t *set, int _sig) ? __const_sigismember((set), (sig)) \ : __gen_sigismember((set), (sig))) -static inline int sigfindinword(unsigned long word) -{ - asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc"); - return word; -} - struct pt_regs; #else /* __i386__ */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 283a76a9cc40..11ccfb0a63e7 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -17,6 +17,7 @@ #include <asm/idle.h> #include <asm/mce.h> #include <asm/hw_irq.h> +#include <asm/desc.h> #define CREATE_TRACE_POINTS #include <asm/trace/irq_vectors.h> @@ -334,10 +335,17 @@ int check_irq_vectors_for_cpu_disable(void) for_each_online_cpu(cpu) { if (cpu == this_cpu) continue; - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; - vector++) { - if (per_cpu(vector_irq, cpu)[vector] < 0) - count++; + /* + * We scan from FIRST_EXTERNAL_VECTOR to first system + * vector. If the vector is marked in the used vectors + * bitmap or an irq is assigned to it, we don't count + * it as available. + */ + for (vector = FIRST_EXTERNAL_VECTOR; + vector < first_system_vector; vector++) { + if (!test_bit(vector, used_vectors) && + per_cpu(vector_irq, cpu)[vector] < 0) + count++; } } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5d93ac1b72db..5492798930ef 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -866,9 +866,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* was set by cpu_init() */ cpumask_clear_cpu(cpu, cpu_initialized_mask); - - set_cpu_present(cpu, false); - per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID; } /* mark "stuck" area as not stuck */ @@ -928,7 +925,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) err = do_boot_cpu(apicid, cpu, tidle); if (err) { - pr_debug("do_boot_cpu failed %d\n", err); + pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); return -EIO; } diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 3781dd39e8bd..87fc96bcc13c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -110,7 +110,7 @@ static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt2(get_time, tm, tc); + status = efi_call_virt(get_time, tm, tc); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -121,7 +121,7 @@ static efi_status_t virt_efi_set_time(efi_time_t *tm) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt1(set_time, tm); + status = efi_call_virt(set_time, tm); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -134,8 +134,7 @@ static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt3(get_wakeup_time, - enabled, pending, tm); + status = efi_call_virt(get_wakeup_time, enabled, pending, tm); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -146,8 +145,7 @@ static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); - status = efi_call_virt2(set_wakeup_time, - enabled, tm); + status = efi_call_virt(set_wakeup_time, enabled, tm); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -158,17 +156,17 @@ static efi_status_t virt_efi_get_variable(efi_char16_t *name, unsigned long *data_size, void *data) { - return efi_call_virt5(get_variable, - name, vendor, attr, - data_size, data); + return efi_call_virt(get_variable, + name, vendor, attr, + data_size, data); } static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) { - return efi_call_virt3(get_next_variable, - name_size, name, vendor); + return efi_call_virt(get_next_variable, + name_size, name, vendor); } static efi_status_t virt_efi_set_variable(efi_char16_t *name, @@ -177,9 +175,9 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, unsigned long data_size, void *data) { - return efi_call_virt5(set_variable, - name, vendor, attr, - data_size, data); + return efi_call_virt(set_variable, + name, vendor, attr, + data_size, data); } static efi_status_t virt_efi_query_variable_info(u32 attr, @@ -190,13 +188,13 @@ static efi_status_t virt_efi_query_variable_info(u32 attr, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt4(query_variable_info, attr, storage_space, - remaining_space, max_variable_size); + return efi_call_virt(query_variable_info, attr, storage_space, + remaining_space, max_variable_size); } static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) { - return efi_call_virt1(get_next_high_mono_count, count); + return efi_call_virt(get_next_high_mono_count, count); } static void virt_efi_reset_system(int reset_type, @@ -204,8 +202,8 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { - efi_call_virt4(reset_system, reset_type, status, - data_size, data); + __efi_call_virt(reset_system, reset_type, status, + data_size, data); } static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, @@ -215,7 +213,7 @@ static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt3(update_capsule, capsules, count, sg_list); + return efi_call_virt(update_capsule, capsules, count, sg_list); } static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, @@ -226,8 +224,8 @@ static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt4(query_capsule_caps, capsules, count, max_size, - reset_type); + return efi_call_virt(query_capsule_caps, capsules, count, max_size, + reset_type); } static efi_status_t __init phys_efi_set_virtual_address_map( @@ -239,9 +237,9 @@ static efi_status_t __init phys_efi_set_virtual_address_map( efi_status_t status; efi_call_phys_prelog(); - status = efi_call_phys4(efi_phys.set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); + status = efi_call_phys(efi_phys.set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); efi_call_phys_epilog(); return status; } @@ -919,6 +917,9 @@ static void __init save_runtime_map(void) void *tmp, *p, *q = NULL; int count = 0; + if (efi_enabled(EFI_OLD_MEMMAP)) + return; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index e0984ef0374b..5fcda7272550 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -73,84 +73,7 @@ 2: .endm -ENTRY(efi_call0) - SAVE_XMM - subq $32, %rsp - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call0) - -ENTRY(efi_call1) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call1) - -ENTRY(efi_call2) - SAVE_XMM - subq $32, %rsp - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call2) - -ENTRY(efi_call3) - SAVE_XMM - subq $32, %rsp - mov %rcx, %r8 - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call3) - -ENTRY(efi_call4) - SAVE_XMM - subq $32, %rsp - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $32, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call4) - -ENTRY(efi_call5) - SAVE_XMM - subq $48, %rsp - mov %r9, 32(%rsp) - mov %r8, %r9 - mov %rcx, %r8 - mov %rsi, %rcx - SWITCH_PGT - call *%rdi - RESTORE_PGT - addq $48, %rsp - RESTORE_XMM - ret -ENDPROC(efi_call5) - -ENTRY(efi_call6) +ENTRY(efi_call) SAVE_XMM mov (%rsp), %rax mov 8(%rax), %rax @@ -166,7 +89,7 @@ ENTRY(efi_call6) addq $48, %rsp RESTORE_XMM ret -ENDPROC(efi_call6) +ENDPROC(efi_call) #ifdef CONFIG_EFI_MIXED diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 766612137a62..1584cbed0dce 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -39,7 +39,7 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - ret = efi_call6((void *)__va(tab->function), (u64)which, + ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5); return ret; } diff --git a/arch/xtensa/include/asm/ftrace.h b/arch/xtensa/include/asm/ftrace.h index 736b9d214d80..6c6d9a9f185f 100644 --- a/arch/xtensa/include/asm/ftrace.h +++ b/arch/xtensa/include/asm/ftrace.h @@ -12,24 +12,18 @@ #include <asm/processor.h> -#define HAVE_ARCH_CALLER_ADDR #ifndef __ASSEMBLY__ -#define CALLER_ADDR0 ({ unsigned long a0, a1; \ +#define ftrace_return_address0 ({ unsigned long a0, a1; \ __asm__ __volatile__ ( \ "mov %0, a0\n" \ "mov %1, a1\n" \ : "=r"(a0), "=r"(a1)); \ MAKE_PC_FROM_RA(a0, a1); }) + #ifdef CONFIG_FRAME_POINTER extern unsigned long return_address(unsigned level); -#define CALLER_ADDR1 return_address(1) -#define CALLER_ADDR2 return_address(2) -#define CALLER_ADDR3 return_address(3) -#else /* CONFIG_FRAME_POINTER */ -#define CALLER_ADDR1 (0) -#define CALLER_ADDR2 (0) -#define CALLER_ADDR3 (0) -#endif /* CONFIG_FRAME_POINTER */ +#define ftrace_return_address(n) return_address(n) +#endif #endif /* __ASSEMBLY__ */ #ifdef CONFIG_FUNCTION_TRACER |