Diffstat (limited to 'lib')
-rw-r--r--  lib/aarch32/misc_helpers.S              | 115
-rw-r--r--  lib/aarch64/misc_helpers.S              | 369
-rw-r--r--  lib/cpus/aarch64/denver.S               |  42
-rw-r--r--  lib/el3_runtime/aarch32/context_mgmt.c  |   5
-rw-r--r--  lib/el3_runtime/aarch64/context_mgmt.c  |   5
-rw-r--r--  lib/locks/exclusive/aarch64/spinlock.S  |  70
-rw-r--r--  lib/psci/psci_common.c                  |  19
-rw-r--r--  lib/psci/psci_main.c                    |  17
-rw-r--r--  lib/psci/psci_off.c                     |  10
-rw-r--r--  lib/psci/psci_private.h                 |  15
-rw-r--r--  lib/psci/psci_stat.c                    |  65
-rw-r--r--  lib/psci/psci_suspend.c                 |  19
12 files changed, 579 insertions, 172 deletions
diff --git a/lib/aarch32/misc_helpers.S b/lib/aarch32/misc_helpers.S index bf4084a8..dc847995 100644 --- a/lib/aarch32/misc_helpers.S +++ b/lib/aarch32/misc_helpers.S @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -34,6 +34,7 @@ .globl smc .globl zeromem + .globl zero_normalmem .globl memcpy4 .globl disable_mmu_icache_secure .globl disable_mmu_secure @@ -50,30 +51,108 @@ func smc endfunc smc /* ----------------------------------------------------------------------- - * void zeromem(void *mem, unsigned int length); + * void zeromem(void *mem, unsigned int length) + * + * Initialise a region in normal memory to 0. This functions complies with the + * AAPCS and can be called from C code. * - * Initialise a memory region to 0. - * The memory address and length must be 4-byte aligned. * ----------------------------------------------------------------------- */ func zeromem -#if ASM_ASSERTION - tst r0, #0x3 - ASM_ASSERT(eq) - tst r1, #0x3 - ASM_ASSERT(eq) -#endif - add r2, r0, r1 - mov r1, #0 -z_loop: - cmp r2, r0 - beq z_end - str r1, [r0], #4 - b z_loop -z_end: + /* + * Readable names for registers + * + * Registers r0, r1 and r2 are also set by zeromem which + * branches into the fallback path directly, so cursor, length and + * stop_address should not be retargeted to other registers. + */ + cursor .req r0 /* Start address and then current address */ + length .req r1 /* Length in bytes of the region to zero out */ + /* + * Reusing the r1 register as length is only used at the beginning of + * the function. + */ + stop_address .req r1 /* Address past the last zeroed byte */ + zeroreg1 .req r2 /* Source register filled with 0 */ + zeroreg2 .req r3 /* Source register filled with 0 */ + tmp .req r12 /* Temporary scratch register */ + + mov zeroreg1, #0 + + /* stop_address is the address past the last to zero */ + add stop_address, cursor, length + + /* + * Length cannot be used anymore as it shares the same register with + * stop_address. + */ + .unreq length + + /* + * If the start address is already aligned to 8 bytes, skip this loop. + */ + tst cursor, #(8-1) + beq .Lzeromem_8bytes_aligned + + /* Calculate the next address aligned to 8 bytes */ + orr tmp, cursor, #(8-1) + adds tmp, tmp, #1 + /* If it overflows, fallback to byte per byte zeroing */ + beq .Lzeromem_1byte_aligned + /* If the next aligned address is after the stop address, fall back */ + cmp tmp, stop_address + bhs .Lzeromem_1byte_aligned + + /* zero byte per byte */ +1: + strb zeroreg1, [cursor], #1 + cmp cursor, tmp + bne 1b + + /* zero 8 bytes at a time */ +.Lzeromem_8bytes_aligned: + + /* Calculate the last 8 bytes aligned address. */ + bic tmp, stop_address, #(8-1) + + cmp cursor, tmp + bhs 2f + + mov zeroreg2, #0 +1: + stmia cursor!, {zeroreg1, zeroreg2} + cmp cursor, tmp + blo 1b +2: + + /* zero byte per byte */ +.Lzeromem_1byte_aligned: + cmp cursor, stop_address + beq 2f +1: + strb zeroreg1, [cursor], #1 + cmp cursor, stop_address + bne 1b +2: bx lr + + .unreq cursor + /* + * length is already unreq'ed to reuse the register for another + * variable. 
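The reworked AArch32 zeromem above drops the old 4-byte alignment requirement by splitting the work into a byte-wise head up to an 8-byte boundary, an 8-byte body (one stmia of two zeroed registers per iteration) and a byte-wise tail, with an overflow check guarding the alignment arithmetic. A rough C equivalent of that strategy is sketched below; names are illustrative, the overflow guard is omitted, and this is only the shape of the algorithm, not the code that ships.

#include <stddef.h>
#include <stdint.h>

/* Sketch only: mirrors the head/body/tail structure of the assembly above. */
void zeromem_sketch(void *mem, unsigned int length)
{
	unsigned char *cursor = mem;
	unsigned char *stop = cursor + length;

	/* Head: zero byte by byte until the cursor is 8-byte aligned */
	while ((((uintptr_t)cursor & 7u) != 0u) && (cursor < stop))
		*cursor++ = 0u;

	/* Body: zero 8 bytes per iteration, as the stmia of two zeroed
	 * registers does above (plain stores here, aliasing niceties ignored) */
	while ((size_t)(stop - cursor) >= sizeof(uint64_t)) {
		*(uint64_t *)cursor = 0u;
		cursor += sizeof(uint64_t);
	}

	/* Tail: finish the remaining bytes one at a time */
	while (cursor < stop)
		*cursor++ = 0u;
}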
+ */ + .unreq stop_address + .unreq zeroreg1 + .unreq zeroreg2 + .unreq tmp endfunc zeromem +/* + * AArch32 does not have special ways of zeroing normal memory as AArch64 does + * using the DC ZVA instruction, so we just alias zero_normalmem to zeromem. + */ +.equ zero_normalmem, zeromem + /* -------------------------------------------------------------------------- * void memcpy4(void *dest, const void *src, unsigned int length) * diff --git a/lib/aarch64/misc_helpers.S b/lib/aarch64/misc_helpers.S index 574146f6..84265e0b 100644 --- a/lib/aarch64/misc_helpers.S +++ b/lib/aarch64/misc_helpers.S @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,6 +37,8 @@ .globl eret .globl smc + .globl zero_normalmem + .globl zeromem .globl zeromem16 .globl memcpy16 @@ -80,31 +82,358 @@ endfunc smc * * Initialise a memory region to 0. * The memory address must be 16-byte aligned. + * NOTE: This function is deprecated and zeromem should be used instead. * ----------------------------------------------------------------------- */ -func zeromem16 +.equ zeromem16, zeromem + +/* ----------------------------------------------------------------------- + * void zero_normalmem(void *mem, unsigned int length); + * + * Initialise a region in normal memory to 0. This functions complies with the + * AAPCS and can be called from C code. + * + * NOTE: MMU must be enabled when using this function as it can only operate on + * normal memory. It is intended to be mainly used from C code when MMU + * is usually enabled. + * ----------------------------------------------------------------------- + */ +.equ zero_normalmem, zeromem_dczva + +/* ----------------------------------------------------------------------- + * void zeromem(void *mem, unsigned int length); + * + * Initialise a region of device memory to 0. This functions complies with the + * AAPCS and can be called from C code. + * + * NOTE: When data caches and MMU are enabled, zero_normalmem can usually be + * used instead for faster zeroing. + * + * ----------------------------------------------------------------------- + */ +func zeromem + /* x2 is the address past the last zeroed address */ + add x2, x0, x1 + /* + * Uses the fallback path that does not use DC ZVA instruction and + * therefore does not need enabled MMU + */ + b .Lzeromem_dczva_fallback_entry +endfunc zeromem + +/* ----------------------------------------------------------------------- + * void zeromem_dczva(void *mem, unsigned int length); + * + * Fill a region of normal memory of size "length" in bytes with null bytes. + * MMU must be enabled and the memory be of + * normal type. This is because this function internally uses the DC ZVA + * instruction, which generates an Alignment fault if used on any type of + * Device memory (see section D3.4.9 of the ARMv8 ARM, issue k). When the MMU + * is disabled, all memory behaves like Device-nGnRnE memory (see section + * D4.2.8), hence the requirement on the MMU being enabled. + * NOTE: The code assumes that the block size as defined in DCZID_EL0 + * register is at least 16 bytes. 
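For callers, the split introduced by the aliases above is straightforward: zero_normalmem is for Normal memory once the MMU (and hence data caching) is on, while zeromem stays safe for Device memory or for code running with the MMU off, which is why the context-management hunks further down swap memset for zeromem. A minimal, illustrative caller sketch follows; the buffer names are hypothetical, and utils.h is the header the C hunks below include for these prototypes.

#include <utils.h>   /* zeromem()/zero_normalmem() prototypes in this tree */

static unsigned char early_buf[256];     /* hypothetical: touched with MMU off */
static unsigned char runtime_buf[1024];  /* hypothetical: Normal memory, MMU on */

void clear_buffers_sketch(void)
{
	/* MMU and caches enabled, Normal memory: may use the DC ZVA fast path */
	zero_normalmem(runtime_buf, sizeof(runtime_buf));

	/* Early boot or device-type mappings: stick to the plain store loop */
	zeromem(early_buf, sizeof(early_buf));
}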
+ * + * ----------------------------------------------------------------------- + */ +func zeromem_dczva + + /* + * The function consists of a series of loops that zero memory one byte + * at a time, 16 bytes at a time or using the DC ZVA instruction to + * zero aligned block of bytes, which is assumed to be more than 16. + * In the case where the DC ZVA instruction cannot be used or if the + * first 16 bytes loop would overflow, there is fallback path that does + * not use DC ZVA. + * Note: The fallback path is also used by the zeromem function that + * branches to it directly. + * + * +---------+ zeromem_dczva + * | entry | + * +----+----+ + * | + * v + * +---------+ + * | checks |>o-------+ (If any check fails, fallback) + * +----+----+ | + * | |---------------+ + * v | Fallback path | + * +------+------+ |---------------+ + * | 1 byte loop | | + * +------+------+ .Lzeromem_dczva_initial_1byte_aligned_end + * | | + * v | + * +-------+-------+ | + * | 16 bytes loop | | + * +-------+-------+ | + * | | + * v | + * +------+------+ .Lzeromem_dczva_blocksize_aligned + * | DC ZVA loop | | + * +------+------+ | + * +--------+ | | + * | | | | + * | v v | + * | +-------+-------+ .Lzeromem_dczva_final_16bytes_aligned + * | | 16 bytes loop | | + * | +-------+-------+ | + * | | | + * | v | + * | +------+------+ .Lzeromem_dczva_final_1byte_aligned + * | | 1 byte loop | | + * | +-------------+ | + * | | | + * | v | + * | +---+--+ | + * | | exit | | + * | +------+ | + * | | + * | +--------------+ +------------------+ zeromem + * | | +----------------| zeromem function | + * | | | +------------------+ + * | v v + * | +-------------+ .Lzeromem_dczva_fallback_entry + * | | 1 byte loop | + * | +------+------+ + * | | + * +-----------+ + */ + + /* + * Readable names for registers + * + * Registers x0, x1 and x2 are also set by zeromem which + * branches into the fallback path directly, so cursor, length and + * stop_address should not be retargeted to other registers. + */ + cursor .req x0 /* Start address and then current address */ + length .req x1 /* Length in bytes of the region to zero out */ + /* Reusing x1 as length is never used after block_mask is set */ + block_mask .req x1 /* Bitmask of the block size read in DCZID_EL0 */ + stop_address .req x2 /* Address past the last zeroed byte */ + block_size .req x3 /* Size of a block in bytes as read in DCZID_EL0 */ + tmp1 .req x4 + tmp2 .req x5 + #if ASM_ASSERTION - tst x0, #0xf - ASM_ASSERT(eq) + /* + * Check for M bit (MMU enabled) of the current SCTLR_EL(1|3) + * register value and panic if the MMU is disabled. 
+ */ +#if defined(IMAGE_BL1) || defined(IMAGE_BL31) + mrs tmp1, sctlr_el3 +#else + mrs tmp1, sctlr_el1 #endif - add x2, x0, x1 -/* zero 16 bytes at a time */ -z_loop16: - sub x3, x2, x0 - cmp x3, #16 - b.lt z_loop1 - stp xzr, xzr, [x0], #16 - b z_loop16 -/* zero byte per byte */ -z_loop1: - cmp x0, x2 - b.eq z_end - strb wzr, [x0], #1 - b z_loop1 -z_end: + + tst tmp1, #SCTLR_M_BIT + ASM_ASSERT(ne) +#endif /* ASM_ASSERTION */ + + /* stop_address is the address past the last to zero */ + add stop_address, cursor, length + + /* + * Get block_size = (log2(<block size>) >> 2) (see encoding of + * dczid_el0 reg) + */ + mrs block_size, dczid_el0 + + /* + * Select the 4 lowest bits and convert the extracted log2(<block size + * in words>) to <block size in bytes> + */ + ubfx block_size, block_size, #0, #4 + mov tmp2, #(1 << 2) + lsl block_size, tmp2, block_size + +#if ASM_ASSERTION + /* + * Assumes block size is at least 16 bytes to avoid manual realignment + * of the cursor at the end of the DCZVA loop. + */ + cmp block_size, #16 + ASM_ASSERT(hs) +#endif + /* + * Not worth doing all the setup for a region less than a block and + * protects against zeroing a whole block when the area to zero is + * smaller than that. Also, as it is assumed that the block size is at + * least 16 bytes, this also protects the initial aligning loops from + * trying to zero 16 bytes when length is less than 16. + */ + cmp length, block_size + b.lo .Lzeromem_dczva_fallback_entry + + /* + * Calculate the bitmask of the block alignment. It will never + * underflow as the block size is between 4 bytes and 2kB. + * block_mask = block_size - 1 + */ + sub block_mask, block_size, #1 + + /* + * length alias should not be used after this point unless it is + * defined as a register other than block_mask's. + */ + .unreq length + + /* + * If the start address is already aligned to zero block size, go + * straight to the cache zeroing loop. This is safe because at this + * point, the length cannot be smaller than a block size. + */ + tst cursor, block_mask + b.eq .Lzeromem_dczva_blocksize_aligned + + /* + * Calculate the first block-size-aligned address. It is assumed that + * the zero block size is at least 16 bytes. This address is the last + * address of this initial loop. + */ + orr tmp1, cursor, block_mask + add tmp1, tmp1, #1 + + /* + * If the addition overflows, skip the cache zeroing loops. This is + * quite unlikely however. + */ + cbz tmp1, .Lzeromem_dczva_fallback_entry + + /* + * If the first block-size-aligned address is past the last address, + * fallback to the simpler code. + */ + cmp tmp1, stop_address + b.hi .Lzeromem_dczva_fallback_entry + + /* + * If the start address is already aligned to 16 bytes, skip this loop. + * It is safe to do this because tmp1 (the stop address of the initial + * 16 bytes loop) will never be greater than the final stop address. + */ + tst cursor, #0xf + b.eq .Lzeromem_dczva_initial_1byte_aligned_end + + /* Calculate the next address aligned to 16 bytes */ + orr tmp2, cursor, #0xf + add tmp2, tmp2, #1 + /* If it overflows, fallback to the simple path (unlikely) */ + cbz tmp2, .Lzeromem_dczva_fallback_entry + /* + * Next aligned address cannot be after the stop address because the + * length cannot be smaller than 16 at this point. 
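The block size that DC ZVA clears is discovered at run time: DCZID_EL0[3:0] holds log2 of the block size in words, so the byte size is 4 << DCZID_EL0[3:0] and the alignment mask is one less than that, which is exactly what the ubfx/lsl/sub sequence above computes before asserting the block is at least 16 bytes. Below is a hedged C-with-inline-assembly sketch of the same decode and of the block-zeroing loop used further down (GCC/Clang syntax); it ignores the DCZID_EL0.DZP bit and assumes the caller already aligned both ends to the block size.

#include <stdint.h>

static inline uint64_t read_dczid_el0_sketch(void)
{
	uint64_t val;

	__asm__ volatile("mrs %0, dczid_el0" : "=r"(val));
	return val;
}

/* Zero [cursor, stop); both are assumed already aligned to the ZVA block. */
static void zero_blocks_sketch(uintptr_t cursor, uintptr_t stop)
{
	/* Block size in bytes: 4 << DCZID_EL0[3:0] (log2 of the size in words) */
	uint64_t block_size = 4u << (read_dczid_el0_sketch() & 0xfu);

	while (cursor < stop) {
		/* DC ZVA zeroes the whole block containing this address */
		__asm__ volatile("dc zva, %0" : : "r"(cursor) : "memory");
		cursor += block_size;
	}
}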
+ */ + + /* First loop: zero byte per byte */ +1: + strb wzr, [cursor], #1 + cmp cursor, tmp2 + b.ne 1b +.Lzeromem_dczva_initial_1byte_aligned_end: + + /* + * Second loop: we need to zero 16 bytes at a time from cursor to tmp1 + * before being able to use the code that deals with block-size-aligned + * addresses. + */ + cmp cursor, tmp1 + b.hs 2f +1: + stp xzr, xzr, [cursor], #16 + cmp cursor, tmp1 + b.lo 1b +2: + + /* + * Third loop: zero a block at a time using DC ZVA cache block zeroing + * instruction. + */ +.Lzeromem_dczva_blocksize_aligned: + /* + * Calculate the last block-size-aligned address. If the result equals + * to the start address, the loop will exit immediately. + */ + bic tmp1, stop_address, block_mask + + cmp cursor, tmp1 + b.hs 2f +1: + /* Zero the block containing the cursor */ + dc zva, cursor + /* Increment the cursor by the size of a block */ + add cursor, cursor, block_size + cmp cursor, tmp1 + b.lo 1b +2: + + /* + * Fourth loop: zero 16 bytes at a time and then byte per byte the + * remaining area + */ +.Lzeromem_dczva_final_16bytes_aligned: + /* + * Calculate the last 16 bytes aligned address. It is assumed that the + * block size will never be smaller than 16 bytes so that the current + * cursor is aligned to at least 16 bytes boundary. + */ + bic tmp1, stop_address, #15 + + cmp cursor, tmp1 + b.hs 2f +1: + stp xzr, xzr, [cursor], #16 + cmp cursor, tmp1 + b.lo 1b +2: + + /* Fifth and final loop: zero byte per byte */ +.Lzeromem_dczva_final_1byte_aligned: + cmp cursor, stop_address + b.eq 2f +1: + strb wzr, [cursor], #1 + cmp cursor, stop_address + b.ne 1b +2: ret -endfunc zeromem16 + /* Fallback for unaligned start addresses */ +.Lzeromem_dczva_fallback_entry: + /* + * If the start address is already aligned to 16 bytes, skip this loop. + */ + tst cursor, #0xf + b.eq .Lzeromem_dczva_final_16bytes_aligned + + /* Calculate the next address aligned to 16 bytes */ + orr tmp1, cursor, #15 + add tmp1, tmp1, #1 + /* If it overflows, fallback to byte per byte zeroing */ + cbz tmp1, .Lzeromem_dczva_final_1byte_aligned + /* If the next aligned address is after the stop address, fall back */ + cmp tmp1, stop_address + b.hs .Lzeromem_dczva_final_1byte_aligned + + /* Fallback entry loop: zero byte per byte */ +1: + strb wzr, [cursor], #1 + cmp cursor, tmp1 + b.ne 1b + + b .Lzeromem_dczva_final_16bytes_aligned + + .unreq cursor + /* + * length is already unreq'ed to reuse the register for another + * variable. + */ + .unreq stop_address + .unreq block_size + .unreq block_mask + .unreq tmp1 + .unreq tmp2 +endfunc zeromem_dczva /* -------------------------------------------------------------------------- * void memcpy16(void *dest, const void *src, unsigned int length) diff --git a/lib/cpus/aarch64/denver.S b/lib/cpus/aarch64/denver.S index 0b61440d..3e238a1c 100644 --- a/lib/cpus/aarch64/denver.S +++ b/lib/cpus/aarch64/denver.S @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,6 +35,8 @@ #include <cpu_macros.S> #include <plat_macros.S> + .global denver_disable_dco + /* --------------------------------------------- * Disable debug interfaces * --------------------------------------------- @@ -111,22 +113,6 @@ func denver_core_pwr_dwn mov x19, x30 - /* ---------------------------------------------------- - * We enter the 'core power gated with ARM state not - * retained' power state during CPU power down. We let - * DCO know that we expect to enter this power state - * by writing to the ACTLR_EL1 register. - * ---------------------------------------------------- - */ - mov x0, #DENVER_CPU_STATE_POWER_DOWN - msr actlr_el1, x0 - - /* --------------------------------------------- - * Force DCO to be quiescent - * --------------------------------------------- - */ - bl denver_disable_dco - /* --------------------------------------------- * Force the debug interfaces to be quiescent * --------------------------------------------- @@ -163,7 +149,27 @@ func denver_cpu_reg_dump ret endfunc denver_cpu_reg_dump -declare_cpu_ops denver, DENVER_1_0_MIDR, \ +declare_cpu_ops denver, DENVER_MIDR_PN0, \ + denver_reset_func, \ + denver_core_pwr_dwn, \ + denver_cluster_pwr_dwn + +declare_cpu_ops denver, DENVER_MIDR_PN1, \ + denver_reset_func, \ + denver_core_pwr_dwn, \ + denver_cluster_pwr_dwn + +declare_cpu_ops denver, DENVER_MIDR_PN2, \ + denver_reset_func, \ + denver_core_pwr_dwn, \ + denver_cluster_pwr_dwn + +declare_cpu_ops denver, DENVER_MIDR_PN3, \ + denver_reset_func, \ + denver_core_pwr_dwn, \ + denver_cluster_pwr_dwn + +declare_cpu_ops denver, DENVER_MIDR_PN4, \ denver_reset_func, \ denver_core_pwr_dwn, \ denver_cluster_pwr_dwn diff --git a/lib/el3_runtime/aarch32/context_mgmt.c b/lib/el3_runtime/aarch32/context_mgmt.c index 51b77595..df22eaf5 100644 --- a/lib/el3_runtime/aarch32/context_mgmt.c +++ b/lib/el3_runtime/aarch32/context_mgmt.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,7 @@ #include <platform_def.h> #include <smcc_helpers.h> #include <string.h> +#include <utils.h> /******************************************************************************* * Context management library initialisation routine. This library is used by @@ -84,7 +85,7 @@ static void cm_init_context_common(cpu_context_t *ctx, const entry_point_info_t security_state = GET_SECURITY_STATE(ep->h.attr); /* Clear any residual register values from the context */ - memset(ctx, 0, sizeof(*ctx)); + zeromem(ctx, sizeof(*ctx)); reg_ctx = get_regs_ctx(ctx); diff --git a/lib/el3_runtime/aarch64/context_mgmt.c b/lib/el3_runtime/aarch64/context_mgmt.c index e26950df..5cce8793 100644 --- a/lib/el3_runtime/aarch64/context_mgmt.c +++ b/lib/el3_runtime/aarch64/context_mgmt.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -39,6 +39,7 @@ #include <platform_def.h> #include <smcc_helpers.h> #include <string.h> +#include <utils.h> /******************************************************************************* @@ -91,7 +92,7 @@ static void cm_init_context_common(cpu_context_t *ctx, const entry_point_info_t security_state = GET_SECURITY_STATE(ep->h.attr); /* Clear any residual register values from the context */ - memset(ctx, 0, sizeof(*ctx)); + zeromem(ctx, sizeof(*ctx)); /* * Base the context SCR on the current value, adjust for entry point diff --git a/lib/locks/exclusive/aarch64/spinlock.S b/lib/locks/exclusive/aarch64/spinlock.S index 1ca59123..bdc9ea0f 100644 --- a/lib/locks/exclusive/aarch64/spinlock.S +++ b/lib/locks/exclusive/aarch64/spinlock.S @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,7 +33,66 @@ .globl spin_lock .globl spin_unlock +#if (ARM_ARCH_MAJOR > 8) || ((ARM_ARCH_MAJOR == 8) && (ARM_ARCH_MINOR >= 1)) +/* + * When compiled for ARMv8.1 or later, choose spin locks based on Compare and + * Swap instruction. + */ +# define USE_CAS 1 + +/* + * Lock contenders using CAS, upon failing to acquire the lock, wait with the + * monitor in open state. Therefore, a normal store upon unlocking won't + * generate an SEV. Use explicit SEV instruction with CAS unlock. + */ +# define COND_SEV() sev + +#else + +# define USE_CAS 0 + +/* + * Lock contenders using exclusive pairs, upon failing to acquire the lock, wait + * with the monitor in exclusive state. A normal store upon unlocking will + * implicitly generate an envent; so, no explicit SEV with unlock is required. + */ +# define COND_SEV() + +#endif + +#if USE_CAS + + .arch armv8.1-a + +/* + * Acquire lock using Compare and Swap instruction. + * + * Compare for 0 with acquire semantics, and swap 1. Wait until CAS returns + * 0. + * + * void spin_lock(spinlock_t *lock); + */ +func spin_lock + mov w2, #1 + sevl +1: + wfe + mov w1, wzr + casa w1, w2, [x0] + cbnz w1, 1b + ret +endfunc spin_lock + + .arch armv8-a + +#else /* !USE_CAS */ + +/* + * Acquire lock using load-/store-exclusive instruction pair. + * + * void spin_lock(spinlock_t *lock); + */ func spin_lock mov w2, #1 sevl @@ -45,8 +104,17 @@ l2: ldaxr w1, [x0] ret endfunc spin_lock +#endif /* USE_CAS */ +/* + * Release lock previously acquired by spin_lock. + * + * Unconditionally write 0, and conditionally generate an event. + * + * void spin_unlock(spinlock_t *lock); + */ func spin_unlock stlr wzr, [x0] + COND_SEV() ret endfunc spin_unlock diff --git a/lib/psci/psci_common.c b/lib/psci/psci_common.c index 68cdd6eb..9fdce498 100644 --- a/lib/psci/psci_common.c +++ b/lib/psci/psci_common.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. 
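The ARMv8.1 path above acquires the lock with CASA (compare 0 with acquire semantics, swap in 1) inside a WFE loop primed by SEVL; because CAS contenders wait with the exclusive monitor open, the unlock adds an explicit SEV via COND_SEV(). The same structure expressed with compiler atomics is sketched below; it is illustrative only and is not the firmware's spinlock API.

#include <stdint.h>

typedef struct { volatile uint32_t lock; } spinlock_sketch_t;

static inline void wfe(void)  { __asm__ volatile("wfe" ::: "memory"); }
static inline void sevl(void) { __asm__ volatile("sevl" ::: "memory"); }
static inline void sev(void)  { __asm__ volatile("sev" ::: "memory"); }

/* Mirrors the CAS path above: compare-and-swap 0 -> 1 with acquire
 * semantics, waiting in WFE between attempts. */
static void spin_lock_sketch(spinlock_sketch_t *l)
{
	uint32_t expected;

	sevl();   /* prime the first WFE so it falls straight through */
	do {
		wfe();
		expected = 0u;
	} while (!__atomic_compare_exchange_n(&l->lock, &expected, 1u, 0,
					      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED));
}

/* Release with a store-release; CAS-based waiters park with the monitor
 * open, so an explicit SEV is needed to wake them. */
static void spin_unlock_sketch(spinlock_sketch_t *l)
{
	__atomic_store_n(&l->lock, 0u, __ATOMIC_RELEASE);
	sev();
}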
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,6 +37,7 @@ #include <debug.h> #include <platform.h> #include <string.h> +#include <utils.h> #include "psci_private.h" /* @@ -622,7 +623,7 @@ static int psci_get_ns_ep_info(entry_point_info_t *ep, SET_PARAM_HEAD(ep, PARAM_EP, VERSION_1, ep_attr); ep->pc = entrypoint; - memset(&ep->args, 0, sizeof(ep->args)); + zeromem(&ep->args, sizeof(ep->args)); ep->args.arg0 = context_id; mode = scr & SCR_HCE_BIT ? MODE32_hyp : MODE32_svc; @@ -659,7 +660,7 @@ static int psci_get_ns_ep_info(entry_point_info_t *ep, SET_PARAM_HEAD(ep, PARAM_EP, VERSION_1, ep_attr); ep->pc = entrypoint; - memset(&ep->args, 0, sizeof(ep->args)); + zeromem(&ep->args, sizeof(ep->args)); ep->args.arg0 = context_id; /* @@ -760,13 +761,7 @@ void psci_warmboot_entrypoint(void) cpu_idx); #if ENABLE_PSCI_STAT - /* - * Capture power up time-stamp. - * No cache maintenance is required as caches are off - * and writes are direct to the main memory. - */ - PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_EXIT_LOW_PWR, - PMF_NO_CACHE_MAINT); + plat_psci_stat_accounting_stop(&state_info); #endif psci_get_target_local_pwr_states(end_pwrlvl, &state_info); @@ -801,7 +796,7 @@ void psci_warmboot_entrypoint(void) * Since caches are now enabled, it's necessary to do cache * maintenance before reading that same data. */ - psci_stats_update_pwr_up(end_pwrlvl, &state_info, PMF_CACHE_MAINT); + psci_stats_update_pwr_up(end_pwrlvl, &state_info); #endif /* @@ -957,7 +952,7 @@ unsigned int psci_get_max_phys_off_afflvl(void) { psci_power_state_t state_info; - memset(&state_info, 0, sizeof(state_info)); + zeromem(&state_info, sizeof(state_info)); psci_get_target_local_pwr_states(PLAT_MAX_PWR_LVL, &state_info); return psci_find_target_suspend_lvl(&state_info); diff --git a/lib/psci/psci_main.c b/lib/psci/psci_main.c index 0a3a60ac..5e166b52 100644 --- a/lib/psci/psci_main.c +++ b/lib/psci/psci_main.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -117,13 +117,7 @@ int psci_cpu_suspend(unsigned int power_state, psci_set_cpu_local_state(cpu_pd_state); #if ENABLE_PSCI_STAT - /* - * Capture time-stamp before CPU standby - * No cache maintenance is needed as caches - * are ON through out the CPU standby operation. - */ - PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR, - PMF_NO_CACHE_MAINT); + plat_psci_stat_accounting_start(&state_info); #endif #if ENABLE_RUNTIME_INSTRUMENTATION @@ -144,13 +138,10 @@ int psci_cpu_suspend(unsigned int power_state, #endif #if ENABLE_PSCI_STAT - /* Capture time-stamp after CPU standby */ - PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_EXIT_LOW_PWR, - PMF_NO_CACHE_MAINT); + plat_psci_stat_accounting_stop(&state_info); /* Update PSCI stats */ - psci_stats_update_pwr_up(PSCI_CPU_PWR_LVL, &state_info, - PMF_NO_CACHE_MAINT); + psci_stats_update_pwr_up(PSCI_CPU_PWR_LVL, &state_info); #endif return PSCI_E_SUCCESS; diff --git a/lib/psci/psci_off.c b/lib/psci/psci_off.c index 897bf319..394aaa3b 100644 --- a/lib/psci/psci_off.c +++ b/lib/psci/psci_off.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved. 
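With the PMF timestamping removed above, PSCI now simply brackets the low-power window with plat_psci_stat_accounting_start()/plat_psci_stat_accounting_stop() and leaves the measurement to the platform. The diff only shows the call sites, so the sketch below is an assumption about what a platform might put behind them: per-CPU generic-counter snapshots using the tree's plat_my_core_pos() and (assumed) read_cntpct_el0() helpers, with hypothetical pwr_down_ts[]/pwr_up_ts[] arrays and assumed void/const prototypes.

#include <stdint.h>
#include <arch_helpers.h>   /* read_cntpct_el0() - assumed available here */
#include <platform.h>       /* plat_my_core_pos() */
#include <platform_def.h>   /* PLATFORM_CORE_COUNT */
#include <psci.h>           /* psci_power_state_t */

/* Hypothetical per-CPU timestamp storage for this sketch */
static uint64_t pwr_down_ts[PLATFORM_CORE_COUNT];
static uint64_t pwr_up_ts[PLATFORM_CORE_COUNT];

void plat_psci_stat_accounting_start(const psci_power_state_t *state_info)
{
	(void)state_info;
	/* Snapshot the counter just before entering the low power state */
	pwr_down_ts[plat_my_core_pos()] = read_cntpct_el0();
}

void plat_psci_stat_accounting_stop(const psci_power_state_t *state_info)
{
	(void)state_info;
	/* Snapshot again on the way out so residency can be derived later */
	pwr_up_ts[plat_my_core_pos()] = read_cntpct_el0();
}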
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -137,13 +137,7 @@ int psci_do_cpu_off(unsigned int end_pwrlvl) psci_plat_pm_ops->pwr_domain_off(&state_info); #if ENABLE_PSCI_STAT - /* - * Capture time-stamp while entering low power state. - * No cache maintenance needed because caches are off - * and writes are direct to main memory. - */ - PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR, - PMF_NO_CACHE_MAINT); + plat_psci_stat_accounting_start(&state_info); #endif exit: diff --git a/lib/psci/psci_private.h b/lib/psci/psci_private.h index 781b3b52..ca8291e4 100644 --- a/lib/psci/psci_private.h +++ b/lib/psci/psci_private.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,7 +35,6 @@ #include <bakery_lock.h> #include <bl_common.h> #include <cpu_data.h> -#include <pmf.h> #include <psci.h> #include <spinlock.h> @@ -106,15 +105,6 @@ #define is_cpu_standby_req(is_power_down_state, retn_lvl) \ (((!(is_power_down_state)) && ((retn_lvl) == 0)) ? 1 : 0) -/* Following are used as ID's to capture time-stamp */ -#define PSCI_STAT_ID_ENTER_LOW_PWR 0 -#define PSCI_STAT_ID_EXIT_LOW_PWR 1 -#define PSCI_STAT_TOTAL_IDS 2 - -/* Declare PMF service functions for PSCI */ -PMF_DECLARE_CAPTURE_TIMESTAMP(psci_svc) -PMF_DECLARE_GET_TIMESTAMP(psci_svc) - /******************************************************************************* * The following two data structures implement the power domain tree. The tree * is used to track the state of all the nodes i.e. power domain instances @@ -246,8 +236,7 @@ void __dead2 psci_system_reset(void); void psci_stats_update_pwr_down(unsigned int end_pwrlvl, const psci_power_state_t *state_info); void psci_stats_update_pwr_up(unsigned int end_pwrlvl, - const psci_power_state_t *state_info, - unsigned int flags); + const psci_power_state_t *state_info); u_register_t psci_stat_residency(u_register_t target_cpu, unsigned int power_state); u_register_t psci_stat_count(u_register_t target_cpu, diff --git a/lib/psci/psci_stat.c b/lib/psci/psci_stat.c index ecbe592b..d8034a5d 100644 --- a/lib/psci/psci_stat.c +++ b/lib/psci/psci_stat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,9 +38,6 @@ #define PLAT_MAX_PWR_LVL_STATES 2 #endif -/* Ticks elapsed in one second by a signal of 1 MHz */ -#define MHZ_TICKS_PER_SEC 1000000 - /* Following structure is used for PSCI STAT */ typedef struct psci_stat { u_register_t residency; @@ -62,27 +59,6 @@ static psci_stat_t psci_cpu_stat[PLATFORM_CORE_COUNT] static psci_stat_t psci_non_cpu_stat[PSCI_NUM_NON_CPU_PWR_DOMAINS] [PLAT_MAX_PWR_LVL_STATES]; -/* Register PMF PSCI service */ -PMF_REGISTER_SERVICE(psci_svc, PMF_PSCI_STAT_SVC_ID, - PSCI_STAT_TOTAL_IDS, PMF_STORE_ENABLE) - -/* The divisor to use to convert raw timestamp into microseconds */ -u_register_t residency_div; - -/* - * This macro calculates the stats residency in microseconds, - * taking in account the wrap around condition. - */ -#define calc_stat_residency(_pwrupts, _pwrdnts, _res) \ - do { \ - if (_pwrupts < _pwrdnts) \ - _res = UINT64_MAX - _pwrdnts + _pwrupts;\ - else \ - _res = _pwrupts - _pwrdnts; \ - /* Convert timestamp into microseconds */ \ - _res = _res/residency_div; \ - } while (0) - /* * This functions returns the index into the `psci_stat_t` array given the * local power state and power domain level. If the platform implements the @@ -150,44 +126,23 @@ void psci_stats_update_pwr_down(unsigned int end_pwrlvl, * It is called with caches enabled and locks acquired(for NON-CPU domain) ******************************************************************************/ void psci_stats_update_pwr_up(unsigned int end_pwrlvl, - const psci_power_state_t *state_info, - unsigned int flags) + const psci_power_state_t *state_info) { int parent_idx, cpu_idx = plat_my_core_pos(); int lvl, stat_idx; plat_local_state_t local_state; - unsigned long long pwrup_ts = 0, pwrdn_ts = 0; u_register_t residency; assert(end_pwrlvl <= PLAT_MAX_PWR_LVL); assert(state_info); - /* Initialize the residency divisor if not already initialized */ - if (!residency_div) { - /* Pre-calculate divisor so that it can be directly used to - convert time-stamp into microseconds */ - residency_div = read_cntfrq_el0() / MHZ_TICKS_PER_SEC; - assert(residency_div); - } - - /* Get power down time-stamp for current CPU */ - PMF_GET_TIMESTAMP_BY_INDEX(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR, - cpu_idx, flags, pwrdn_ts); - - /* In the case of 1st power on just return */ - if (!pwrdn_ts) - return; - - /* Get power up time-stamp for current CPU */ - PMF_GET_TIMESTAMP_BY_INDEX(psci_svc, PSCI_STAT_ID_EXIT_LOW_PWR, - cpu_idx, flags, pwrup_ts); - /* Get the index into the stats array */ local_state = state_info->pwr_domain_state[PSCI_CPU_PWR_LVL]; stat_idx = get_stat_idx(local_state, PSCI_CPU_PWR_LVL); - /* Calculate stats residency */ - calc_stat_residency(pwrup_ts, pwrdn_ts, residency); + /* Call into platform interface to calculate residency. */ + residency = plat_psci_stat_get_residency(PSCI_CPU_PWR_LVL, + state_info, cpu_idx); /* Update CPU stats. */ psci_cpu_stat[cpu_idx][stat_idx].residency += residency; @@ -207,10 +162,9 @@ void psci_stats_update_pwr_up(unsigned int end_pwrlvl, assert(last_cpu_in_non_cpu_pd[parent_idx] != -1); - /* Get power down time-stamp for last CPU */ - PMF_GET_TIMESTAMP_BY_INDEX(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR, - last_cpu_in_non_cpu_pd[parent_idx], - flags, pwrdn_ts); + /* Call into platform interface to calculate residency. 
*/ + residency = plat_psci_stat_get_residency(lvl, state_info, + last_cpu_in_non_cpu_pd[parent_idx]); /* Initialize back to reset value */ last_cpu_in_non_cpu_pd[parent_idx] = -1; @@ -218,9 +172,6 @@ void psci_stats_update_pwr_up(unsigned int end_pwrlvl, /* Get the index into the stats array */ stat_idx = get_stat_idx(local_state, lvl); - /* Calculate stats residency */ - calc_stat_residency(pwrup_ts, pwrdn_ts, residency); - /* Update non cpu stats */ psci_non_cpu_stat[parent_idx][stat_idx].residency += residency; psci_non_cpu_stat[parent_idx][stat_idx].count++; diff --git a/lib/psci/psci_suspend.c b/lib/psci/psci_suspend.c index dc2ab774..302116bd 100644 --- a/lib/psci/psci_suspend.c +++ b/lib/psci/psci_suspend.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved. + * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -211,13 +211,7 @@ void psci_cpu_suspend_start(entry_point_info_t *ep, psci_plat_pm_ops->pwr_domain_suspend(state_info); #if ENABLE_PSCI_STAT - /* - * Capture time-stamp while entering low power state. - * No cache maintenance needed because caches are off - * and writes are direct to main memory. - */ - PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR, - PMF_NO_CACHE_MAINT); + plat_psci_stat_accounting_start(state_info); #endif exit: @@ -257,6 +251,10 @@ exit: PMF_NO_CACHE_MAINT); #endif +#if ENABLE_PSCI_STAT + plat_psci_stat_accounting_start(state_info); +#endif + /* * We will reach here if only retention/standby states have been * requested at multiple power levels. This means that the cpu @@ -264,6 +262,11 @@ exit: */ wfi(); +#if ENABLE_PSCI_STAT + plat_psci_stat_accounting_stop(state_info); + psci_stats_update_pwr_up(end_pwrlvl, state_info); +#endif + #if ENABLE_RUNTIME_INSTRUMENTATION PMF_CAPTURE_TIMESTAMP(rt_instr_svc, RT_INSTR_EXIT_HW_LOW_PWR, |
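The deleted calc_stat_residency() macro and residency_div divisor do not vanish; the equivalent arithmetic now lives behind plat_psci_stat_get_residency(), which the updated psci_stats_update_pwr_up() calls per power level. Continuing the hypothetical timestamp arrays from the sketch above (the signature is inferred from the call sites and is an assumption), a platform version that reuses the deleted wrap-around and tick-to-microsecond logic could look like this:

u_register_t plat_psci_stat_get_residency(unsigned int lvl,
					  const psci_power_state_t *state_info,
					  int last_cpu_idx)
{
	/* pwr_down_ts[]/pwr_up_ts[] are the hypothetical arrays filled by the
	 * accounting hooks sketched earlier. */
	uint64_t pwrdn_ts = pwr_down_ts[last_cpu_idx];
	uint64_t pwrup_ts = pwr_up_ts[last_cpu_idx];
	uint64_t res;

	(void)lvl;
	(void)state_info;

	/* Same wrap-around handling as the deleted calc_stat_residency();
	 * a production version would also handle the first power-up, where
	 * no power-down timestamp exists (the old code returned early). */
	if (pwrup_ts < pwrdn_ts)
		res = UINT64_MAX - pwrdn_ts + pwrup_ts;
	else
		res = pwrup_ts - pwrdn_ts;

	/* Convert ticks to microseconds; 1000000 is the removed
	 * MHZ_TICKS_PER_SEC constant. */
	return (u_register_t)(res / (read_cntfrq_el0() / 1000000U));
}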