Diffstat (limited to 'libc/ports')
-rw-r--r--  libc/ports/ChangeLog.aarch64                                   39
-rw-r--r--  libc/ports/ChangeLog.hppa                                       8
-rw-r--r--  libc/ports/ChangeLog.m68k                                       5
-rw-r--r--  libc/ports/ChangeLog.mips                                       5
-rw-r--r--  libc/ports/ChangeLog.tile                                       6
-rw-r--r--  libc/ports/sysdeps/aarch64/bits/setjmp.h                        2
-rw-r--r--  libc/ports/sysdeps/aarch64/bzero.S                             27
-rw-r--r--  libc/ports/sysdeps/aarch64/memcmp.S                           151
-rw-r--r--  libc/ports/sysdeps/aarch64/memcpy.S                           176
-rw-r--r--  libc/ports/sysdeps/aarch64/memmove.S                          312
-rw-r--r--  libc/ports/sysdeps/aarch64/memset.S                           229
-rw-r--r--  libc/ports/sysdeps/aarch64/strcmp.S                           155
-rw-r--r--  libc/ports/sysdeps/aarch64/strlen.S                           117
-rw-r--r--  libc/ports/sysdeps/aarch64/sysdep.h                            31
-rw-r--r--  libc/ports/sysdeps/arm/Makefile                                 5
-rw-r--r--  libc/ports/sysdeps/hppa/fpu/fpu_control.h                       4
-rw-r--r--  libc/ports/sysdeps/m68k/bits/byteswap.h                         1
-rw-r--r--  libc/ports/sysdeps/mips/Makefile                                5
-rw-r--r--  libc/ports/sysdeps/unix/sysv/linux/hppa/bits/ipc.h              4
-rw-r--r--  libc/ports/sysdeps/unix/sysv/linux/mips/bits/sigcontext.h      32
-rw-r--r--  libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/configure        4
-rw-r--r--  libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/configure.in     4
-rw-r--r--  libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/ldd-rewrite.sed  1
23 files changed, 1281 insertions, 42 deletions
diff --git a/libc/ports/ChangeLog.aarch64 b/libc/ports/ChangeLog.aarch64
index dd1d64cb6..c7487f5b7 100644
--- a/libc/ports/ChangeLog.aarch64
+++ b/libc/ports/ChangeLog.aarch64
@@ -1,3 +1,42 @@
+2013-01-17 Marcus Shawcroft <marcus.shawcroft@linaro.org>
+
+ * sysdeps/aarch64/strlen.S: New file.
+
+2013-01-17 Marcus Shawcroft <marcus.shawcroft@linaro.org>
+
+ * sysdeps/aarch64/strcmp.S: New file.
+
+2013-01-17 Marcus Shawcroft <marcus.shawcroft@linaro.org>
+
+ * sysdeps/aarch64/bzero.S: New file.
+
+2013-01-17 Marcus Shawcroft <marcus.shawcroft@linaro.org>
+
+ * sysdeps/aarch64/memmove.S: New file.
+
+2013-01-17 Marcus Shawcroft <marcus.shawcroft@linaro.org>
+
+ * sysdeps/aarch64/memcpy.S: New file.
+
+2013-01-17 Marcus Shawcroft <marcus.shawcroft@linaro.org>
+
+ * sysdeps/aarch64/memset.S: New file.
+
+2013-01-17 Marcus Shawcroft <marcus.shawcroft@linaro.org>
+
+ * sysdeps/aarch64/sysdep.h (ENTRY_ALIGN): New.
+ * sysdeps/aarch64/memcmp.S: New file.
+
+2013-01-17 Marcus Shawcroft <marcus.shawcroft@linaro.org>
+
+ * sysdeps/aarch64/sysdep.h (ENTRY, END): Adjust
+ whitespace.
+
+2013-01-10 Joseph Myers <joseph@codesourcery.com>
+
+ * sysdeps/aarch64/bits/setjmp.h (__jmp_buf): Use __extension__
+ with long long.
+
2013-01-02 Joseph Myers <joseph@codesourcery.com>

 * All files with FSF copyright notices: Update copyright dates
diff --git a/libc/ports/ChangeLog.hppa b/libc/ports/ChangeLog.hppa
index 2b798529b..ef904c267 100644
--- a/libc/ports/ChangeLog.hppa
+++ b/libc/ports/ChangeLog.hppa
@@ -1,3 +1,11 @@
+2013-01-10 Joseph Myers <joseph@codesourcery.com>
+
+ * sysdeps/hppa/fpu/fpu_control.h (_FPU_GETCW): Use __extension__
+ with long long.
+ (_FPU_SETCW): Likewise.
+ * sysdeps/unix/sysv/linux/hppa/bits/ipc.h (struct ipc_perm):
+ Likewise.
+
2013-01-08 Andreas Jaeger <aj@suse.de>

 [BZ #14985]
diff --git a/libc/ports/ChangeLog.m68k b/libc/ports/ChangeLog.m68k
index b6e93669c..20eaac716 100644
--- a/libc/ports/ChangeLog.m68k
+++ b/libc/ports/ChangeLog.m68k
@@ -1,3 +1,8 @@
+2013-01-10 Joseph Myers <joseph@codesourcery.com>
+
+ * sysdeps/m68k/bits/byteswap.h (__bswap_64): Use __extension__
+ with long long.
+
2013-01-04 Andreas Schwab <schwab@suse.de>

 * sysdeps/m68k/m680x0/fpu/libm-test-ulps: Update.
diff --git a/libc/ports/ChangeLog.mips b/libc/ports/ChangeLog.mips
index 5a80540bc..9e2bcffc6 100644
--- a/libc/ports/ChangeLog.mips
+++ b/libc/ports/ChangeLog.mips
@@ -1,3 +1,8 @@
+2013-01-10 Joseph Myers <joseph@codesourcery.com>
+
+ * sysdeps/unix/sysv/linux/mips/bits/sigcontext.h (struct
+ sigcontext): Use __extension__ with long long in all definitions.
+
2013-01-08 Steve Ellcey <sellcey@mips.com>

 * sysdeps/mips/memcpy.S: Change prefetch hint, reorder partial
diff --git a/libc/ports/ChangeLog.tile b/libc/ports/ChangeLog.tile
index e70742bb3..a5eaec3a9 100644
--- a/libc/ports/ChangeLog.tile
+++ b/libc/ports/ChangeLog.tile
@@ -1,3 +1,9 @@
+2013-01-10 Chris Metcalf <cmetcalf@tilera.com>
+
+ * sysdeps/unix/sysv/linux/tile/tilegx/ldd-rewrite.sed: New file.
+ * sysdeps/unix/sysv/linux/tile/tilegx/configure.in: New file.
+ * sysdeps/unix/sysv/linux/tile/tilegx/configure: New file.
+
2013-01-02 Joseph Myers <joseph@codesourcery.com>

 * All files with FSF copyright notices: Update copyright dates
diff --git a/libc/ports/sysdeps/aarch64/bits/setjmp.h b/libc/ports/sysdeps/aarch64/bits/setjmp.h
index 6d78b7e87..6a93e0a25 100644
--- a/libc/ports/sysdeps/aarch64/bits/setjmp.h
+++ b/libc/ports/sysdeps/aarch64/bits/setjmp.h
@@ -27,7 +27,7 @@
/* Jump buffer contains:
x19-x28, x29(fp), x30(lr), (x31)sp, d8-d15. Other registers are not
saved. */
-typedef unsigned long long __jmp_buf [22];
+__extension__ typedef unsigned long long __jmp_buf [22];

#endif
#endif
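
The __extension__ additions in this commit (here and in the hppa, m68k, and mips headers below) all address the same issue: installed headers must compile cleanly under -std=c90 -pedantic, where a bare `long long` draws an ISO C90 diagnostic, and __extension__ suppresses it. A minimal sketch of the effect (ordinary GCC behavior, not code from this commit):

    /* Compile with: gcc -std=c90 -pedantic -c t.c */
    typedef unsigned long long a_buf[22];               /* warning: ISO C90 does
                                                           not support 'long long' */
    __extension__ typedef unsigned long long b_buf[22]; /* accepted silently */
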
diff --git a/libc/ports/sysdeps/aarch64/bzero.S b/libc/ports/sysdeps/aarch64/bzero.S
new file mode 100644
index 000000000..228c0a5f3
--- /dev/null
+++ b/libc/ports/sysdeps/aarch64/bzero.S
@@ -0,0 +1,27 @@
+/* Copyright (C) 2013 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation; either version 2.1 of the
+ License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY(__bzero)
+ mov x2, x1
+ mov x1, xzr
+ b __memset
+END(__bzero)
+weak_alias (__bzero, bzero)
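
The three instructions above are the usual reduction of bzero to memset: "mov x2, x1" moves the length into memset's third argument register, "mov x1, xzr" zeroes the fill value, and "b __memset" tail-calls. A C rendering of the same shuffle (hypothetical name, for illustration only):

    #include <string.h>

    /* bzero (s, n) is exactly memset (s, 0, n).  */
    static void bzero_model (void *s, size_t n)
    {
      memset (s, 0, n);
    }
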
diff --git a/libc/ports/sysdeps/aarch64/memcmp.S b/libc/ports/sysdeps/aarch64/memcmp.S
new file mode 100644
index 000000000..6398ddd3e
--- /dev/null
+++ b/libc/ports/sysdeps/aarch64/memcmp.S
@@ -0,0 +1,151 @@
+/* memcmp - compare memory
+
+ Copyright (C) 2013 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define limit x2
+#define result x0
+
+/* Internal variables. */
+#define data1 x3
+#define data1w w3
+#define data2 x4
+#define data2w w4
+#define has_nul x5
+#define diff x6
+#define endloop x7
+#define tmp1 x8
+#define tmp2 x9
+#define tmp3 x10
+#define pos x11
+#define limit_wd x12
+#define mask x13
+
+ENTRY_ALIGN (memcmp, 6)
+ cbz limit, L(ret0)
+ eor tmp1, src1, src2
+ tst tmp1, #7
+ b.ne L(misaligned8)
+ ands tmp1, src1, #7
+ b.ne L(mutual_align)
+ add limit_wd, limit, #7
+ lsr limit_wd, limit_wd, #3
+ /* Start of performance-critical section -- one 64B cache line. */
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ subs limit_wd, limit_wd, #1
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ csinv endloop, diff, xzr, ne /* Last Dword or differences. */
+ cbz endloop, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+ /* Not reached the limit, must have found a diff. */
+ cbnz limit_wd, L(not_limit)
+
+ /* Limit % 8 == 0 => all bytes significant. */
+ ands limit, limit, #7
+ b.eq L(not_limit)
+
+ lsl limit, limit, #3 /* Bits -> bytes. */
+ mov mask, #~0
+#ifdef __AARCH64EB__
+ lsr mask, mask, limit
+#else
+ lsl mask, mask, limit
+#endif
+ bic data1, data1, mask
+ bic data2, data2, mask
+
+ orr diff, diff, mask
+L(not_limit):
+
+#ifndef __AARCH64EB__
+ rev diff, diff
+ rev data1, data1
+ rev data2, data2
+#endif
+ /* The MS-non-zero bit of DIFF marks either the first bit
+ that is different, or the end of the significant data.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, diff
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ RET
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ add limit, limit, tmp1 /* Adjust the limit for the extra. */
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ add limit_wd, limit, #7
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ lsr limit_wd, limit_wd, #3
+ b L(start_realigned)
+
+L(ret0):
+ mov result, #0
+ RET
+
+ .p2align 6
+L(misaligned8):
+ sub limit, limit, #1
+1:
+ /* Perhaps we can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ subs limit, limit, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq 1b
+ sub result, data1, data2
+ RET
+END (memcmp)
+#undef bcmp
+weak_alias (memcmp, bcmp)
+libc_hidden_builtin_def (memcmp)
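
The aligned loop above compares one doubleword per iteration; csinv folds the counter into the mismatch test, producing diff while iterations remain and all-ones once limit_wd reaches zero, so the single cbz covers both exit conditions. A rough little-endian C model of this path (hypothetical name; the asm locates the differing byte with rev/clz and masks the partial tail word rather than using a byte loop):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static int memcmp_model (const void *s1, const void *s2, size_t limit)
    {
      const unsigned char *p1 = s1, *p2 = s2;
      size_t i = 0;
      for (; i + 8 <= limit; i += 8)        /* L(loop_aligned) */
        {
          uint64_t d1, d2;
          memcpy (&d1, p1 + i, 8);          /* ldr data1, [src1], #8 */
          memcpy (&d2, p2 + i, 8);          /* ldr data2, [src2], #8 */
          if (d1 ^ d2)                      /* eor diff, data1, data2 */
            break;
        }
      for (; i < limit; i++)                /* find the first differing byte */
        if (p1[i] != p2[i])
          return p1[i] - p2[i];
      return 0;
    }
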
diff --git a/libc/ports/sysdeps/aarch64/memcpy.S b/libc/ports/sysdeps/aarch64/memcpy.S
new file mode 100644
index 000000000..4f4e36c06
--- /dev/null
+++ b/libc/ports/sysdeps/aarch64/memcpy.S
@@ -0,0 +1,176 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ *
+ */
+
+#define dstin x0
+#define src x1
+#define count x2
+#define tmp1 x3
+#define tmp1w w3
+#define tmp2 x4
+#define tmp2w w4
+#define tmp3 x5
+#define tmp3w w5
+#define dst x6
+
+#define A_l x7
+#define A_h x8
+#define B_l x9
+#define B_h x10
+#define C_l x11
+#define C_h x12
+#define D_l x13
+#define D_h x14
+
+#include <sysdep.h>
+
+ENTRY_ALIGN (memcpy, 6)
+
+ mov dst, dstin
+ cmp count, #64
+ b.ge L(cpy_not_short)
+ cmp count, #15
+ b.le L(tail15tiny)
+
+ /* Deal with small copies quickly by dropping straight into the
+ * exit block. */
+L(tail63):
+ /* Copy up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate. */
+ ands tmp1, count, #0x30
+ b.eq L(tail15)
+ add dst, dst, tmp1
+ add src, src, tmp1
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ ldp A_l, A_h, [src, #-48]
+ stp A_l, A_h, [dst, #-48]
+1:
+ ldp A_l, A_h, [src, #-32]
+ stp A_l, A_h, [dst, #-32]
+2:
+ ldp A_l, A_h, [src, #-16]
+ stp A_l, A_h, [dst, #-16]
+
+L(tail15):
+ ands count, count, #15
+ b.eq 1f
+ add src, src, count
+ ldp A_l, A_h, [src, #-16]
+ add dst, dst, count
+ stp A_l, A_h, [dst, #-16]
+1:
+ RET
+
+L(tail15tiny):
+ /* Copy up to 15 bytes of data. Does not assume additional data
+ being copied. */
+ tbz count, #3, 1f
+ ldr tmp1, [src], #8
+ str tmp1, [dst], #8
+1:
+ tbz count, #2, 1f
+ ldr tmp1w, [src], #4
+ str tmp1w, [dst], #4
+1:
+ tbz count, #1, 1f
+ ldrh tmp1w, [src], #2
+ strh tmp1w, [dst], #2
+1:
+ tbz count, #0, 1f
+ ldrb tmp1w, [src]
+ strb tmp1w, [dst]
+1:
+ RET
+
+L(cpy_not_short):
+ /* We don't much care about the alignment of DST, but we want SRC
+ * to be 128-bit (16 byte) aligned so that we don't cross cache line
+ * boundaries on both loads and stores. */
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq 2f
+ sub count, count, tmp2
+ /* Copy more data than needed; it's faster than jumping
+ * around copying sub-Quadword quantities. We know that
+ * it can't overrun. */
+ ldp A_l, A_h, [src]
+ add src, src, tmp2
+ stp A_l, A_h, [dst]
+ add dst, dst, tmp2
+ /* There may be less than 63 bytes to go now. */
+ cmp count, #63
+ b.le L(tail63)
+2:
+ subs count, count, #128
+ b.ge L(cpy_body_large)
+ /* Less than 128 bytes to copy, so handle 64 here and then jump
+ * to the tail. */
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]
+ stp A_l, A_h, [dst]
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+ stp D_l, D_h, [dst, #48]
+ tst count, #0x3f
+ add src, src, #64
+ add dst, dst, #64
+ b.ne L(tail63)
+ RET
+
+ /* Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line. */
+ .p2align 6
+L(cpy_body_large):
+ /* There are at least 128 bytes to copy. */
+ ldp A_l, A_h, [src, #0]
+ sub dst, dst, #16 /* Pre-bias. */
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
+1:
+ stp A_l, A_h, [dst, #16]
+ ldp A_l, A_h, [src, #16]
+ stp B_l, B_h, [dst, #32]
+ ldp B_l, B_h, [src, #32]
+ stp C_l, C_h, [dst, #48]
+ ldp C_l, C_h, [src, #48]
+ stp D_l, D_h, [dst, #64]!
+ ldp D_l, D_h, [src, #64]!
+ subs count, count, #64
+ b.ge 1b
+ stp A_l, A_h, [dst, #16]
+ stp B_l, B_h, [dst, #32]
+ stp C_l, C_h, [dst, #48]
+ stp D_l, D_h, [dst, #64]
+ add src, src, #16
+ add dst, dst, #64 + 16
+ tst count, #0x3f
+ b.ne L(tail63)
+ RET
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
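
The prologue at L(cpy_not_short) over-copies on purpose: with at least 64 bytes guaranteed to remain, it stores one full 16-byte chunk from the unaligned start, then advances source and destination by only the skew needed to align SRC, re-copying the overlap instead of branching over sub-quadword sizes. A C sketch of just that step (hypothetical helper, assuming count >= 64 as the asm does):

    #include <stdint.h>
    #include <string.h>

    /* Align *src to 16 bytes by over-copying: store one full 16-byte
       chunk, then step forward only as far as needed for alignment.  */
    static void align_head (unsigned char **dst, const unsigned char **src,
                            uint64_t *count)
    {
      uint64_t skew = (uint64_t) -(uintptr_t) *src & 15;  /* neg/ands tmp2 */
      if (skew == 0)
        return;
      memcpy (*dst, *src, 16);   /* ldp/stp A_l, A_h: may re-copy bytes */
      *src += skew;              /* add src, src, tmp2 */
      *dst += skew;              /* add dst, dst, tmp2 */
      *count -= skew;            /* sub count, count, tmp2 */
    }
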
diff --git a/libc/ports/sysdeps/aarch64/memmove.S b/libc/ports/sysdeps/aarch64/memmove.S
new file mode 100644
index 000000000..c42eb1c13
--- /dev/null
+++ b/libc/ports/sysdeps/aarch64/memmove.S
@@ -0,0 +1,312 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ */
+
+/* Parameters and result. */
+#define dstin x0
+#define src x1
+#define count x2
+#define tmp1 x3
+#define tmp1w w3
+#define tmp2 x4
+#define tmp2w w4
+#define tmp3 x5
+#define tmp3w w5
+#define dst x6
+
+#define A_l x7
+#define A_h x8
+#define B_l x9
+#define B_h x10
+#define C_l x11
+#define C_h x12
+#define D_l x13
+#define D_h x14
+
+ENTRY_ALIGN (memmove, 6)
+
+ cmp dstin, src
+ b.lo L(downwards)
+ add tmp1, src, count
+ cmp dstin, tmp1
+ b.hs memcpy /* No overlap. */
+
+ /* Upwards move with potential overlap.
+ * Need to move from the tail backwards. SRC and DST point one
+ * byte beyond the remaining data to move. */
+ add dst, dstin, count
+ add src, src, count
+ cmp count, #64
+ b.ge L(mov_not_short_up)
+
+ /* Deal with small moves quickly by dropping straight into the
+ * exit block. */
+L(tail63up):
+ /* Move up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate. */
+ ands tmp1, count, #0x30
+ b.eq L(tail15up)
+ sub dst, dst, tmp1
+ sub src, src, tmp1
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ ldp A_l, A_h, [src, #32]
+ stp A_l, A_h, [dst, #32]
+1:
+ ldp A_l, A_h, [src, #16]
+ stp A_l, A_h, [dst, #16]
+2:
+ ldp A_l, A_h, [src]
+ stp A_l, A_h, [dst]
+L(tail15up):
+ /* Move up to 15 bytes of data. Does not assume additional data
+ * being moved. */
+ tbz count, #3, 1f
+ ldr tmp1, [src, #-8]!
+ str tmp1, [dst, #-8]!
+1:
+ tbz count, #2, 1f
+ ldr tmp1w, [src, #-4]!
+ str tmp1w, [dst, #-4]!
+1:
+ tbz count, #1, 1f
+ ldrh tmp1w, [src, #-2]!
+ strh tmp1w, [dst, #-2]!
+1:
+ tbz count, #0, 1f
+ ldrb tmp1w, [src, #-1]
+ strb tmp1w, [dst, #-1]
+1:
+ RET
+
+L(mov_not_short_up):
+ /* We don't much care about the alignment of DST, but we want SRC
+ * to be 128-bit (16 byte) aligned so that we don't cross cache line
+ * boundaries on both loads and stores. */
+ ands tmp2, src, #15 /* Bytes to reach alignment. */
+ b.eq 2f
+ sub count, count, tmp2
+ /* Move enough data to reach alignment; unlike memcpy, we have to
+ * be aware of the overlap, which means we can't move data twice. */
+ tbz tmp2, #3, 1f
+ ldr tmp1, [src, #-8]!
+ str tmp1, [dst, #-8]!
+1:
+ tbz tmp2, #2, 1f
+ ldr tmp1w, [src, #-4]!
+ str tmp1w, [dst, #-4]!
+1:
+ tbz tmp2, #1, 1f
+ ldrh tmp1w, [src, #-2]!
+ strh tmp1w, [dst, #-2]!
+1:
+ tbz tmp2, #0, 1f
+ ldrb tmp1w, [src, #-1]!
+ strb tmp1w, [dst, #-1]!
+1:
+
+ /* There may be less than 63 bytes to go now. */
+ cmp count, #63
+ b.le L(tail63up)
+2:
+ subs count, count, #128
+ b.ge L(mov_body_large_up)
+ /* Less than 128 bytes to move, so handle 64 here and then jump
+ * to the tail. */
+ ldp A_l, A_h, [src, #-64]!
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]
+ stp A_l, A_h, [dst, #-64]!
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+ stp D_l, D_h, [dst, #48]
+ tst count, #0x3f
+ b.ne L(tail63up)
+ RET
+
+ /* Critical loop. Start at a new Icache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line. */
+ .p2align 6
+L(mov_body_large_up):
+ /* There are at least 128 bytes to move. */
+ ldp A_l, A_h, [src, #-16]
+ ldp B_l, B_h, [src, #-32]
+ ldp C_l, C_h, [src, #-48]
+ ldp D_l, D_h, [src, #-64]!
+1:
+ stp A_l, A_h, [dst, #-16]
+ ldp A_l, A_h, [src, #-16]
+ stp B_l, B_h, [dst, #-32]
+ ldp B_l, B_h, [src, #-32]
+ stp C_l, C_h, [dst, #-48]
+ ldp C_l, C_h, [src, #-48]
+ stp D_l, D_h, [dst, #-64]!
+ ldp D_l, D_h, [src, #-64]!
+ subs count, count, #64
+ b.ge 1b
+ stp A_l, A_h, [dst, #-16]
+ stp B_l, B_h, [dst, #-32]
+ stp C_l, C_h, [dst, #-48]
+ stp D_l, D_h, [dst, #-64]!
+ tst count, #0x3f
+ b.ne L(tail63up)
+ RET
+
+L(downwards):
+ /* For a downwards move we can safely use memcpy provided that
+ * DST is more than 16 bytes away from SRC. */
+ sub tmp1, src, #16
+ cmp dstin, tmp1
+ b.ls memcpy /* May overlap, but not critically. */
+
+ mov dst, dstin /* Preserve DSTIN for return value. */
+ cmp count, #64
+ b.ge L(mov_not_short_down)
+
+ /* Deal with small moves quickly by dropping straight into the
+ * exit block. */
+L(tail63down):
+ /* Move up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate. */
+ ands tmp1, count, #0x30
+ b.eq L(tail15down)
+ add dst, dst, tmp1
+ add src, src, tmp1
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ ldp A_l, A_h, [src, #-48]
+ stp A_l, A_h, [dst, #-48]
+1:
+ ldp A_l, A_h, [src, #-32]
+ stp A_l, A_h, [dst, #-32]
+2:
+ ldp A_l, A_h, [src, #-16]
+ stp A_l, A_h, [dst, #-16]
+L(tail15down):
+ /* Move up to 15 bytes of data. Does not assume additional data
+ being moved. */
+ tbz count, #3, 1f
+ ldr tmp1, [src], #8
+ str tmp1, [dst], #8
+1:
+ tbz count, #2, 1f
+ ldr tmp1w, [src], #4
+ str tmp1w, [dst], #4
+1:
+ tbz count, #1, 1f
+ ldrh tmp1w, [src], #2
+ strh tmp1w, [dst], #2
+1:
+ tbz count, #0, 1f
+ ldrb tmp1w, [src]
+ strb tmp1w, [dst]
+1:
+ RET
+
+L(mov_not_short_down):
+ /* We don't much care about the alignment of DST, but we want SRC
+ * to be 128-bit (16 byte) aligned so that we don't cross cache line
+ * boundaries on both loads and stores. */
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq 2f
+ sub count, count, tmp2
+ /* Move enough data to reach alignment; unlike memcpy, we have to
+ * be aware of the overlap, which means we can't move data twice. */
+ tbz tmp2, #3, 1f
+ ldr tmp1, [src], #8
+ str tmp1, [dst], #8
+1:
+ tbz tmp2, #2, 1f
+ ldr tmp1w, [src], #4
+ str tmp1w, [dst], #4
+1:
+ tbz tmp2, #1, 1f
+ ldrh tmp1w, [src], #2
+ strh tmp1w, [dst], #2
+1:
+ tbz tmp2, #0, 1f
+ ldrb tmp1w, [src], #1
+ strb tmp1w, [dst], #1
+1:
+
+ /* There may be less than 63 bytes to go now. */
+ cmp count, #63
+ b.le L(tail63down)
+2:
+ subs count, count, #128
+ b.ge L(mov_body_large_down)
+ /* Less than 128 bytes to move, so handle 64 here and then jump
+ * to the tail. */
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]
+ stp A_l, A_h, [dst]
+ stp B_l, B_h, [dst, #16]
+ stp C_l, C_h, [dst, #32]
+ stp D_l, D_h, [dst, #48]
+ tst count, #0x3f
+ add src, src, #64
+ add dst, dst, #64
+ b.ne L(tail63down)
+ RET
+
+ /* Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line. */
+ .p2align 6
+L(mov_body_large_down):
+ /* There are at least 128 bytes to move. */
+ ldp A_l, A_h, [src, #0]
+ sub dst, dst, #16 /* Pre-bias. */
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
+1:
+ stp A_l, A_h, [dst, #16]
+ ldp A_l, A_h, [src, #16]
+ stp B_l, B_h, [dst, #32]
+ ldp B_l, B_h, [src, #32]
+ stp C_l, C_h, [dst, #48]
+ ldp C_l, C_h, [src, #48]
+ stp D_l, D_h, [dst, #64]!
+ ldp D_l, D_h, [src, #64]!
+ subs count, count, #64
+ b.ge 1b
+ stp A_l, A_h, [dst, #16]
+ stp B_l, B_h, [dst, #32]
+ stp C_l, C_h, [dst, #48]
+ stp D_l, D_h, [dst, #64]
+ add src, src, #16
+ add dst, dst, #64 + 16
+ tst count, #0x3f
+ b.ne L(tail63down)
+ RET
+END (memmove)
+
+libc_hidden_builtin_def (memmove)
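
The overlap analysis at the entry is worth spelling out: DST below SRC means an ascending copy is safe (and a plain memcpy is used when the gap exceeds 16 bytes, since memcpy's over-copying prologue needs that slack); DST at or beyond SRC+count means no overlap at all; anything else must copy from the tail downwards. A C sketch of the decision structure (hypothetical name):

    #include <stddef.h>
    #include <string.h>

    void *memmove_model (void *dstin, const void *srcin, size_t count)
    {
      char *dst = dstin;
      const char *src = srcin;

      if (dst < src)                    /* cmp dstin, src; b.lo L(downwards) */
        {
          /* Downwards move: ascending copy never overwrites unread bytes.  */
          while (count--)
            *dst++ = *src++;
        }
      else if (dst >= src + count)      /* add tmp1, src, count; b.hs memcpy */
        memcpy (dst, src, count);       /* No overlap.  */
      else
        {
          /* Upwards move with overlap: start at the tail and move backwards
             so no source byte is overwritten before it has been read.  */
          while (count--)
            dst[count] = src[count];
        }
      return dstin;
    }
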
diff --git a/libc/ports/sysdeps/aarch64/memset.S b/libc/ports/sysdeps/aarch64/memset.S
new file mode 100644
index 000000000..f96f6a6ba
--- /dev/null
+++ b/libc/ports/sysdeps/aarch64/memset.S
@@ -0,0 +1,229 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Unaligned accesses
+ *
+ */
+
+#include <sysdep.h>
+
+/* By default we assume that the DC instruction can be used to zero
+ data blocks more efficiently. In some circumstances this might be
+ unsafe, for example in an asymmetric multiprocessor environment with
+ different DC clear lengths (neither the upper nor lower lengths are
+ safe to use). The feature can be disabled by defining DONT_USE_DC.
+
+ If code may be run in a virtualized environment, then define
+ MAYBE_VIRT. This will cause the code to cache the system register
+ values rather than re-reading them each call. */
+
+#define dstin x0
+#define val w1
+#define count x2
+#define tmp1 x3
+#define tmp1w w3
+#define tmp2 x4
+#define tmp2w w4
+#define zva_len_x x5
+#define zva_len w5
+#define zva_bits_x x6
+
+#define A_l x7
+#define A_lw w7
+#define dst x8
+#define tmp3w w9
+
+ENTRY_ALIGN (__memset, 6)
+
+ mov dst, dstin /* Preserve return value. */
+ ands A_lw, val, #255
+#ifndef DONT_USE_DC
+ b.eq L(zero_mem)
+#endif
+ orr A_lw, A_lw, A_lw, lsl #8
+ orr A_lw, A_lw, A_lw, lsl #16
+ orr A_l, A_l, A_l, lsl #32
+L(tail_maybe_long):
+ cmp count, #64
+ b.ge L(not_short)
+L(tail_maybe_tiny):
+ cmp count, #15
+ b.le L(tail15tiny)
+L(tail63):
+ ands tmp1, count, #0x30
+ b.eq L(tail15)
+ add dst, dst, tmp1
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ stp A_l, A_l, [dst, #-48]
+1:
+ stp A_l, A_l, [dst, #-32]
+2:
+ stp A_l, A_l, [dst, #-16]
+
+L(tail15):
+ and count, count, #15
+ add dst, dst, count
+ stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
+ RET
+
+L(tail15tiny):
+ /* Set up to 15 bytes. Does not assume earlier memory
+ being set. */
+ tbz count, #3, 1f
+ str A_l, [dst], #8
+1:
+ tbz count, #2, 1f
+ str A_lw, [dst], #4
+1:
+ tbz count, #1, 1f
+ strh A_lw, [dst], #2
+1:
+ tbz count, #0, 1f
+ strb A_lw, [dst]
+1:
+ RET
+
+ /* Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line, this ensures the entire loop is in one line. */
+ .p2align 6
+L(not_short):
+ neg tmp2, dst
+ ands tmp2, tmp2, #15
+ b.eq 2f
+ /* Bring DST to 128-bit (16-byte) alignment. We know that there's
+ * more than that to set, so we simply store 16 bytes and advance by
+ * the amount required to reach alignment. */
+ sub count, count, tmp2
+ stp A_l, A_l, [dst]
+ add dst, dst, tmp2
+ /* There may be less than 63 bytes to go now. */
+ cmp count, #63
+ b.le L(tail63)
+2:
+ sub dst, dst, #16 /* Pre-bias. */
+ sub count, count, #64
+1:
+ stp A_l, A_l, [dst, #16]
+ stp A_l, A_l, [dst, #32]
+ stp A_l, A_l, [dst, #48]
+ stp A_l, A_l, [dst, #64]!
+ subs count, count, #64
+ b.ge 1b
+ tst count, #0x3f
+ add dst, dst, #16
+ b.ne L(tail63)
+ RET
+
+#ifndef DONT_USE_DC
+ /* For zeroing memory, check to see if we can use the ZVA feature to
+ * zero entire 'cache' lines. */
+L(zero_mem):
+ mov A_l, #0
+ cmp count, #63
+ b.le L(tail_maybe_tiny)
+ neg tmp2, dst
+ ands tmp2, tmp2, #15
+ b.eq 1f
+ sub count, count, tmp2
+ stp A_l, A_l, [dst]
+ add dst, dst, tmp2
+ cmp count, #63
+ b.le L(tail63)
+1:
+ /* For zeroing small amounts of memory, it's not worth setting up
+ * the line-clear code. */
+ cmp count, #128
+ b.lt L(not_short)
+#ifdef MAYBE_VIRT
+ /* For efficiency when virtualized, we cache the ZVA capability. */
+ adrp tmp2, L(cache_clear)
+ ldr zva_len, [tmp2, #:lo12:L(cache_clear)]
+ tbnz zva_len, #31, L(not_short)
+ cbnz zva_len, L(zero_by_line)
+ mrs tmp1, dczid_el0
+ tbz tmp1, #4, 1f
+ /* ZVA not available. Remember this for next time. */
+ mov zva_len, #~0
+ str zva_len, [tmp2, #:lo12:L(cache_clear)]
+ b L(not_short)
+1:
+ mov tmp3w, #4
+ and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
+ lsl zva_len, tmp3w, zva_len
+ str zva_len, [tmp2, #:lo12:L(cache_clear)]
+#else
+ mrs tmp1, dczid_el0
+ tbnz tmp1, #4, L(not_short)
+ mov tmp3w, #4
+ and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
+ lsl zva_len, tmp3w, zva_len
+#endif
+
+L(zero_by_line):
+ /* Compute how far we need to go to become suitably aligned. We're
+ * already at quad-word alignment. */
+ cmp count, zva_len_x
+ b.lt L(not_short) /* Not enough to reach alignment. */
+ sub zva_bits_x, zva_len_x, #1
+ neg tmp2, dst
+ ands tmp2, tmp2, zva_bits_x
+ b.eq 1f /* Already aligned. */
+ /* Not aligned, check that there's enough to copy after alignment. */
+ sub tmp1, count, tmp2
+ cmp tmp1, #64
+ ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
+ b.lt L(not_short)
+ /* We know that there's at least 64 bytes to zero and that it's safe
+ * to overrun by 64 bytes. */
+ mov count, tmp1
+2:
+ stp A_l, A_l, [dst]
+ stp A_l, A_l, [dst, #16]
+ stp A_l, A_l, [dst, #32]
+ subs tmp2, tmp2, #64
+ stp A_l, A_l, [dst, #48]
+ add dst, dst, #64
+ b.ge 2b
+ /* We've overrun a bit, so adjust dst downwards. */
+ add dst, dst, tmp2
+1:
+ sub count, count, zva_len_x
+3:
+ dc zva, dst
+ add dst, dst, zva_len_x
+ subs count, count, zva_len_x
+ b.ge 3b
+ ands count, count, zva_bits_x
+ b.ne L(tail_maybe_long)
+ RET
+#ifdef MAYBE_VIRT
+ .bss
+ .p2align 2
+L(cache_clear):
+ .space 4
+#endif
+#endif /* DONT_USE_DC */
+
+END (__memset)
+weak_alias (__memset, memset)
+libc_hidden_builtin_def (memset)
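
L(zero_mem) decides whether DC ZVA is usable by reading the dczid_el0 system register: bits 3:0 give log2 of the block size in 4-byte words (so the byte size is 4 << bits), and bit 4 set means the instruction is prohibited. A sketch of the same decode (GCC inline asm, AArch64 only; hypothetical function name):

    #include <stdint.h>

    /* Returns the DC ZVA block size in bytes, or 0 when ZVA must not
       be used -- mirroring the mrs/tbnz/and/lsl sequence above.  */
    static unsigned int zva_block_size (void)
    {
      uint64_t dczid;
      __asm__ ("mrs %0, dczid_el0" : "=r" (dczid));  /* mrs tmp1, dczid_el0 */
      if (dczid & 16)                                /* tbnz tmp1, #4, ...  */
        return 0;                                    /* ZVA prohibited.     */
      return 4u << (dczid & 15);                     /* mov #4; and; lsl    */
    }
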
diff --git a/libc/ports/sysdeps/aarch64/strcmp.S b/libc/ports/sysdeps/aarch64/strcmp.S
new file mode 100644
index 000000000..fa4705c8b
--- /dev/null
+++ b/libc/ports/sysdeps/aarch64/strcmp.S
@@ -0,0 +1,155 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+#include <sysdep.h>
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+/* Parameters and result. */
+#define src1 x0
+#define src2 x1
+#define result x0
+
+/* Internal variables. */
+#define data1 x2
+#define data1w w2
+#define data2 x3
+#define data2w w3
+#define has_nul x4
+#define diff x5
+#define syndrome x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define zeroones x10
+#define pos x11
+
+ /* Start of performance-critical section -- one 64B cache line. */
+ENTRY_ALIGN(strcmp, 6)
+
+ eor tmp1, src1, src2
+ mov zeroones, #REP8_01
+ tst tmp1, #7
+ b.ne L(misaligned8)
+ ands tmp1, src1, #7
+ b.ne L(mutual_align)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+L(loop_aligned):
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+L(start_realigned):
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */
+ orr syndrome, diff, has_nul
+ cbz syndrome, L(loop_aligned)
+ /* End of performance-critical section -- one 64B cache line. */
+
+#ifndef __AARCH64EB__
+ rev syndrome, syndrome
+ rev data1, data1
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ clz pos, syndrome
+ rev data2, data2
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ RET
+#else
+ /* For big-endian we cannot use the trick with the syndrome value
+ as carry-propagation can corrupt the upper bits if the trailing
+ bytes in the string contain 0x01. */
+ /* However, if there is no NUL byte in the dword, we can generate
+ the result directly. We can't just subtract the bytes as the
+ MSB might be significant. */
+ cbnz has_nul, 1f
+ cmp data1, data2
+ cset result, ne
+ cneg result, result, lo
+ RET
+1:
+ /* Re-compute the NUL-byte detection, using a byte-reversed value. */
+ rev tmp3, data1
+ sub tmp1, tmp3, zeroones
+ orr tmp2, tmp3, #REP8_7f
+ bic has_nul, tmp1, tmp2
+ rev has_nul, has_nul
+ orr syndrome, diff, has_nul
+ clz pos, syndrome
+ /* The MS-non-zero bit of the syndrome marks either the first bit
+ that is different, or the top bit of the first zero byte.
+ Shifting left now will bring the critical information into the
+ top bits. */
+ lsl data1, data1, pos
+ lsl data2, data2, pos
+ /* But we need to zero-extend (char is unsigned) the value and then
+ perform a signed 32-bit subtraction. */
+ lsr data1, data1, #56
+ sub result, data1, data2, lsr #56
+ RET
+#endif
+
+L(mutual_align):
+ /* Sources are mutually aligned, but are not currently at an
+ alignment boundary. Round down the addresses and then mask off
+ the bytes that precede the start point. */
+ bic src1, src1, #7
+ bic src2, src2, #7
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ ldr data1, [src1], #8
+ neg tmp1, tmp1 /* Bits to alignment -64. */
+ ldr data2, [src2], #8
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2, data2, tmp2
+ b L(start_realigned)
+
+L(misaligned8):
+ /* We can do better than this. */
+ ldrb data1w, [src1], #1
+ ldrb data2w, [src2], #1
+ cmp data1w, #1
+ ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
+ b.eq L(misaligned8)
+ sub result, data1, data2
+ RET
+END(strcmp)
+libc_hidden_builtin_def (strcmp)
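
The zero-byte test named in the comment above — (X - 0x01…01) & ~(X | 0x7f…7f) is non-zero iff some byte of X is zero — is exact for presence, though the position of the lowest marker is only trustworthy scanning from the least-significant end, which is why the big-endian path recomputes it on byte-reversed data. A small C check:

    #include <assert.h>
    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Subtracting 1 borrows through a 0x00 byte, setting its top bit;
       ~(x | 0x7f..7f) keeps that top bit only for bytes below 0x80.  */
    static uint64_t has_zero_byte (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }

    int main (void)
    {
      assert (has_zero_byte (0x6867666564636261ULL) == 0);  /* "abcdefgh"    */
      assert (has_zero_byte (0x6800666564636261ULL) != 0);  /* NUL in byte 6 */
      assert (has_zero_byte (0x0101010101010101ULL) == 0);  /* all 0x01      */
      return 0;
    }
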
diff --git a/libc/ports/sysdeps/aarch64/strlen.S b/libc/ports/sysdeps/aarch64/strlen.S
new file mode 100644
index 000000000..ba05009c6
--- /dev/null
+++ b/libc/ports/sysdeps/aarch64/strlen.S
@@ -0,0 +1,117 @@
+/* Copyright (C) 2012-2013 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define len x0
+
+/* Locals and temporaries. */
+#define src x1
+#define data1 x2
+#define data2 x3
+#define data2a x4
+#define has_nul1 x5
+#define has_nul2 x6
+#define tmp1 x7
+#define tmp2 x8
+#define tmp3 x9
+#define tmp4 x10
+#define zeroones x11
+#define pos x12
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ /* Start of critical section -- keep to one 64Byte cache line. */
+ENTRY_ALIGN (strlen, 6)
+ mov zeroones, #REP8_01
+ bic src, srcin, #15
+ ands tmp1, srcin, #15
+ b.ne L(misaligned)
+ /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+ (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+ can be done in parallel across the entire word. */
+ /* The inner loop deals with two Dwords at a time. This has a
+ slightly higher start-up cost, but we should win quite quickly,
+ especially on cores with a high number of issue slots per
+ cycle, as we get much better parallelism out of the operations. */
+L(loop):
+ ldp data1, data2, [src], #16
+L(realigned):
+ sub tmp1, data1, zeroones
+ orr tmp2, data1, #REP8_7f
+ sub tmp3, data2, zeroones
+ orr tmp4, data2, #REP8_7f
+ bic has_nul1, tmp1, tmp2
+ bics has_nul2, tmp3, tmp4
+ ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
+ b.eq L(loop)
+ /* End of critical section -- keep to one 64Byte cache line. */
+
+ sub len, src, srcin
+ cbz has_nul1, L(nul_in_data2)
+#ifdef __AARCH64EB__
+ mov data2, data1
+#endif
+ sub len, len, #8
+ mov has_nul2, has_nul1
+L(nul_in_data2):
+#ifdef __AARCH64EB__
+ /* For big-endian, carry propagation (if the final byte in the
+ string is 0x01) means we cannot use has_nul directly. The
+ easiest way to get the correct byte is to byte-swap the data
+ and calculate the syndrome a second time. */
+ rev data2, data2
+ sub tmp1, data2, zeroones
+ orr tmp2, data2, #REP8_7f
+ bic has_nul2, tmp1, tmp2
+#endif
+ sub len, len, #8
+ rev has_nul2, has_nul2
+ clz pos, has_nul2
+ add len, len, pos, lsr #3 /* Bits to bytes. */
+ RET
+
+L(misaligned):
+ cmp tmp1, #8
+ neg tmp1, tmp1
+ ldp data1, data2, [src], #16
+ lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
+ mov tmp2, #~0
+#ifdef __AARCH64EB__
+ /* Big-endian. Early bytes are at MSB. */
+ lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#else
+ /* Little-endian. Early bytes are at LSB. */
+ lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
+#endif
+ orr data1, data1, tmp2
+ orr data2a, data2, tmp2
+ csinv data1, data1, xzr, le
+ csel data2, data2, data2a, le
+ b L(realigned)
+END (strlen)
+libc_hidden_builtin_def (strlen)
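
Once the terminating word is found, the length is recovered from the has_nul syndrome: rev plus clz gives the bit position of the first NUL marker, and lsr #3 turns bits into bytes. A little-endian C model of the same computation (hypothetical name, GCC builtins; counting trailing zeros gives the identical byte index, since each syndrome bit sits at the top of its byte and (8k + 7) >> 3 == k):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Index of the first NUL byte in data; data must contain one.  */
    static unsigned int first_nul_index (uint64_t data)
    {
      uint64_t has_nul = (data - REP8_01) & ~(data | REP8_7f);
      return __builtin_ctzll (has_nul) >> 3;       /* rev; clz; lsr #3 */
    }
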
diff --git a/libc/ports/sysdeps/aarch64/sysdep.h b/libc/ports/sysdeps/aarch64/sysdep.h
index d9469b8e6..6b75ada14 100644
--- a/libc/ports/sysdeps/aarch64/sysdep.h
+++ b/libc/ports/sysdeps/aarch64/sysdep.h
@@ -25,24 +25,33 @@
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name

/* Define an entry point visible from C. */
-#define ENTRY(name) \
- .globl C_SYMBOL_NAME(name); \
- .type C_SYMBOL_NAME(name),%function; \
- .align 4; \
- C_LABEL(name) \
- cfi_startproc; \
+#define ENTRY(name) \
+ .globl C_SYMBOL_NAME(name); \
+ .type C_SYMBOL_NAME(name),%function; \
+ .align 4; \
+ C_LABEL(name) \
+ cfi_startproc; \
+ CALL_MCOUNT
+
+/* Define an entry point visible from C. */
+#define ENTRY_ALIGN(name, align) \
+ .globl C_SYMBOL_NAME(name); \
+ .type C_SYMBOL_NAME(name),%function; \
+ .p2align align; \
+ C_LABEL(name) \
+ cfi_startproc; \
CALL_MCOUNT

#undef END
-#define END(name) \
- cfi_endproc; \
+#define END(name) \
+ cfi_endproc; \
ASM_SIZE_DIRECTIVE(name)

/* If compiled for profiling, call `mcount' at the start of each function. */
#ifdef PROF
-# define CALL_MCOUNT \
- str x30, [sp, #-16]!; \
- bl mcount; \
+# define CALL_MCOUNT \
+ str x30, [sp, #-16]!; \
+ bl mcount; \
ldr x30, [sp], #16 ;
#else
# define CALL_MCOUNT /* Do nothing. */
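
The new ENTRY_ALIGN differs from ENTRY only in taking the alignment as an explicit power of two: ENTRY_ALIGN (memcmp, 6) emits .p2align 6, a 64-byte boundary, which is what lets the string routines above keep their hot loops inside a single cache line. In a non-profiled build (CALL_MCOUNT empty), the expansion is approximately:

    .globl memcmp
    .type memcmp, %function
    .p2align 6
    memcmp:            /* C_LABEL(name) */
    cfi_startproc
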
diff --git a/libc/ports/sysdeps/arm/Makefile b/libc/ports/sysdeps/arm/Makefile
index 355f5b309..7c19398ff 100644
--- a/libc/ports/sysdeps/arm/Makefile
+++ b/libc/ports/sysdeps/arm/Makefile
@@ -39,11 +39,6 @@ endif
ifeq ($(subdir),debug)
CFLAGS-backtrace.c += -funwind-tables
-CFLAGS-tst-backtrace2.c += -funwind-tables
-CFLAGS-tst-backtrace3.c += -funwind-tables
-CFLAGS-tst-backtrace4.c += -funwind-tables
-CFLAGS-tst-backtrace5.c += -funwind-tables
-CFLAGS-tst-backtrace6.c += -funwind-tables
endif

ifeq ($(subdir),math)
diff --git a/libc/ports/sysdeps/hppa/fpu/fpu_control.h b/libc/ports/sysdeps/hppa/fpu/fpu_control.h
index 7aa16c9dd..5cac3344d 100644
--- a/libc/ports/sysdeps/hppa/fpu/fpu_control.h
+++ b/libc/ports/sysdeps/hppa/fpu/fpu_control.h
@@ -44,7 +44,7 @@ typedef unsigned int fpu_control_t;
/* Macros for accessing the hardware control word. */
#define _FPU_GETCW(cw) \
({ \
- union { unsigned long long __fpreg; unsigned int __halfreg[2]; } __fullfp; \
+ union { __extension__ unsigned long long __fpreg; unsigned int __halfreg[2]; } __fullfp; \
/* Get the current status word. */ \
__asm__ ("fstd %%fr0,0(%1)\n\t" \
"fldd 0(%1),%%fr0\n\t" \
@@ -54,7 +54,7 @@ typedef unsigned int fpu_control_t;
#define _FPU_SETCW(cw) \
({ \
- union { unsigned long long __fpreg; unsigned int __halfreg[2]; } __fullfp; \
+ union { __extension__ unsigned long long __fpreg; unsigned int __halfreg[2]; } __fullfp; \
__fullfp.__halfreg[0] = cw; \
__asm__ ("fldd 0(%1),%%fr0\n\t" \
: : "m" (__fullfp.__fpreg), "r" (__fullfp.__fpreg) : "%r0" ); \
diff --git a/libc/ports/sysdeps/m68k/bits/byteswap.h b/libc/ports/sysdeps/m68k/bits/byteswap.h
index 4e4dd2341..9f0a7b707 100644
--- a/libc/ports/sysdeps/m68k/bits/byteswap.h
+++ b/libc/ports/sysdeps/m68k/bits/byteswap.h
@@ -74,6 +74,7 @@ __bswap_32 (unsigned int __bsx)
| (((x) & 0x00000000000000ffull) << 56))

/* Swap bytes in 64 bit value. */
+__extension__
static __inline unsigned long long
__bswap_64 (unsigned long long __bsx)
{
diff --git a/libc/ports/sysdeps/mips/Makefile b/libc/ports/sysdeps/mips/Makefile
index d87b2c406..a1526998e 100644
--- a/libc/ports/sysdeps/mips/Makefile
+++ b/libc/ports/sysdeps/mips/Makefile
@@ -13,11 +13,6 @@ endif
ifeq ($(subdir),debug)
CFLAGS-backtrace.c += -funwind-tables
-CFLAGS-tst-backtrace2.c += -funwind-tables
-CFLAGS-tst-backtrace3.c += -funwind-tables
-CFLAGS-tst-backtrace4.c += -funwind-tables
-CFLAGS-tst-backtrace5.c += -funwind-tables
-CFLAGS-tst-backtrace6.c += -funwind-tables
endif

ifeq ($(subdir),csu)
diff --git a/libc/ports/sysdeps/unix/sysv/linux/hppa/bits/ipc.h b/libc/ports/sysdeps/unix/sysv/linux/hppa/bits/ipc.h
index bc07c1fa3..04a75e2e8 100644
--- a/libc/ports/sysdeps/unix/sysv/linux/hppa/bits/ipc.h
+++ b/libc/ports/sysdeps/unix/sysv/linux/hppa/bits/ipc.h
@@ -57,6 +57,6 @@ struct ipc_perm
#endif
unsigned short int __seq; /* Sequence number. */
unsigned int __pad3;
- unsigned long long int __unused1;
- unsigned long long int __unused2;
+ __extension__ unsigned long long int __unused1;
+ __extension__ unsigned long long int __unused2;
};
diff --git a/libc/ports/sysdeps/unix/sysv/linux/mips/bits/sigcontext.h b/libc/ports/sysdeps/unix/sysv/linux/mips/bits/sigcontext.h
index 085a00d04..f3c5180b8 100644
--- a/libc/ports/sysdeps/unix/sysv/linux/mips/bits/sigcontext.h
+++ b/libc/ports/sysdeps/unix/sysv/linux/mips/bits/sigcontext.h
@@ -39,16 +39,16 @@
struct sigcontext {
unsigned int sc_regmask;
unsigned int sc_status;
- unsigned long long sc_pc;
- unsigned long long sc_regs[32];
- unsigned long long sc_fpregs[32];
+ __extension__ unsigned long long sc_pc;
+ __extension__ unsigned long long sc_regs[32];
+ __extension__ unsigned long long sc_fpregs[32];
unsigned int sc_ownedfp;
unsigned int sc_fpc_csr;
unsigned int sc_fpc_eir;
unsigned int sc_used_math;
unsigned int sc_dsp;
- unsigned long long sc_mdhi;
- unsigned long long sc_mdlo;
+ __extension__ unsigned long long sc_mdhi;
+ __extension__ unsigned long long sc_mdlo;
unsigned long sc_hi1;
unsigned long sc_lo1;
unsigned long sc_hi2;
@@ -61,17 +61,17 @@ struct sigcontext {
/* This structure changed in 2.6.12-rc4 when DSP support was added. */
struct sigcontext {
- unsigned long long sc_regs[32];
- unsigned long long sc_fpregs[32];
- unsigned long long sc_mdhi;
- unsigned long long sc_hi1;
- unsigned long long sc_hi2;
- unsigned long long sc_hi3;
- unsigned long long sc_mdlo;
- unsigned long long sc_lo1;
- unsigned long long sc_lo2;
- unsigned long long sc_lo3;
- unsigned long long sc_pc;
+ __extension__ unsigned long long sc_regs[32];
+ __extension__ unsigned long long sc_fpregs[32];
+ __extension__ unsigned long long sc_mdhi;
+ __extension__ unsigned long long sc_hi1;
+ __extension__ unsigned long long sc_hi2;
+ __extension__ unsigned long long sc_hi3;
+ __extension__ unsigned long long sc_mdlo;
+ __extension__ unsigned long long sc_lo1;
+ __extension__ unsigned long long sc_lo2;
+ __extension__ unsigned long long sc_lo3;
+ __extension__ unsigned long long sc_pc;
unsigned int sc_fpc_csr;
unsigned int sc_used_math;
unsigned int sc_dsp;
diff --git a/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/configure b/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/configure
new file mode 100644
index 000000000..bfa30f6dc
--- /dev/null
+++ b/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/configure
@@ -0,0 +1,4 @@
+# This file is generated from configure.in by Autoconf. DO NOT EDIT!
+ # Local configure fragment for sysdeps/unix/sysv/linux/tile/tilegx
+
+ldd_rewrite_script=$dir/ldd-rewrite.sed
diff --git a/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/configure.in b/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/configure.in
new file mode 100644
index 000000000..87d86bd4c
--- /dev/null
+++ b/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/configure.in
@@ -0,0 +1,4 @@
+GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory.
+# Local configure fragment for sysdeps/unix/sysv/linux/tile/tilegx
+
+ldd_rewrite_script=$dir/ldd-rewrite.sed
diff --git a/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/ldd-rewrite.sed b/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/ldd-rewrite.sed
new file mode 100644
index 000000000..8b0bb691c
--- /dev/null
+++ b/libc/ports/sysdeps/unix/sysv/linux/tile/tilegx/ldd-rewrite.sed
@@ -0,0 +1 @@
+s_^\(RTLDLIST=\)\(.*lib\)\(\|32\)\(/[^/]*\.so\.[0-9.]*\)[ ]*$_\1"\2\4 \232\4"_
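
The rewrite turns the single RTLDLIST entry in the installed ldd script into a quoted pair naming both dynamic linkers, duplicating the path with "32" spliced into the library directory (\2 then the literal "32" in the \232 of the replacement). Assuming a hypothetical input line of RTLDLIST=/lib/ld.so.1, the script produces:

    RTLDLIST="/lib/ld.so.1 /lib32/ld.so.1"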