Diffstat (limited to 'sysdeps/x86_64/multiarch')
-rw-r--r--  sysdeps/x86_64/multiarch/Makefile                 |  23
-rw-r--r--  sysdeps/x86_64/multiarch/Versions                 |  13
-rw-r--r--  sysdeps/x86_64/multiarch/bcopy.S                  |   5
-rw-r--r--  sysdeps/x86_64/multiarch/ifunc-impl-list.c        |   6
-rw-r--r--  sysdeps/x86_64/multiarch/memcmp-sse4.S            | 339
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-c.c               |  80
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3-back-1.S    |   5
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy-ssse3-back.S      |  26
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy.S                 |  35
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy_chk-c.c           |   1
-rw-r--r--  sysdeps/x86_64/multiarch/memcpy_chk.S             |  14
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-c.c              | 118
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-ssse3-back-1.S   |   6
-rw-r--r--  sysdeps/x86_64/multiarch/memmove-ssse3-back.S     |   8
-rw-r--r--  sysdeps/x86_64/multiarch/memmove.c                |  34
-rw-r--r--  sysdeps/x86_64/multiarch/memmove_chk-c.c          |   1
-rw-r--r--  sysdeps/x86_64/multiarch/memmove_chk.c            |  10
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy-c.c              |  36
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy-ssse3-back-1.S   |   6
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S     |  12
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy.S                |  37
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy_chk-c.c          |   1
-rw-r--r--  sysdeps/x86_64/multiarch/mempcpy_chk.S            |  14
-rw-r--r--  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S  |  76
-rw-r--r--  sysdeps/x86_64/multiarch/strchr.S                 |  22
-rw-r--r--  sysdeps/x86_64/multiarch/strcmp-sse42.S           | 238
-rw-r--r--  sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S  | 174
-rw-r--r--  sysdeps/x86_64/multiarch/strrchr.S                |  16
-rw-r--r--  sysdeps/x86_64/multiarch/wcscpy-ssse3.S           | 171
29 files changed, 1456 insertions(+), 71 deletions(-)
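
The pattern repeated throughout this patch: when glibc is built for Intel MPX (__CHKP__), pointer arguments arrive with their bounds in the %bnd0/%bnd1 registers, and each block of loads is preceded by bndcl (lower-bound check) and bndcu (upper-bound check) instructions that raise #BR on a violation. A minimal C model of what one such check pair does; the struct and function below are illustrative, not part of the patch:

#include <stdint.h>
#include <stdlib.h>

/* Illustrative model of an MPX bounds register: [lb, ub], inclusive.  */
struct bnd { uintptr_t lb, ub; };

/* What a bndcl/bndcu pair verifies before an access at address P.  */
static void
check (const void *p, struct bnd b)
{
  uintptr_t a = (uintptr_t) p;
  if (a < b.lb)     /* bndcl would raise #BR here */
    abort ();
  if (a > b.ub)     /* bndcu would raise #BR here */
    abort ();
}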
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 203d16eed3..bdf7964d14 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -26,6 +26,29 @@ CFLAGS-strstr.c += -msse4
CFLAGS-strcasestr.c += -msse4
CFLAGS-strcasestr-nonascii.c += -msse4
endif
+
+ifeq ($(enable-mpx), yes)
+sysdep_routines += memcpy-ssse3-back-1 mempcpy-ssse3-back-1 memmove-ssse3-back-1 \
+ memcpy-c memmove-c mempcpy-c memcpy_chk-c mempcpy_chk-c memmove_chk-c
+# These are C versions written with intrinsics; bounds checks must be added manually as intrinsics.
+CFLAGS-varshift.c += -fno-chkp-check-read -fno-chkp-check-write
+CFLAGS-strcspn-c.c += -fno-chkp-check-read -fno-chkp-check-write
+CFLAGS-strpbrk-c.c += -fno-chkp-check-read -fno-chkp-check-write
+CFLAGS-strspn-c.c += -fno-chkp-check-read -fno-chkp-check-write
+CFLAGS-strstr.c += -fno-chkp-check-read -fno-chkp-check-write
+CFLAGS-strcasestr.c += -fno-chkp-check-read -fno-chkp-check-write
+CFLAGS-strcasestr-nonascii.c += -fno-chkp-check-read -fno-chkp-check-write
+# Checks are inserted manually in these routines.
+CFLAGS-memcpy-c.c += -fno-chkp-check-read -fno-chkp-check-write
+CFLAGS-mempcpy-c.c += -fno-chkp-check-read -fno-chkp-check-write
+CFLAGS-memmove-c.c += -fno-chkp-check-read -fno-chkp-check-write
+endif
+
+ifeq ($(enable-mpx-write-only), yes)
+CFLAGS-memcpy-c.c += -D__CHKWR__
+CFLAGS-memmove-c.c += -D__CHKWR__
+endif
+
endif
ifeq ($(subdir),wcsmbs)
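
The -fno-chkp-check-read/-fno-chkp-check-write flags suppress the MPX checks the compiler would otherwise instrument these files with; the checks are instead written out explicitly with the checker intrinsics, as memcpy-c.c below does. A minimal sketch of that manual-check style, assuming the __bnd_chk_ptr_lbounds/__bnd_chk_ptr_ubounds intrinsics of an MPX-enabled compiler and a hypothetical fill() helper:

#include <stddef.h>

/* Validate both ends of the accessed range once, then run an
   unchecked loop; the compiler emits no per-access checks because
   the file is built with -fno-chkp-check-read/-write.  */
static void
fill (char *buf, size_t n)
{
  if (n == 0)
    return;
  __bnd_chk_ptr_lbounds (buf);
  __bnd_chk_ptr_ubounds (buf + n - 1);
  while (n--)
    *buf++ = 0;
}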
diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86_64/multiarch/Versions
index 59b185ac8d..5325bdece6 100644
--- a/sysdeps/x86_64/multiarch/Versions
+++ b/sysdeps/x86_64/multiarch/Versions
@@ -2,4 +2,17 @@ libc {
GLIBC_PRIVATE {
__get_cpu_features;
}
+%ifdef __CHKP__
+ GLIBC_2.17 {
+ chkp_memcpy_nobnd;
+ chkp_memmove_nobnd;
+ chkp_mempcpy_nobnd;
+ chkp_memcpy_nobnd_nochk;
+ chkp_memmove_nobnd_nochk;
+ chkp_mempcpy_nobnd_nochk;
+ chkp_memcpy_nochk;
+ chkp_memmove_nochk;
+ chkp_mempcpy_nochk;
+ }
+%endif
}
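
The nine chkp_* exports follow one naming scheme: _nobnd variants (built from the SSSE3 assembly below) do not propagate pointer bounds stored in the copied data, _nochk variants (the C implementations) skip the entry bounds checks, and _nobnd_nochk variants do neither. A sketch of the resulting declarations, following memcpy's prototype:

#include <stddef.h>

/* Checked copy that does not propagate stored pointer bounds.  */
extern void *chkp_memcpy_nobnd (void *dst, const void *src, size_t n);
/* Bounds-propagating copy with the entry checks skipped.  */
extern void *chkp_memcpy_nochk (void *dst, const void *src, size_t n);
/* Neither entry checks nor bounds propagation.  */
extern void *chkp_memcpy_nobnd_nochk (void *dst, const void *src, size_t n);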
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
index 639f02bde3..9809d471ba 100644
--- a/sysdeps/x86_64/multiarch/bcopy.S
+++ b/sysdeps/x86_64/multiarch/bcopy.S
@@ -3,5 +3,10 @@
.text
ENTRY(bcopy)
xchg %rdi, %rsi
+#ifdef __CHKP__
+ bndmov %bnd0, %bnd2
+ bndmov %bnd1, %bnd0
+ bndmov %bnd2, %bnd1
+#endif
jmp __libc_memmove /* Branch to IFUNC memmove. */
END(bcopy)
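
bcopy takes (src, dst, n) while memmove takes (dst, src, n), so the xchg swaps the pointer registers; under __CHKP__ the three bndmov instructions perform the matching swap of the bounds registers through %bnd2 as scratch, keeping each pointer paired with its own bounds. In C terms, using the illustrative struct bnd from the note above:

/* Three-move swap mirroring the bndmov sequence in bcopy.  */
static void
swap_bounds (struct bnd *b0, struct bnd *b1)
{
  struct bnd tmp = *b0;   /* bndmov %bnd0, %bnd2 */
  *b0 = *b1;              /* bndmov %bnd1, %bnd0 */
  *b1 = tmp;              /* bndmov %bnd2, %bnd1 */
}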
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d0992e113f..e3a4163c5b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -44,6 +44,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
+#ifndef __CHKP__
+  /* We use a specific version when glibc is built with MPX.  */
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
IFUNC_IMPL (i, name, __memmove_chk,
IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
@@ -60,6 +62,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
+#endif
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
IFUNC_IMPL (i, name, stpncpy,
@@ -207,6 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
#ifdef SHARED
+#ifndef __CHKP__
+  /* We use specific versions of memcpy, memcpy_chk and mempcpy when Intel MPX is enabled.  */
/* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */
IFUNC_IMPL (i, name, __memcpy_chk,
IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
@@ -240,6 +245,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
+#endif
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
IFUNC_IMPL (i, name, strncmp,
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 1ed4200f4c..b5c6675d31 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -48,6 +48,13 @@ ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
shl $2, %rdx
# endif
+# ifdef __CHKP__
+ testq %rdx, %rdx
+ jz L(NoEntryCheck)
+ bndcl (%rdi), %bnd0
+ bndcl (%rsi), %bnd1
+L(NoEntryCheck):
+# endif
pxor %xmm0, %xmm0
cmp $79, %rdx
ja L(79bytesormore)
@@ -70,6 +77,10 @@ L(firstbyte):
ALIGN (4)
L(79bytesormore):
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rsi), %xmm1
movdqu (%rdi), %xmm2
pxor %xmm1, %xmm2
@@ -90,21 +101,37 @@ L(79bytesormore):
L(less128bytes):
sub $64, %rdx
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqu 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqu 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -112,11 +139,19 @@ L(less128bytes):
cmp $32, %rdx
jb L(less32bytesin64)
+# ifdef __CHKP__
+ bndcu 64(%rdi), %bnd0
+ bndcu 64(%rsi), %bnd1
+# endif
movdqu 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
+# ifdef __CHKP__
+ bndcu 80(%rdi), %bnd0
+ bndcu 80(%rsi), %bnd1
+# endif
movdqu 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -139,41 +174,73 @@ L(128bytesormore):
L(less256bytes):
sub $128, %rdx
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqu 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqu 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(64bytesin256)
+# ifdef __CHKP__
+ bndcu 64(%rdi), %bnd0
+ bndcu 64(%rsi), %bnd1
+# endif
movdqu 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
+# ifdef __CHKP__
+ bndcu 80(%rdi), %bnd0
+ bndcu 80(%rsi), %bnd1
+# endif
movdqu 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(96bytesin256)
+# ifdef __CHKP__
+ bndcu 96(%rdi), %bnd0
+ bndcu 96(%rsi), %bnd1
+# endif
movdqu 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(112bytesin256)
+# ifdef __CHKP__
+ bndcu 112(%rdi), %bnd0
+ bndcu 112(%rsi), %bnd1
+# endif
movdqu 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -188,11 +255,19 @@ L(less256bytes):
cmp $32, %rdx
jb L(less32bytesin128)
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -207,81 +282,145 @@ L(less32bytesin128):
L(less512bytes):
sub $256, %rdx
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqu 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqu 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(64bytesin256)
+# ifdef __CHKP__
+ bndcu 64(%rdi), %bnd0
+ bndcu 64(%rsi), %bnd1
+# endif
movdqu 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
+# ifdef __CHKP__
+ bndcu 80(%rdi), %bnd0
+ bndcu 80(%rsi), %bnd1
+# endif
movdqu 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(96bytesin256)
+# ifdef __CHKP__
+ bndcu 96(%rdi), %bnd0
+ bndcu 96(%rsi), %bnd1
+# endif
movdqu 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(112bytesin256)
+# ifdef __CHKP__
+ bndcu 112(%rdi), %bnd0
+ bndcu 112(%rsi), %bnd1
+# endif
movdqu 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(128bytesin256)
+# ifdef __CHKP__
+ bndcu 128(%rdi), %bnd0
+ bndcu 128(%rsi), %bnd1
+# endif
movdqu 128(%rdi), %xmm2
pxor 128(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(144bytesin256)
+# ifdef __CHKP__
+ bndcu 144(%rdi), %bnd0
+ bndcu 144(%rsi), %bnd1
+# endif
movdqu 144(%rdi), %xmm2
pxor 144(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(160bytesin256)
+# ifdef __CHKP__
+ bndcu 160(%rdi), %bnd0
+ bndcu 160(%rsi), %bnd1
+# endif
movdqu 160(%rdi), %xmm2
pxor 160(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(176bytesin256)
+# ifdef __CHKP__
+ bndcu 176(%rdi), %bnd0
+ bndcu 176(%rsi), %bnd1
+# endif
movdqu 176(%rdi), %xmm2
pxor 176(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(192bytesin256)
+# ifdef __CHKP__
+ bndcu 192(%rdi), %bnd0
+ bndcu 192(%rsi), %bnd1
+# endif
movdqu 192(%rdi), %xmm2
pxor 192(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(208bytesin256)
+# ifdef __CHKP__
+ bndcu 208(%rdi), %bnd0
+ bndcu 208(%rsi), %bnd1
+# endif
movdqu 208(%rdi), %xmm2
pxor 208(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(224bytesin256)
+# ifdef __CHKP__
+ bndcu 224(%rdi), %bnd0
+ bndcu 224(%rsi), %bnd1
+# endif
movdqu 224(%rdi), %xmm2
pxor 224(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(240bytesin256)
+# ifdef __CHKP__
+ bndcu 240(%rdi), %bnd0
+ bndcu 240(%rsi), %bnd1
+# endif
movdqu 240(%rdi), %xmm2
pxor 240(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -299,11 +438,19 @@ L(less512bytes):
cmp $32, %rdx
jb L(less32bytesin256)
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -331,18 +478,34 @@ L(512bytesormore):
sub $64, %rdx
ALIGN (4)
L(64bytesormore_loop):
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
movdqa %xmm2, %xmm1
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqu 16(%rdi), %xmm3
pxor 16(%rsi), %xmm3
por %xmm3, %xmm1
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqu 32(%rdi), %xmm4
pxor 32(%rsi), %xmm4
por %xmm4, %xmm1
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqu 48(%rdi), %xmm5
pxor 48(%rsi), %xmm5
por %xmm5, %xmm1
@@ -365,18 +528,34 @@ L(L2_L3_cache_unaglined):
L(L2_L3_unaligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi)
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
movdqa %xmm2, %xmm1
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqu 16(%rdi), %xmm3
pxor 16(%rsi), %xmm3
por %xmm3, %xmm1
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqu 32(%rdi), %xmm4
pxor 32(%rsi), %xmm4
por %xmm4, %xmm1
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqu 48(%rdi), %xmm5
pxor 48(%rsi), %xmm5
por %xmm5, %xmm1
@@ -403,21 +582,37 @@ L(2aligned):
L(less128bytesin2aligned):
sub $64, %rdx
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -425,11 +620,19 @@ L(less128bytesin2aligned):
cmp $32, %rdx
jb L(less32bytesin64in2alinged)
+# ifdef __CHKP__
+ bndcu 64(%rdi), %bnd0
+ bndcu 64(%rsi), %bnd1
+# endif
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
+# ifdef __CHKP__
+ bndcu 80(%rdi), %bnd0
+ bndcu 80(%rsi), %bnd1
+# endif
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -453,41 +656,73 @@ L(128bytesormorein2aligned):
L(less256bytesin2alinged):
sub $128, %rdx
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(64bytesin256)
+# ifdef __CHKP__
+ bndcu 64(%rdi), %bnd0
+ bndcu 64(%rsi), %bnd1
+# endif
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
+# ifdef __CHKP__
+ bndcu 80(%rdi), %bnd0
+ bndcu 80(%rsi), %bnd1
+# endif
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(96bytesin256)
+# ifdef __CHKP__
+ bndcu 96(%rdi), %bnd0
+ bndcu 96(%rsi), %bnd1
+# endif
movdqa 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(112bytesin256)
+# ifdef __CHKP__
+ bndcu 112(%rdi), %bnd0
+ bndcu 112(%rsi), %bnd1
+# endif
movdqa 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -502,11 +737,19 @@ L(less256bytesin2alinged):
cmp $32, %rdx
jb L(less32bytesin128in2aligned)
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqu (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqu 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -523,81 +766,145 @@ L(less32bytesin128in2aligned):
L(256bytesormorein2aligned):
sub $256, %rdx
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(16bytesin256)
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqa 16(%rdi), %xmm2
pxor 16(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(32bytesin256)
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqa 32(%rdi), %xmm2
pxor 32(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(48bytesin256)
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqa 48(%rdi), %xmm2
pxor 48(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(64bytesin256)
+# ifdef __CHKP__
+ bndcu 64(%rdi), %bnd0
+ bndcu 64(%rsi), %bnd1
+# endif
movdqa 64(%rdi), %xmm2
pxor 64(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(80bytesin256)
+# ifdef __CHKP__
+ bndcu 80(%rdi), %bnd0
+ bndcu 80(%rsi), %bnd1
+# endif
movdqa 80(%rdi), %xmm2
pxor 80(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(96bytesin256)
+# ifdef __CHKP__
+ bndcu 96(%rdi), %bnd0
+ bndcu 96(%rsi), %bnd1
+# endif
movdqa 96(%rdi), %xmm2
pxor 96(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(112bytesin256)
+# ifdef __CHKP__
+ bndcu 112(%rdi), %bnd0
+ bndcu 112(%rsi), %bnd1
+# endif
movdqa 112(%rdi), %xmm2
pxor 112(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(128bytesin256)
+# ifdef __CHKP__
+ bndcu 128(%rdi), %bnd0
+ bndcu 128(%rsi), %bnd1
+# endif
movdqa 128(%rdi), %xmm2
pxor 128(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(144bytesin256)
+# ifdef __CHKP__
+ bndcu 144(%rdi), %bnd0
+ bndcu 144(%rsi), %bnd1
+# endif
movdqa 144(%rdi), %xmm2
pxor 144(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(160bytesin256)
+# ifdef __CHKP__
+ bndcu 160(%rdi), %bnd0
+ bndcu 160(%rsi), %bnd1
+# endif
movdqa 160(%rdi), %xmm2
pxor 160(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(176bytesin256)
+# ifdef __CHKP__
+ bndcu 176(%rdi), %bnd0
+ bndcu 176(%rsi), %bnd1
+# endif
movdqa 176(%rdi), %xmm2
pxor 176(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(192bytesin256)
+# ifdef __CHKP__
+ bndcu 192(%rdi), %bnd0
+ bndcu 192(%rsi), %bnd1
+# endif
movdqa 192(%rdi), %xmm2
pxor 192(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(208bytesin256)
+# ifdef __CHKP__
+ bndcu 208(%rdi), %bnd0
+ bndcu 208(%rsi), %bnd1
+# endif
movdqa 208(%rdi), %xmm2
pxor 208(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(224bytesin256)
+# ifdef __CHKP__
+ bndcu 224(%rdi), %bnd0
+ bndcu 224(%rsi), %bnd1
+# endif
movdqa 224(%rdi), %xmm2
pxor 224(%rsi), %xmm2
ptest %xmm2, %xmm0
jnc L(240bytesin256)
+# ifdef __CHKP__
+ bndcu 240(%rdi), %bnd0
+ bndcu 240(%rsi), %bnd1
+# endif
movdqa 240(%rdi), %xmm2
pxor 240(%rsi), %xmm2
ptest %xmm2, %xmm0
@@ -648,18 +955,34 @@ L(512bytesormorein2aligned):
sub $64, %rdx
ALIGN (4)
L(64bytesormore_loopin2aligned):
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
movdqa %xmm2, %xmm1
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqa 16(%rdi), %xmm3
pxor 16(%rsi), %xmm3
por %xmm3, %xmm1
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqa 32(%rdi), %xmm4
pxor 32(%rsi), %xmm4
por %xmm4, %xmm1
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqa 48(%rdi), %xmm5
pxor 48(%rsi), %xmm5
por %xmm5, %xmm1
@@ -682,18 +1005,34 @@ L(L2_L3_cache_aglined):
L(L2_L3_aligned_128bytes_loop):
prefetchnta 0x1c0(%rdi)
prefetchnta 0x1c0(%rsi)
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+# endif
movdqa (%rdi), %xmm2
pxor (%rsi), %xmm2
movdqa %xmm2, %xmm1
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+ bndcu 16(%rsi), %bnd1
+# endif
movdqa 16(%rdi), %xmm3
pxor 16(%rsi), %xmm3
por %xmm3, %xmm1
+# ifdef __CHKP__
+ bndcu 32(%rdi), %bnd0
+ bndcu 32(%rsi), %bnd1
+# endif
movdqa 32(%rdi), %xmm4
pxor 32(%rsi), %xmm4
por %xmm4, %xmm1
+# ifdef __CHKP__
+ bndcu 48(%rdi), %bnd0
+ bndcu 48(%rsi), %bnd1
+# endif
movdqa 48(%rdi), %xmm5
pxor 48(%rsi), %xmm5
por %xmm5, %xmm1
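
memcmp may legitimately stop before reading all n bytes (at the first difference), so the full [p, p+n) range cannot be validated up front without turning a conforming call into a spurious #BR; instead the patch checks each 16-byte block just before loading it, and a zero-length call bypasses the entry check via L(NoEntryCheck). A byte-at-a-time C model of this incremental-check discipline, again assuming the MPX checker intrinsics:

#include <stddef.h>

/* Check each address immediately before it is read, never past the
   first difference.  */
static int
cmp_checked (const unsigned char *a, const unsigned char *b, size_t n)
{
  for (size_t i = 0; i < n; i++)
    {
      __bnd_chk_ptr_ubounds (a + i);   /* bndcu i(%rdi), %bnd0 */
      __bnd_chk_ptr_ubounds (b + i);   /* bndcu i(%rsi), %bnd1 */
      if (a[i] != b[i])
        return a[i] - b[i];
    }
  return 0;
}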
diff --git a/sysdeps/x86_64/multiarch/memcpy-c.c b/sysdeps/x86_64/multiarch/memcpy-c.c
new file mode 100644
index 0000000000..6fa50eada1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-c.c
@@ -0,0 +1,80 @@
+/* C version of memcpy for use when Intel MPX is enabled,
+   in order to process a buffer of pointers correctly.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stddef.h>
+
+void *chkp_memcpy_nochk (void *dst, const void *src, size_t n);
+
+void *
+__memcpy (void *dst, const void *src, size_t n)
+{
+ if (!n) return dst;
+
+ __bnd_chk_ptr_lbounds(dst);
+ __bnd_chk_ptr_ubounds(dst+n-1);
+#ifndef __CHKWR__
+ __bnd_chk_ptr_lbounds(src);
+ __bnd_chk_ptr_ubounds(src+n-1);
+#endif
+
+ return chkp_memcpy_nochk(dst, src, n);
+}
+
+void *
+chkp_memcpy_nochk (void *dst, const void *src, size_t n)
+{
+ const char *s = src;
+ char *d = dst;
+ void *ret = dst;
+ size_t offset_src = ((size_t) s) & (sizeof(size_t) - 1);
+ size_t offset_dst = ((size_t) d) & (sizeof(size_t) - 1);
+
+ if (offset_src != offset_dst)
+ {
+ while (n--)
+ *d++ = *s++;
+ }
+ else
+ {
+ if (offset_src) offset_src = sizeof(size_t) - offset_src;
+ while (n-- && offset_src--)
+ *d++ = *s++;
+ n++;
+ if (!n) return ret;
+ void **d1 = (void **)d;
+ void **s1 = (void **)s;
+ while (n >= sizeof(void *))
+ {
+ n -= sizeof(void *);
+ *d1++ = *s1++;
+ }
+ s = (char *)s1;
+ d = (char *)d1;
+ while (n--)
+ *d++ = *s++;
+ }
+ return ret;
+}
+
+weak_alias (__memcpy, __GI_memcpy)
+
+# if defined SHARED && !defined NOT_IN_libc && !defined IA32
+# include <shlib-compat.h>
+versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
+# else
+weak_alias (__memcpy, memcpy)
+# endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back-1.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back-1.S
new file mode 100644
index 0000000000..7fedbeef8e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back-1.S
@@ -0,0 +1,5 @@
+/* Optimized version of memcpy without any checks or bounds copying.  */
+#define MEMCPY chkp_memcpy_nobnd_nochk
+#undef __CHKP__
+#undef __CHKWR__
+#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index fc9fcef27d..16b4e680a1 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -27,7 +27,11 @@
#include "asm-syntax.h"
#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
+# if defined __CHKP__ || defined __CHKWR__
+# define MEMCPY chkp_memcpy_nobnd
+# else
+# define MEMCPY __memcpy_ssse3_back
+# endif
# define MEMCPY_CHK __memcpy_chk_ssse3_back
#endif
@@ -48,7 +52,7 @@
ud2
.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_BCOPY
+#if !defined USE_AS_BCOPY && defined MEMCPY_CHK
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
@@ -56,6 +60,15 @@ END (MEMCPY_CHK)
#endif
ENTRY (MEMCPY)
+#ifdef __CHKP__
+ testq %rdx, %rdx
+ jz L(NoEntryCheck)
+ bndcl (%rdi), %bnd0
+ bndcu -1(%rdi, %rdx), %bnd0
+ bndcl (%rsi), %bnd1
+ bndcu -1(%rsi, %rdx), %bnd1
+#endif
+
mov %rdi, %rax
#ifdef USE_AS_MEMPCPY
add %rdx, %rax
@@ -87,6 +100,15 @@ L(bk_write):
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
#endif
+#ifdef __CHKP__
+L(NoEntryCheck):
+ mov %rdi, %rax
+# ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+# endif
+ ret
+#endif
+
ALIGN (4)
L(144bytesormore):
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index a1e5031376..fc5ab2da03 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -18,14 +18,15 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include <shlib-compat.h>
-#include <init-arch.h>
+#if !defined __CHKP__ && !defined __CHKWR__
+# include <sysdep.h>
+# include <shlib-compat.h>
+# include <init-arch.h>
/* Define multiple versions only for the definition in lib and for
DSO. In static binaries we need memcpy before the initialization
happened. */
-#if defined SHARED && !defined NOT_IN_libc
+# if defined SHARED && !defined NOT_IN_libc
.text
ENTRY(__new_memcpy)
.type __new_memcpy, @gnu_indirect_function
@@ -43,37 +44,39 @@ ENTRY(__new_memcpy)
3: ret
END(__new_memcpy)
-# undef ENTRY
-# define ENTRY(name) \
+# undef ENTRY
+# define ENTRY(name) \
.type __memcpy_sse2, @function; \
.globl __memcpy_sse2; \
.hidden __memcpy_sse2; \
.p2align 4; \
__memcpy_sse2: cfi_startproc; \
CALL_MCOUNT
-# undef END
-# define END(name) \
+# undef END
+# define END(name) \
cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
.type __memcpy_chk_sse2, @function; \
.globl __memcpy_chk_sse2; \
.p2align 4; \
__memcpy_chk_sse2: cfi_startproc; \
CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
+# undef END_CHK
+# define END_CHK(name) \
cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
-# undef libc_hidden_builtin_def
+# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal memcpy calls through a PLT.
The speedup we get from using SSSE3 instruction is likely eaten away
by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
+# define libc_hidden_builtin_def(name) \
.globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);
-#endif
+# endif
+
+# include "../memcpy.S"
-#include "../memcpy.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk-c.c b/sysdeps/x86_64/multiarch/memcpy_chk-c.c
new file mode 100644
index 0000000000..1eee86c639
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy_chk-c.c
@@ -0,0 +1 @@
+#include <debug/memcpy_chk.c>
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index ad01d8cd9f..6f87f2686d 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -18,14 +18,15 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include <init-arch.h>
+#if !defined __CHKP__ && !defined __CHKWR__
+# include <sysdep.h>
+# include <init-arch.h>
/* Define multiple versions only for the definition in lib and for
DSO. There are no multiarch memcpy functions for static binaries.
*/
-#ifndef NOT_IN_libc
-# ifdef SHARED
+# ifndef NOT_IN_libc
+# ifdef SHARED
.text
ENTRY(__memcpy_chk)
.type __memcpy_chk, @gnu_indirect_function
@@ -41,7 +42,8 @@ ENTRY(__memcpy_chk)
leaq __memcpy_chk_ssse3_back(%rip), %rax
2: ret
END(__memcpy_chk)
-# else
-# include "../memcpy_chk.S"
+# else
+# include "../memcpy_chk.S"
+# endif
# endif
#endif
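
memcpy.S and memcpy_chk.S above, and memmove.c, memmove_chk.c, mempcpy.S and mempcpy_chk.S below, all receive the same mechanical change: the ifunc dispatch code is compiled out when MPX checks are enabled (guarded by #if !defined __CHKP__ && !defined __CHKWR__, or #ifndef __CHKP__ for the C dispatchers), with the body reindented one level, so that every call resolves to the bounds-preserving C implementations added by this patch rather than going through multiarch selection.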
diff --git a/sysdeps/x86_64/multiarch/memmove-c.c b/sysdeps/x86_64/multiarch/memmove-c.c
new file mode 100644
index 0000000000..7111128e75
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-c.c
@@ -0,0 +1,118 @@
+/* C version of memmove for use when Intel MPX is enabled,
+   in order to process a buffer of pointers correctly.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stddef.h>
+
+void *chkp_memmove_nochk (void *dst, const void *src, size_t n);
+
+void *
+__memmove (void *dst, const void *src, size_t n)
+{
+ if (n == 0) return dst;
+
+ __bnd_chk_ptr_lbounds(dst);
+ __bnd_chk_ptr_ubounds(dst+n-1);
+#ifndef __CHKWR__
+ __bnd_chk_ptr_lbounds(src);
+ __bnd_chk_ptr_ubounds(src+n-1);
+#endif
+ return chkp_memmove_nochk(dst, src, n);
+}
+
+
+void *
+chkp_memmove_nochk (void *dst, const void *src, size_t n)
+{
+ const char *s = src;
+ char *d = dst;
+ void *ret = dst;
+ size_t offset_src = ((size_t) s) & (sizeof(size_t) - 1);
+ size_t offset_dst = ((size_t) d) & (sizeof(size_t) - 1);
+
+ if (offset_src != offset_dst)
+ {
+ if (s < d)
+ {
+ /* backward copying */
+ d += n;
+ s += n;
+ while (n--)
+ *--d = *--s;
+ }
+ else
+ /* forward copying */
+ while (n--)
+ *d++ = *s++;
+ }
+ else
+ {
+ if (s < d)
+ {
+ offset_src = (offset_src + (size_t)src) & (sizeof(size_t) - 1);
+ /* backward copying */
+ d += n;
+ s += n;
+ while (n-- && offset_src--)
+ *--d = *--s;
+ n++;
+ if (!n) return ret;
+ void **d1 = (void **)d;
+ void **s1 = (void **)s;
+ while (n >= sizeof(void *))
+ {
+ n -= sizeof(void *);
+ *--d1 = *--s1;
+ }
+ s = (char *)s1;
+ d = (char *)d1;
+ while (n--)
+ *--d = *--s;
+ }
+ else
+ {
+ if (offset_src) offset_src = sizeof(size_t) - offset_src;
+ /* forward copying */
+ while (n-- && offset_src--)
+ *d++ = *s++;
+ n++;
+ if (!n) return ret;
+ void **d1 = (void **)d;
+ void **s1 = (void **)s;
+ while (n >= sizeof(void *))
+ {
+ n -= sizeof(void *);
+ *d1++ = *s1++;
+ }
+ s = (char *)s1;
+ d = (char *)d1;
+ while (n--)
+ *d++ = *s++;
+ }
+ }
+ return ret;
+}
+
+weak_alias (__memmove, __libc_memmove)
+weak_alias (__memmove, __GI_memmove)
+weak_alias (__memmove, memmove)
+
+# if defined SHARED && !defined NOT_IN_libc
+# include <shlib-compat.h>
+# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
+# endif
+# endif
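
__memmove picks the copy direction from the pointer order: when the destination starts above the source, the tail of the source could be overwritten before it is read, so the copy runs backward; otherwise it runs forward. A hypothetical example of the case the backward path protects:

#include <string.h>

/* Shift 15 bytes right by one within the same buffer.  The regions
   overlap with dst above src, so a forward byte copy would smear
   buf[0] across the range; memmove must copy backward here.  */
void
shift_right_by_one (char *buf)
{
  memmove (buf + 1, buf, 15);
}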
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back-1.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back-1.S
new file mode 100644
index 0000000000..2a1f3e67b7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3-back-1.S
@@ -0,0 +1,6 @@
+/* Optimized version of memmove without any checks or bounds copying.  */
+#define USE_AS_MEMMOVE
+#define MEMCPY chkp_memmove_nobnd_nochk
+#undef __CHKP__
+#undef __CHKWR__
+#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
index f9a4e9aff9..478141b14a 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
@@ -1,4 +1,10 @@
#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3_back
+#if defined __CHKP__ || defined __CHKWR__
+/* Version of memmove that does not copy bounds even if there
+   are pointers in the source buffer.  */
+# define MEMCPY chkp_memmove_nobnd
+#else
+# define MEMCPY __memmove_ssse3_back
+#endif
#define MEMCPY_CHK __memmove_chk_ssse3_back
#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8149c487d5..0d2c6f0266 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -17,31 +17,32 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef NOT_IN_libc
-# define MEMMOVE __memmove_sse2
-# ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
+#ifndef __CHKP__
+# ifndef NOT_IN_libc
+# define MEMMOVE __memmove_sse2
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
__hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2);
-# endif
+# endif
/* Redefine memmove so that the compiler won't complain about the type
mismatch with the IFUNC selector in strong_alias, below. */
-# undef memmove
-# define memmove __redirect_memmove
-# include <string.h>
-# undef memmove
+# undef memmove
+# define memmove __redirect_memmove
+# include <string.h>
+# undef memmove
extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
-#endif
+# endif
-#include "string/memmove.c"
+# include "string/memmove.c"
-#ifndef NOT_IN_libc
-# include <shlib-compat.h>
-# include "init-arch.h"
+# ifndef NOT_IN_libc
+# include <shlib-compat.h>
+# include "init-arch.h"
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
@@ -54,7 +55,8 @@ libc_ifunc (__libc_memmove,
strong_alias (__libc_memmove, memmove)
-# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
+# endif
# endif
#endif
diff --git a/sysdeps/x86_64/multiarch/memmove_chk-c.c b/sysdeps/x86_64/multiarch/memmove_chk-c.c
new file mode 100644
index 0000000000..bbf53d00d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove_chk-c.c
@@ -0,0 +1 @@
+#include <debug/memmove_chk.c>
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 17ed460324..c1b0b9304b 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -17,19 +17,21 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <string.h>
-#include "init-arch.h"
+#ifndef __CHKP__
+# include <string.h>
+# include "init-arch.h"
-#define MEMMOVE_CHK __memmove_chk_sse2
+# define MEMMOVE_CHK __memmove_chk_sse2
extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
-#include "debug/memmove_chk.c"
+# include "debug/memmove_chk.c"
libc_ifunc (__memmove_chk,
HAS_SSSE3
? (HAS_FAST_COPY_BACKWARD
? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
: __memmove_chk_sse2);
+#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy-c.c b/sysdeps/x86_64/multiarch/mempcpy-c.c
new file mode 100644
index 0000000000..522fb86e3e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-c.c
@@ -0,0 +1,36 @@
+/* C version of mempcpy for use when Intel MPX is enabled,
+   in order to process an array of pointers correctly.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stddef.h>
+#include <string.h>
+
+void *chkp_memcpy_nochk (void *dst, const void *src, size_t n);
+
+void *
+mempcpy (void *dst, const void *src, size_t n)
+{
+ return memcpy(dst, src, n) + n;
+}
+
+void *
+chkp_mempcpy_nochk (void *dst, const void *src, size_t n)
+{
+ return chkp_memcpy_nochk(dst, src, n) + n;
+}
+
+weak_alias (mempcpy, __GI_mempcpy)
+weak_alias (mempcpy, __GI___mempcpy)
+weak_alias (mempcpy, __mempcpy)
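
mempcpy is memcpy that returns one past the last byte written, which makes chained appends natural:

#define _GNU_SOURCE
#include <string.h>

/* Append two byte ranges and return the new end pointer.  */
char *
join (char *out, const char *a, size_t na, const char *b, size_t nb)
{
  out = mempcpy (out, a, na);   /* returns out + na */
  out = mempcpy (out, b, nb);
  return out;
}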
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back-1.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back-1.S
new file mode 100644
index 0000000000..eb929f4182
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back-1.S
@@ -0,0 +1,6 @@
+/* Optimized version of mempcpy without any checks or bounds copying.  */
+#define USE_AS_MEMPCPY
+#define MEMCPY chkp_mempcpy_nobnd_nochk
+#undef __CHKP__
+#undef __CHKWR__
+#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
index 82ffacb8fb..f32ecfc76e 100644
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
@@ -1,4 +1,12 @@
#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_ssse3_back
-#define MEMCPY_CHK __mempcpy_chk_ssse3_back
+
+#if defined __CHKP__ || defined __CHKWR__
+/* Version of mempcpy that does not copy bounds even if there
+   are pointers in the source buffer.  */
+# define MEMCPY chkp_mempcpy_nobnd
+#else
+# define MEMCPY __mempcpy_ssse3_back
+#endif
+
+#define MEMCPY_CHK __mempcpy_chk_ssse3_back
#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b8b7fcd121..4ec5825989 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -18,13 +18,14 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include <init-arch.h>
+#if !defined __CHKP__ && !defined __CHKWR__
+# include <sysdep.h>
+# include <init-arch.h>
/* Define multiple versions only for the definition in lib and for
DSO. In static binaries we need mempcpy before the initialization
happened. */
-#if defined SHARED && !defined NOT_IN_libc
+# if defined SHARED && !defined NOT_IN_libc
ENTRY(__mempcpy)
.type __mempcpy, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
@@ -40,38 +41,40 @@ ENTRY(__mempcpy)
2: ret
END(__mempcpy)
-# undef ENTRY
-# define ENTRY(name) \
+# undef ENTRY
+# define ENTRY(name) \
.type __mempcpy_sse2, @function; \
.p2align 4; \
.globl __mempcpy_sse2; \
.hidden __mempcpy_sse2; \
__mempcpy_sse2: cfi_startproc; \
CALL_MCOUNT
-# undef END
-# define END(name) \
+# undef END
+# define END(name) \
cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
.type __mempcpy_chk_sse2, @function; \
.globl __mempcpy_chk_sse2; \
.p2align 4; \
__mempcpy_chk_sse2: cfi_startproc; \
CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
+# undef END_CHK
+# define END_CHK(name) \
cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
-# undef libc_hidden_def
-# undef libc_hidden_builtin_def
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
The speedup we get from using SSSE3 instruction is likely eaten away
by the indirect call in the PLT. */
-# define libc_hidden_def(name) \
+# define libc_hidden_def(name) \
.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
-# define libc_hidden_builtin_def(name) \
+# define libc_hidden_builtin_def(name) \
.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
-#endif
+# endif
+
+# include "../mempcpy.S"
-#include "../mempcpy.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk-c.c b/sysdeps/x86_64/multiarch/mempcpy_chk-c.c
new file mode 100644
index 0000000000..ba170784c3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk-c.c
@@ -0,0 +1 @@
+#include <debug/mempcpy_chk.c>
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 3801db399b..98acf9691c 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -18,14 +18,15 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include <init-arch.h>
+#if !defined __CHKP__ && !defined __CHKWR__
+# include <sysdep.h>
+# include <init-arch.h>
/* Define multiple versions only for the definition in lib and for
DSO. There are no multiarch mempcpy functions for static binaries.
*/
-#ifndef NOT_IN_libc
-# ifdef SHARED
+# ifndef NOT_IN_libc
+# ifdef SHARED
.text
ENTRY(__mempcpy_chk)
.type __mempcpy_chk, @gnu_indirect_function
@@ -41,7 +42,8 @@ ENTRY(__mempcpy_chk)
leaq __mempcpy_chk_ssse3_back(%rip), %rax
2: ret
END(__mempcpy_chk)
-# else
-# include "../mempcpy_chk.S"
+# else
+# include "../mempcpy_chk.S"
+# endif
# endif
#endif
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 028c6d3d74..a3535ad500 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -25,6 +25,14 @@
# define STRCAT __strcat_sse2_unaligned
# endif
+# ifdef __CHKP__
+# define RETURN \
+ bndcu -1(%rdi, %rax), %bnd0; \
+ ret
+# else
+# define RETURN ret
+# endif
+
# define USE_AS_STRCAT
.text
@@ -37,6 +45,10 @@ ENTRY (STRCAT)
/* Inline corresponding strlen file, temporary until new strcpy
implementation gets merged. */
+# ifdef __CHKP__
+ bndcl (%rdi), %bnd0
+ bndcu (%rdi), %bnd0
+# endif
xor %rax, %rax
mov %edi, %ecx
and $0x3f, %ecx
@@ -67,84 +79,132 @@ L(align16_start):
pxor %xmm1, %xmm1
pxor %xmm2, %xmm2
pxor %xmm3, %xmm3
+# ifdef __CHKP__
+ bndcu 16(%rax), %bnd0
+# endif
pcmpeqb 16(%rax), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
+# ifdef __CHKP__
+ bndcu 32(%rax), %bnd0
+# endif
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
+# ifdef __CHKP__
+ bndcu 48(%rax), %bnd0
+# endif
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
+# ifdef __CHKP__
+ bndcu 64(%rax), %bnd0
+# endif
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
+# ifdef __CHKP__
+ bndcu 80(%rax), %bnd0
+# endif
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
+# ifdef __CHKP__
+ bndcu 32(%rax), %bnd0
+# endif
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
+# ifdef __CHKP__
+ bndcu 48(%rax), %bnd0
+# endif
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
+# ifdef __CHKP__
+ bndcu 64(%rax), %bnd0
+# endif
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
+# ifdef __CHKP__
+ bndcu 80(%rax), %bnd0
+# endif
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
+# ifdef __CHKP__
+ bndcu 32(%rax), %bnd0
+# endif
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
+# ifdef __CHKP__
+ bndcu 48(%rax), %bnd0
+# endif
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
+# ifdef __CHKP__
+ bndcu 64(%rax), %bnd0
+# endif
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
jnz L(exit64)
+# ifdef __CHKP__
+ bndcu 80(%rax), %bnd0
+# endif
pcmpeqb 80(%rax), %xmm0
add $64, %rax
pmovmskb %xmm0, %edx
test %edx, %edx
jnz L(exit16)
+# ifdef __CHKP__
+ bndcu 32(%rax), %bnd0
+# endif
pcmpeqb 32(%rax), %xmm1
pmovmskb %xmm1, %edx
test %edx, %edx
jnz L(exit32)
+# ifdef __CHKP__
+ bndcu 48(%rax), %bnd0
+# endif
pcmpeqb 48(%rax), %xmm2
pmovmskb %xmm2, %edx
test %edx, %edx
jnz L(exit48)
+# ifdef __CHKP__
+ bndcu 64(%rax), %bnd0
+# endif
pcmpeqb 64(%rax), %xmm3
pmovmskb %xmm3, %edx
test %edx, %edx
@@ -153,6 +213,9 @@ L(align16_start):
test $0x3f, %rax
jz L(align64_loop)
+# ifdef __CHKP__
+ bndcu 80(%rax), %bnd0
+# endif
pcmpeqb 80(%rax), %xmm0
add $80, %rax
pmovmskb %xmm0, %edx
@@ -162,6 +225,9 @@ L(align16_start):
test $0x3f, %rax
jz L(align64_loop)
+# ifdef __CHKP__
+ bndcu 16(%rax), %bnd0
+# endif
pcmpeqb 16(%rax), %xmm1
add $16, %rax
pmovmskb %xmm1, %edx
@@ -171,6 +237,9 @@ L(align16_start):
test $0x3f, %rax
jz L(align64_loop)
+# ifdef __CHKP__
+ bndcu 16(%rax), %bnd0
+# endif
pcmpeqb 16(%rax), %xmm2
add $16, %rax
pmovmskb %xmm2, %edx
@@ -180,6 +249,9 @@ L(align16_start):
test $0x3f, %rax
jz L(align64_loop)
+# ifdef __CHKP__
+ bndcu 16(%rax), %bnd0
+# endif
pcmpeqb 16(%rax), %xmm3
add $16, %rax
pmovmskb %xmm3, %edx
@@ -187,8 +259,12 @@ L(align16_start):
jnz L(exit)
add $16, %rax
+
.p2align 4
L(align64_loop):
+# ifdef __CHKP__
+ bndcu (%rax), %bnd0
+# endif
movaps (%rax), %xmm4
pminub 16(%rax), %xmm4
movaps 32(%rax), %xmm5
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index f170238b55..4311e8689c 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -91,6 +91,10 @@ __strchr_sse42:
CALL_MCOUNT
testb %sil, %sil
je __strend_sse4
+# ifdef __CHKP__
+ bndcl (%rdi), %bnd0
+ bndcu (%rdi), %bnd0
+# endif
pxor %xmm2, %xmm2
movd %esi, %xmm1
movl %edi, %ecx
@@ -124,6 +128,9 @@ __strchr_sse42:
ja L(return_null)
L(unaligned_match):
addq %rdi, %rax
+# ifdef __CHKP__
+ bndcu (%rax), %bnd0
+# endif
ret
.p2align 4
@@ -135,15 +142,27 @@ L(unaligned_no_match):
L(loop):
addq $16, %r8
L(aligned_start):
+# ifdef __CHKP__
+ bndcu (%r8), %bnd0
+# endif
pcmpistri $0x2, (%r8), %xmm1
jbe L(wrap)
addq $16, %r8
+# ifdef __CHKP__
+ bndcu (%r8), %bnd0
+# endif
pcmpistri $0x2, (%r8), %xmm1
jbe L(wrap)
addq $16, %r8
+# ifdef __CHKP__
+ bndcu (%r8), %bnd0
+# endif
pcmpistri $0x2, (%r8), %xmm1
jbe L(wrap)
addq $16, %r8
+# ifdef __CHKP__
+ bndcu (%r8), %bnd0
+# endif
pcmpistri $0x2, (%r8), %xmm1
jbe L(wrap)
jmp L(loop)
@@ -159,6 +178,9 @@ L(return_null):
.p2align 4
L(loop_exit):
leaq (%r8,%rcx), %rax
+# ifdef __CHKP__
+ bndcu (%rax), %bnd0
+# endif
ret
cfi_endproc
.size __strchr_sse42, .-__strchr_sse42
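
For the string routines the length is not known on entry, so strcat and strchr check as they scan: each 16-byte probe is preceded by a bndcu at the address about to be read, and the result is checked too: strcat's RETURN macro re-checks the last byte written (bndcu -1(%rdi,%rax)) and strchr checks the match address (bndcu (%rax)) before handing the pointer back. A C model of check-as-you-scan, assuming the checker intrinsics:

#include <stddef.h>

/* Byte-at-a-time model of the checked scan loops above: validate
   each address just before reading it, since the string length is
   unknown in advance.  */
static size_t
checked_strlen (const char *s)
{
  size_t i = 0;
  for (;;)
    {
      __bnd_chk_ptr_ubounds (s + i);   /* bndcu before the load */
      if (s[i] == '\0')
        return i;
      i++;
    }
}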
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index c84f1c2b31..edfa915707 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -127,6 +127,14 @@ STRCMP_SSE42:
je LABEL(Byte0)
mov %rdx, %r11
#endif
+
+#ifdef __CHKP__
+ bndcl (%rdi), %bnd0
+ bndcu (%rdi), %bnd0
+ bndcl (%rsi), %bnd1
+ bndcu (%rsi), %bnd1
+#endif
+
mov %esi, %ecx
mov %edi, %eax
/* Use 64bit AND here to avoid long NOP padding. */
@@ -210,6 +218,10 @@ LABEL(touppermask):
#endif
add $16, %rsi /* prepare to search next 16 bytes */
add $16, %rdi /* prepare to search next 16 bytes */
+#ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+ bndcu (%rsi), %bnd1
+#endif
/*
* Determine source and destination string offsets from 16-byte
@@ -231,6 +243,11 @@ LABEL(crosscache):
mov %edx, %r8d /* r8d is offset flag for exit tail */
xchg %ecx, %eax
xchg %rsi, %rdi
+#ifdef __CHKP__
+ bndmov %bnd0, %bnd2
+ bndmov %bnd1, %bnd0
+ bndmov %bnd2, %bnd1
+#endif
LABEL(bigger):
movdqa (%rdi), %xmm2
movdqa (%rsi), %xmm1
@@ -280,6 +297,10 @@ LABEL(ashr_0):
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
.p2align 4
LABEL(ashr_0_use):
+#ifdef __CHKP__
+ bndcu -1(%rdi, %rdx), %bnd0
+ bndcu -1(%rsi, %rdx), %bnd1
+#endif
movdqa (%rdi,%rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
@@ -295,6 +316,10 @@ LABEL(ashr_0_use):
jbe LABEL(strcmp_exitz)
#endif
+#ifdef __CHKP__
+ bndcu -1(%rdi, %rdx), %bnd0
+ bndcu -1(%rsi, %rdx), %bnd1
+#endif
movdqa (%rdi,%rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
@@ -320,6 +345,10 @@ LABEL(ashr_0_exit_use):
jbe LABEL(strcmp_exitz)
#endif
lea -16(%rdx, %rcx), %rcx
+#ifdef __CHKP__
+ bndcu -1(%rdi, %rcx), %bnd0
+ bndcu -1(%rsi, %rcx), %bnd1
+#endif
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %edx
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
@@ -362,6 +391,15 @@ LABEL(ashr_1):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_1_use)
+LABEL(ashr_1_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_1_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_1_use):
@@ -416,7 +454,11 @@ LABEL(nibble_ashr_1_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $14, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_1_check)
+#else
ja LABEL(nibble_ashr_1_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -450,6 +492,15 @@ LABEL(ashr_2):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_2_use)
+LABEL(ashr_2_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_2_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_2_use):
@@ -504,7 +555,11 @@ LABEL(nibble_ashr_2_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $13, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_2_check)
+#else
ja LABEL(nibble_ashr_2_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -539,6 +594,15 @@ LABEL(ashr_3):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_3_use)
+LABEL(ashr_3_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_3_restart_use)
+#endif
LABEL(loop_ashr_3_use):
add $16, %r10
@@ -592,7 +656,11 @@ LABEL(nibble_ashr_3_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $12, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_3_check)
+#else
ja LABEL(nibble_ashr_3_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -627,6 +695,15 @@ LABEL(ashr_4):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_4_use)
+LABEL(ashr_4_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_4_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_4_use):
@@ -681,7 +758,11 @@ LABEL(nibble_ashr_4_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $11, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_4_check)
+#else
ja LABEL(nibble_ashr_4_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -716,6 +797,15 @@ LABEL(ashr_5):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_5_use)
+LABEL(ashr_5_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_5_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_5_use):
@@ -771,7 +861,11 @@ LABEL(nibble_ashr_5_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $10, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_5_check)
+#else
ja LABEL(nibble_ashr_5_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -806,6 +900,15 @@ LABEL(ashr_6):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_6_use)
+LABEL(ashr_6_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_6_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_6_use):
@@ -860,7 +963,11 @@ LABEL(nibble_ashr_6_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $9, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_6_check)
+#else
ja LABEL(nibble_ashr_6_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -895,6 +1002,15 @@ LABEL(ashr_7):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_7_use)
+LABEL(ashr_7_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_7_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_7_use):
@@ -949,7 +1065,11 @@ LABEL(nibble_ashr_7_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $8, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_7_check)
+#else
ja LABEL(nibble_ashr_7_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -984,6 +1104,15 @@ LABEL(ashr_8):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_8_use)
+LABEL(ashr_8_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_8_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_8_use):
@@ -1038,7 +1167,11 @@ LABEL(nibble_ashr_8_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $7, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_8_check)
+#else
ja LABEL(nibble_ashr_8_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -1073,6 +1206,15 @@ LABEL(ashr_9):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_9_use)
+LABEL(ashr_9_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_9_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_9_use):
@@ -1128,7 +1270,11 @@ LABEL(nibble_ashr_9_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $6, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_9_check)
+#else
ja LABEL(nibble_ashr_9_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -1163,6 +1309,15 @@ LABEL(ashr_10):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_10_use)
+LABEL(ashr_10_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_10_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_10_use):
@@ -1217,7 +1372,11 @@ LABEL(nibble_ashr_10_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $5, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_10_check)
+#else
ja LABEL(nibble_ashr_10_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -1252,6 +1411,15 @@ LABEL(ashr_11):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_11_use)
+LABEL(ashr_11_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_11_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_11_use):
@@ -1306,7 +1474,11 @@ LABEL(nibble_ashr_11_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $4, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_11_check)
+#else
ja LABEL(nibble_ashr_11_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -1341,6 +1513,15 @@ LABEL(ashr_12):
and $0xfff, %r10 /* offset into 4K page */
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_12_use)
+LABEL(ashr_12_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_12_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_12_use):
@@ -1395,7 +1576,11 @@ LABEL(nibble_ashr_12_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $3, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_12_check)
+#else
ja LABEL(nibble_ashr_12_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -1431,6 +1616,15 @@ LABEL(ashr_13):
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_13_use)
+LABEL(ashr_13_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_13_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_13_use):
@@ -1485,7 +1679,11 @@ LABEL(nibble_ashr_13_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $2, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_13_check)
+#else
ja LABEL(nibble_ashr_13_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -1521,6 +1719,15 @@ LABEL(ashr_14):
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_14_use)
+LABEL(ashr_14_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_14_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_14_use):
@@ -1575,7 +1782,11 @@ LABEL(nibble_ashr_14_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $1, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_14_check)
+#else
ja LABEL(nibble_ashr_14_restart_use)
+#endif
jmp LABEL(nibble_ashr_exit_use)
@@ -1613,6 +1824,15 @@ LABEL(ashr_15):
sub $0x1000, %r10 /* subtract 4K pagesize */
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+#ifdef __CHKP__
+ bndcu -16(%rdi, %rdx), %bnd0
+ bndcu -16(%rsi, %rdx), %bnd1
+ jmp LABEL(loop_ashr_15_use)
+LABEL(ashr_15_check):
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+ jmp LABEL(nibble_ashr_15_restart_use)
+#endif
.p2align 4
LABEL(loop_ashr_15_use):
@@ -1667,7 +1887,11 @@ LABEL(nibble_ashr_15_use):
jae LABEL(nibble_ashr_exit_use)
#endif
cmp $0, %ecx
+#ifdef __CHKP__
+ ja LABEL(ashr_15_check)
+#else
ja LABEL(nibble_ashr_15_restart_use)
+#endif
LABEL(nibble_ashr_exit_use):
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
@@ -1691,6 +1915,11 @@ LABEL(exit_use):
test %r8d, %r8d
jz LABEL(ret_use)
xchg %eax, %edx
+#ifdef __CHKP__
+ bndmov %bnd0, %bnd2
+ bndmov %bnd1, %bnd0
+ bndmov %bnd2, %bnd1
+#endif
LABEL(ret_use):
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
@@ -1707,6 +1936,11 @@ LABEL(less32bytes):
test %r8d, %r8d
jz LABEL(ret)
xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
+#ifdef __CHKP__
+ bndmov %bnd0, %bnd2
+ bndmov %bnd1, %bnd0
+ bndmov %bnd2, %bnd1
+#endif
.p2align 4
LABEL(ret):
@@ -1717,6 +1951,10 @@ LABEL(less16bytes):
sub %rdx, %r11
jbe LABEL(strcmp_exitz)
#endif
+#ifdef __CHKP__
+ bndcu (%rdi, %rdx), %bnd0
+ bndcu (%rsi, %rdx), %bnd1
+#endif
movzbl (%rsi, %rdx), %ecx
movzbl (%rdi, %rdx), %eax
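
On the exit paths, strcmp restores the original argument order with xchg when %r8d records that the two strings were swapped at entry; under __CHKP__ the bounds registers must follow the same swap, and because bndmov moves only one register at a time the patch rotates through %bnd2 as scratch. A sketch of that three-instruction exchange, assuming %bnd0 and %bnd1 track the first and second string argument respectively:

#ifdef __CHKP__
	bndmov	%bnd0, %bnd2	/* save the first string's bounds */
	bndmov	%bnd1, %bnd0	/* %bnd0 now matches the exchanged pointer */
	bndmov	%bnd2, %bnd1	/* %bnd1 picks up the saved bounds */
#endif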
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 7710173c68..e6baee92db 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -33,7 +33,7 @@
lea TABLE(%rip), %r11; \
movslq (%r11, INDEX, SCALE), %rcx; \
lea (%r11, %rcx), %rcx; \
- jmp *%rcx
+ jmp *%rcx
# ifndef USE_AS_STRCAT
@@ -51,6 +51,16 @@ ENTRY (STRCPY)
# endif
+# ifdef __CHKP__
+ bndcl (%rdi), %bnd0
+ bndcu (%rdi), %bnd0
+ bndcl (%rsi), %bnd1
+ bndcu (%rsi), %bnd1
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ bndcu -1(%rdi, %rdx), %bnd0
+# endif
+# endif
+
and $63, %rcx
cmp $32, %rcx
jbe L(SourceStringAlignmentLess32)
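
For strcpy and its variants the checks start at the entry point: bndcl/bndcu on the first byte confirm that both pointers begin inside their objects, and the strncpy build additionally probes the last byte the count allows, so an oversized n faults before a single byte is copied. Roughly, with %rdi/%rsi as destination/source and %rdx as the strncpy count, as in the hunk above:

# ifdef __CHKP__
	bndcl	(%rdi), %bnd0		/* destination lower bound */
	bndcu	(%rdi), %bnd0		/* first destination byte vs. upper bound */
	bndcl	(%rsi), %bnd1		/* source lower bound */
	bndcu	(%rsi), %bnd1		/* first source byte vs. upper bound */
#  if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	bndcu	-1(%rdi, %rdx), %bnd0	/* last byte strncpy may write */
#  endif
# endif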
@@ -79,6 +89,9 @@ ENTRY (STRCPY)
test %rdx, %rdx
jnz L(CopyFrom1To16BytesTail)
+# ifdef __CHKP__
+ bndcu 16(%rsi), %bnd1
+# endif
pcmpeqb 16(%rsi), %xmm0
pmovmskb %xmm0, %rdx
@@ -91,6 +104,9 @@ ENTRY (STRCPY)
jnz L(CopyFrom1To32Bytes)
movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
+# ifdef __CHKP__
+ bndcu 15(%rdi), %bnd0
+# endif
movdqu %xmm1, (%rdi)
/* If source address alignment != destination address alignment */
@@ -101,6 +117,10 @@ L(Unalign16Both):
add %rcx, %r8
# endif
mov $16, %rcx
+# ifdef __CHKP__
+ bndcu 16(%rsi, %rcx), %bnd1
+ bndcu 15(%rdi, %rcx), %bnd0
+# endif
movdqa (%rsi, %rcx), %xmm1
movaps 16(%rsi, %rcx), %xmm2
movdqu %xmm1, (%rdi, %rcx)
@@ -118,6 +138,10 @@ L(Unalign16Both):
jnz L(CopyFrom1To16Bytes)
# endif
+# ifdef __CHKP__
+ bndcu 16(%rsi, %rcx), %bnd1
+ bndcu 15(%rdi, %rcx), %bnd0
+# endif
movaps 16(%rsi, %rcx), %xmm3
movdqu %xmm2, (%rdi, %rcx)
pcmpeqb %xmm3, %xmm0
@@ -134,6 +158,10 @@ L(Unalign16Both):
jnz L(CopyFrom1To16Bytes)
# endif
+# ifdef __CHKP__
+ bndcu 16(%rsi, %rcx), %bnd1
+ bndcu 15(%rdi, %rcx), %bnd0
+# endif
movaps 16(%rsi, %rcx), %xmm4
movdqu %xmm3, (%rdi, %rcx)
pcmpeqb %xmm4, %xmm0
@@ -150,6 +178,10 @@ L(Unalign16Both):
jnz L(CopyFrom1To16Bytes)
# endif
+# ifdef __CHKP__
+ bndcu 16(%rsi, %rcx), %bnd1
+ bndcu 15(%rdi, %rcx), %bnd0
+# endif
movaps 16(%rsi, %rcx), %xmm1
movdqu %xmm4, (%rdi, %rcx)
pcmpeqb %xmm1, %xmm0
@@ -166,6 +198,10 @@ L(Unalign16Both):
jnz L(CopyFrom1To16Bytes)
# endif
+# ifdef __CHKP__
+ bndcu 16(%rsi, %rcx), %bnd1
+ bndcu 15(%rdi, %rcx), %bnd0
+# endif
movaps 16(%rsi, %rcx), %xmm2
movdqu %xmm1, (%rdi, %rcx)
pcmpeqb %xmm2, %xmm0
@@ -182,6 +218,10 @@ L(Unalign16Both):
jnz L(CopyFrom1To16Bytes)
# endif
+# ifdef __CHKP__
+ bndcu 16(%rsi, %rcx), %bnd1
+ bndcu 15(%rdi, %rcx), %bnd0
+# endif
movaps 16(%rsi, %rcx), %xmm3
movdqu %xmm2, (%rdi, %rcx)
pcmpeqb %xmm3, %xmm0
@@ -198,6 +238,10 @@ L(Unalign16Both):
jnz L(CopyFrom1To16Bytes)
# endif
+# ifdef __CHKP__
+ bndcu 16(%rsi, %rcx), %bnd1
+ bndcu 15(%rdi, %rcx), %bnd0
+# endif
movdqu %xmm3, (%rdi, %rcx)
mov %rsi, %rdx
lea 16(%rsi, %rcx), %rsi
@@ -208,6 +252,9 @@ L(Unalign16Both):
lea 128(%r8, %rdx), %r8
# endif
L(Unaligned64Loop):
+# ifdef __CHKP__
+ bndcu 48(%rsi), %bnd1
+# endif
movaps (%rsi), %xmm2
movaps %xmm2, %xmm4
movaps 16(%rsi), %xmm5
@@ -229,6 +276,10 @@ L(Unaligned64Loop):
L(Unaligned64Loop_start):
add $64, %rdi
add $64, %rsi
+# ifdef __CHKP__
+ bndcu (%rsi), %bnd1
+ bndcu (%rdi), %bnd0
+# endif
movdqu %xmm4, -64(%rdi)
movaps (%rsi), %xmm2
movdqa %xmm2, %xmm4
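
Note the check density in the 64-byte loop: a single bndcu on 48(%rsi) (plus one on each post-increment pointer inside the loop) covers four 16-byte loads, probing only where the highest chunk starts. That appears deliberate: for string routines the in-bounds NUL terminator stops the loop, so one probe per iteration is enough to catch a runaway pointer without paying for four checks. A sketch of the iteration head under that assumption:

L(Unaligned64Loop):
# ifdef __CHKP__
	bndcu	48(%rsi), %bnd1	/* start of the last chunk read below */
# endif
	movaps	(%rsi), %xmm2	/* ...loads continue up to 48(%rsi) */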
@@ -271,16 +322,28 @@ L(Unaligned64Leave):
jnz L(CopyFrom1To16BytesUnaligned_32)
bsf %rcx, %rdx
+# ifdef __CHKP__
+ bndcu 47(%rdi), %bnd0
+# endif
movdqu %xmm4, (%rdi)
movdqu %xmm5, 16(%rdi)
movdqu %xmm6, 32(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
+# ifdef __CHKP__
+ bndcu 48(%rdi, %rdx), %bnd0
+# endif
lea 48(%rdi, %rdx), %rax
# endif
+# ifdef __CHKP__
+ bndcu 63(%rdi), %bnd0
+# endif
movdqu %xmm7, 48(%rdi)
add $15, %r8
sub %rdx, %r8
+# ifdef __CHKP__
+ bndcu 49(%rdi, %rdx), %bnd0
+# endif
lea 49(%rdi, %rdx), %rdi
jmp L(StrncpyFillTailWithZero)
# else
@@ -309,6 +372,10 @@ L(SourceStringAlignmentLess32):
test %rdx, %rdx
jnz L(CopyFrom1To16BytesTail1)
+# ifdef __CHKP__
+ bndcu 16(%rsi), %bnd1
+ bndcu 15(%rdi), %bnd0
+# endif
pcmpeqb %xmm2, %xmm0
movdqu %xmm1, (%rdi)
pmovmskb %xmm0, %rdx
@@ -372,6 +439,9 @@ L(CopyFrom1To16BytesUnaligned_0):
# ifdef USE_AS_STPCPY
lea (%rdi, %rdx), %rax
# endif
+# ifdef __CHKP__
+ bndcu 15(%rdi), %bnd0
+# endif
movdqu %xmm4, (%rdi)
add $63, %r8
sub %rdx, %r8
@@ -384,6 +454,9 @@ L(CopyFrom1To16BytesUnaligned_0):
.p2align 4
L(CopyFrom1To16BytesUnaligned_16):
bsf %rcx, %rdx
+# ifdef __CHKP__
+ bndcu 31(%rdi), %bnd0
+# endif
movdqu %xmm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
# ifdef USE_AS_STPCPY
@@ -403,6 +476,9 @@ L(CopyFrom1To16BytesUnaligned_16):
.p2align 4
L(CopyFrom1To16BytesUnaligned_32):
bsf %rdx, %rdx
+# ifdef __CHKP__
+ bndcu 47(%rdi), %bnd0
+# endif
movdqu %xmm4, (%rdi)
movdqu %xmm5, 16(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
@@ -529,6 +605,9 @@ L(CopyFrom1To16BytesTail1Case2OrCase3):
.p2align 4
L(Exit1):
+# ifdef __CHKP__
+ bndcu (%rdi), %bnd0
+# endif
mov %dh, (%rdi)
# ifdef USE_AS_STPCPY
lea (%rdi), %rax
@@ -543,6 +622,9 @@ L(Exit1):
.p2align 4
L(Exit2):
mov (%rsi), %dx
+# ifdef __CHKP__
+ bndcu 1(%rdi), %bnd0
+# endif
mov %dx, (%rdi)
# ifdef USE_AS_STPCPY
lea 1(%rdi), %rax
@@ -557,6 +639,9 @@ L(Exit2):
.p2align 4
L(Exit3):
mov (%rsi), %cx
+# ifdef __CHKP__
+ bndcu 2(%rdi), %bnd0
+# endif
mov %cx, (%rdi)
mov %dh, 2(%rdi)
# ifdef USE_AS_STPCPY
@@ -572,6 +657,9 @@ L(Exit3):
.p2align 4
L(Exit4):
mov (%rsi), %edx
+# ifdef __CHKP__
+ bndcu 3(%rdi), %bnd0
+# endif
mov %edx, (%rdi)
# ifdef USE_AS_STPCPY
lea 3(%rdi), %rax
@@ -586,6 +674,9 @@ L(Exit4):
.p2align 4
L(Exit5):
mov (%rsi), %ecx
+# ifdef __CHKP__
+ bndcu 4(%rdi), %bnd0
+# endif
mov %dh, 4(%rdi)
mov %ecx, (%rdi)
# ifdef USE_AS_STPCPY
@@ -602,6 +693,9 @@ L(Exit5):
L(Exit6):
mov (%rsi), %ecx
mov 4(%rsi), %dx
+# ifdef __CHKP__
+ bndcu 5(%rdi), %bnd0
+# endif
mov %ecx, (%rdi)
mov %dx, 4(%rdi)
# ifdef USE_AS_STPCPY
@@ -618,6 +712,9 @@ L(Exit6):
L(Exit7):
mov (%rsi), %ecx
mov 3(%rsi), %edx
+# ifdef __CHKP__
+ bndcu 6(%rdi), %bnd0
+# endif
mov %ecx, (%rdi)
mov %edx, 3(%rdi)
# ifdef USE_AS_STPCPY
@@ -633,6 +730,9 @@ L(Exit7):
.p2align 4
L(Exit8):
mov (%rsi), %rdx
+# ifdef __CHKP__
+ bndcu 7(%rdi), %bnd0
+# endif
mov %rdx, (%rdi)
# ifdef USE_AS_STPCPY
lea 7(%rdi), %rax
@@ -647,6 +747,9 @@ L(Exit8):
.p2align 4
L(Exit9):
mov (%rsi), %rcx
+# ifdef __CHKP__
+ bndcu 8(%rdi), %bnd0
+# endif
mov %dh, 8(%rdi)
mov %rcx, (%rdi)
# ifdef USE_AS_STPCPY
@@ -663,6 +766,9 @@ L(Exit9):
L(Exit10):
mov (%rsi), %rcx
mov 8(%rsi), %dx
+# ifdef __CHKP__
+ bndcu 9(%rdi), %bnd0
+# endif
mov %rcx, (%rdi)
mov %dx, 8(%rdi)
# ifdef USE_AS_STPCPY
@@ -679,6 +785,9 @@ L(Exit10):
L(Exit11):
mov (%rsi), %rcx
mov 7(%rsi), %edx
+# ifdef __CHKP__
+ bndcu 10(%rdi), %bnd0
+# endif
mov %rcx, (%rdi)
mov %edx, 7(%rdi)
# ifdef USE_AS_STPCPY
@@ -695,6 +804,9 @@ L(Exit11):
L(Exit12):
mov (%rsi), %rcx
mov 8(%rsi), %edx
+# ifdef __CHKP__
+ bndcu 11(%rdi), %bnd0
+# endif
mov %rcx, (%rdi)
mov %edx, 8(%rdi)
# ifdef USE_AS_STPCPY
@@ -711,6 +823,9 @@ L(Exit12):
L(Exit13):
mov (%rsi), %rcx
mov 5(%rsi), %rdx
+# ifdef __CHKP__
+ bndcu 12(%rdi), %bnd0
+# endif
mov %rcx, (%rdi)
mov %rdx, 5(%rdi)
# ifdef USE_AS_STPCPY
@@ -727,6 +842,9 @@ L(Exit13):
L(Exit14):
mov (%rsi), %rcx
mov 6(%rsi), %rdx
+# ifdef __CHKP__
+ bndcu 13(%rdi), %bnd0
+# endif
mov %rcx, (%rdi)
mov %rdx, 6(%rdi)
# ifdef USE_AS_STPCPY
@@ -743,6 +861,9 @@ L(Exit14):
L(Exit15):
mov (%rsi), %rcx
mov 7(%rsi), %rdx
+# ifdef __CHKP__
+ bndcu 14(%rdi), %bnd0
+# endif
mov %rcx, (%rdi)
mov %rdx, 7(%rdi)
# ifdef USE_AS_STPCPY
@@ -758,6 +879,9 @@ L(Exit15):
.p2align 4
L(Exit16):
movdqu (%rsi), %xmm0
+# ifdef __CHKP__
+ bndcu 15(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
# ifdef USE_AS_STPCPY
lea 15(%rdi), %rax
@@ -772,6 +896,9 @@ L(Exit16):
.p2align 4
L(Exit17):
movdqu (%rsi), %xmm0
+# ifdef __CHKP__
+ bndcu 16(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %dh, 16(%rdi)
# ifdef USE_AS_STPCPY
@@ -788,6 +915,9 @@ L(Exit17):
L(Exit18):
movdqu (%rsi), %xmm0
mov 16(%rsi), %cx
+# ifdef __CHKP__
+ bndcu 17(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %cx, 16(%rdi)
# ifdef USE_AS_STPCPY
@@ -804,6 +934,9 @@ L(Exit18):
L(Exit19):
movdqu (%rsi), %xmm0
mov 15(%rsi), %ecx
+# ifdef __CHKP__
+ bndcu 18(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %ecx, 15(%rdi)
# ifdef USE_AS_STPCPY
@@ -820,6 +953,9 @@ L(Exit19):
L(Exit20):
movdqu (%rsi), %xmm0
mov 16(%rsi), %ecx
+# ifdef __CHKP__
+ bndcu 19(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %ecx, 16(%rdi)
# ifdef USE_AS_STPCPY
@@ -836,6 +972,9 @@ L(Exit20):
L(Exit21):
movdqu (%rsi), %xmm0
mov 16(%rsi), %ecx
+# ifdef __CHKP__
+ bndcu 20(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %ecx, 16(%rdi)
mov %dh, 20(%rdi)
@@ -853,6 +992,9 @@ L(Exit21):
L(Exit22):
movdqu (%rsi), %xmm0
mov 14(%rsi), %rcx
+# ifdef __CHKP__
+ bndcu 21(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %rcx, 14(%rdi)
# ifdef USE_AS_STPCPY
@@ -869,6 +1011,9 @@ L(Exit22):
L(Exit23):
movdqu (%rsi), %xmm0
mov 15(%rsi), %rcx
+# ifdef __CHKP__
+ bndcu 22(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %rcx, 15(%rdi)
# ifdef USE_AS_STPCPY
@@ -885,6 +1030,9 @@ L(Exit23):
L(Exit24):
movdqu (%rsi), %xmm0
mov 16(%rsi), %rcx
+# ifdef __CHKP__
+ bndcu 23(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %rcx, 16(%rdi)
# ifdef USE_AS_STPCPY
@@ -901,6 +1049,9 @@ L(Exit24):
L(Exit25):
movdqu (%rsi), %xmm0
mov 16(%rsi), %rcx
+# ifdef __CHKP__
+ bndcu 24(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %rcx, 16(%rdi)
mov %dh, 24(%rdi)
@@ -919,6 +1070,9 @@ L(Exit26):
movdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 24(%rsi), %cx
+# ifdef __CHKP__
+ bndcu 25(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %cx, 24(%rdi)
@@ -937,6 +1091,9 @@ L(Exit27):
movdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 23(%rsi), %ecx
+# ifdef __CHKP__
+ bndcu 26(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %ecx, 23(%rdi)
@@ -955,6 +1112,9 @@ L(Exit28):
movdqu (%rsi), %xmm0
mov 16(%rsi), %rdx
mov 24(%rsi), %ecx
+# ifdef __CHKP__
+ bndcu 27(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
mov %rdx, 16(%rdi)
mov %ecx, 24(%rdi)
@@ -972,6 +1132,9 @@ L(Exit28):
L(Exit29):
movdqu (%rsi), %xmm0
movdqu 13(%rsi), %xmm2
+# ifdef __CHKP__
+ bndcu 28(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
movdqu %xmm2, 13(%rdi)
# ifdef USE_AS_STPCPY
@@ -988,6 +1151,9 @@ L(Exit29):
L(Exit30):
movdqu (%rsi), %xmm0
movdqu 14(%rsi), %xmm2
+# ifdef __CHKP__
+ bndcu 29(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
movdqu %xmm2, 14(%rdi)
# ifdef USE_AS_STPCPY
@@ -1004,6 +1170,9 @@ L(Exit30):
L(Exit31):
movdqu (%rsi), %xmm0
movdqu 15(%rsi), %xmm2
+# ifdef __CHKP__
+ bndcu 30(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
movdqu %xmm2, 15(%rdi)
# ifdef USE_AS_STPCPY
@@ -1020,6 +1189,9 @@ L(Exit31):
L(Exit32):
movdqu (%rsi), %xmm0
movdqu 16(%rsi), %xmm2
+# ifdef __CHKP__
+ bndcu 31(%rdi), %bnd0
+# endif
movdqu %xmm0, (%rdi)
movdqu %xmm2, 16(%rdi)
# ifdef USE_AS_STPCPY
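
Every Exit-N block in this file stores exactly N bytes at the destination, and the inserted bndcu probes byte N-1, the last one written, against %bnd0; the source reads stay unchecked here, presumably because the preceding pcmpeqb scan already found the terminator within the source bounds. The recurring shape, shown for the 8-byte case with names from the patch:

	.p2align 4
L(Exit8):			/* terminator found at byte 7 */
	mov	(%rsi), %rdx	/* 8-byte read ending at the NUL */
# ifdef __CHKP__
	bndcu	7(%rdi), %bnd0	/* last destination byte written */
# endif
	mov	%rdx, (%rdi)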
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
index 3f92a41ef9..1fed105bf0 100644
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ b/sysdeps/x86_64/multiarch/strrchr.S
@@ -97,6 +97,10 @@ __strrchr_sse42:
CALL_MCOUNT
testb %sil, %sil
je __strend_sse4
+# ifdef __CHKP__
+ bndcl (%rdi), %bnd0
+ bndcu (%rdi), %bnd0
+# endif
xor %eax,%eax /* RAX has the last occurrence of s. */
movd %esi, %xmm1
punpcklbw %xmm1, %xmm1
@@ -135,6 +139,9 @@ L(unaligned_no_byte):
contain the NULL terminator. */
jg L(exit)
addq $16, %r8
+# ifdef __CHKP__
+ bndcu (%r8), %bnd0
+# endif
/* Loop start on aligned string. */
.p2align 4
@@ -142,6 +149,9 @@ L(loop):
pcmpistri $0x4a, (%r8), %xmm1
jbe L(match_or_eos)
addq $16, %r8
+# ifdef __CHKP__
+ bndcu (%r8), %bnd0
+# endif
jmp L(loop)
.p2align 4
L(match_or_eos):
@@ -149,11 +159,17 @@ L(match_or_eos):
L(match_no_eos):
leaq (%r8,%rcx), %rax
addq $16, %r8
+# ifdef __CHKP__
+ bndcu (%r8), %bnd0
+# endif
jmp L(loop)
.p2align 4
L(had_eos):
jnc L(exit)
leaq (%r8,%rcx), %rax
+# ifdef __CHKP__
+ bndcu (%rax), %bnd0
+# endif
.p2align 4
L(exit):
ret
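
In strrchr the scan advances 16 bytes per pcmpistri iteration, so the patch re-probes the upper bound each time %r8 is bumped, keeping the faulting check one step ahead of the load. The loop as it reads with the hunk applied:

L(loop):
	pcmpistri $0x4a, (%r8), %xmm1	/* scan 16 bytes for c or NUL */
	jbe	L(match_or_eos)
	addq	$16, %r8
# ifdef __CHKP__
	bndcu	(%r8), %bnd0		/* next chunk must start in bounds */
# endif
	jmp	L(loop)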
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index b7de092228..77889dd555 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -25,13 +25,27 @@ ENTRY (__wcscpy_ssse3)
mov %rsi, %rcx
mov %rdi, %rdx
+# ifdef __CHKP__
+ bndcl (%rdi), %bnd0
+ bndcl (%rsi), %bnd1
+ bndcu (%rsi), %bnd1
+# endif
cmpl $0, (%rcx)
jz L(Exit4)
+# ifdef __CHKP__
+ bndcu 4(%rcx), %bnd1
+# endif
cmpl $0, 4(%rcx)
jz L(Exit8)
+# ifdef __CHKP__
+ bndcu 8(%rcx), %bnd1
+# endif
cmpl $0, 8(%rcx)
jz L(Exit12)
+# ifdef __CHKP__
+ bndcu 12(%rcx), %bnd1
+# endif
cmpl $0, 12(%rcx)
jz L(Exit16)
@@ -40,10 +54,19 @@ ENTRY (__wcscpy_ssse3)
pxor %xmm0, %xmm0
mov (%rcx), %r9
+# ifdef __CHKP__
+ bndcu 7(%rdx), %bnd0
+# endif
mov %r9, (%rdx)
+# ifdef __CHKP__
+ bndcu (%rsi), %bnd1
+# endif
pcmpeqd (%rsi), %xmm0
mov 8(%rcx), %r9
+# ifdef __CHKP__
+ bndcu 15(%rdx), %bnd0
+# endif
mov %r9, 8(%rdx)
pmovmskb %xmm0, %rax
@@ -72,6 +95,10 @@ ENTRY (__wcscpy_ssse3)
jmp L(Shl12)
L(Align16Both):
+# ifdef __CHKP__
+ bndcu 16(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps (%rcx), %xmm1
movaps 16(%rcx), %xmm2
movaps %xmm1, (%rdx)
@@ -82,6 +109,10 @@ L(Align16Both):
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
+# ifdef __CHKP__
+ bndcu 16(%rcx, %rsi), %bnd1
+ bndcu 15(%rdx, %rsi), %bnd0
+# endif
movaps 16(%rcx, %rsi), %xmm3
movaps %xmm2, (%rdx, %rsi)
pcmpeqd %xmm3, %xmm0
@@ -91,6 +122,10 @@ L(Align16Both):
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
+# ifdef __CHKP__
+ bndcu 16(%rcx, %rsi), %bnd1
+ bndcu 15(%rdx, %rsi), %bnd0
+# endif
movaps 16(%rcx, %rsi), %xmm4
movaps %xmm3, (%rdx, %rsi)
pcmpeqd %xmm4, %xmm0
@@ -100,6 +135,10 @@ L(Align16Both):
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
+# ifdef __CHKP__
+ bndcu 16(%rcx, %rsi), %bnd1
+ bndcu 15(%rdx, %rsi), %bnd0
+# endif
movaps 16(%rcx, %rsi), %xmm1
movaps %xmm4, (%rdx, %rsi)
pcmpeqd %xmm1, %xmm0
@@ -109,6 +148,10 @@ L(Align16Both):
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
+# ifdef __CHKP__
+ bndcu 16(%rcx, %rsi), %bnd1
+ bndcu 15(%rdx, %rsi), %bnd0
+# endif
movaps 16(%rcx, %rsi), %xmm2
movaps %xmm1, (%rdx, %rsi)
pcmpeqd %xmm2, %xmm0
@@ -118,6 +161,10 @@ L(Align16Both):
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
+# ifdef __CHKP__
+ bndcu 16(%rcx, %rsi), %bnd1
+ bndcu 15(%rdx, %rsi), %bnd0
+# endif
movaps 16(%rcx, %rsi), %xmm3
movaps %xmm2, (%rdx, %rsi)
pcmpeqd %xmm3, %xmm0
@@ -127,6 +174,10 @@ L(Align16Both):
test %rax, %rax
jnz L(CopyFrom1To16Bytes)
+# ifdef __CHKP__
+ bndcu 16(%rcx, %rsi), %bnd1
+ bndcu 15(%rdx, %rsi), %bnd0
+# endif
movaps %xmm3, (%rdx, %rsi)
mov %rcx, %rax
lea 16(%rcx, %rsi), %rcx
@@ -138,6 +189,10 @@ L(Align16Both):
.p2align 4
L(Aligned64Loop):
+# ifdef __CHKP__
+ bndcu (%rcx), %bnd1
+ bndcu 63(%rdx), %bnd0
+# endif
movaps (%rcx), %xmm2
movaps %xmm2, %xmm4
movaps 16(%rcx), %xmm5
@@ -168,6 +223,9 @@ L(Aligned64Leave):
pcmpeqd %xmm5, %xmm0
pmovmskb %xmm0, %rax
+# ifdef __CHKP__
+ bndcu -49(%rdx), %bnd0
+# endif
movaps %xmm4, -64(%rdx)
test %rax, %rax
lea 16(%rsi), %rsi
@@ -176,11 +234,17 @@ L(Aligned64Leave):
pcmpeqd %xmm6, %xmm0
pmovmskb %xmm0, %rax
+# ifdef __CHKP__
+ bndcu -33(%rdx), %bnd0
+# endif
movaps %xmm5, -48(%rdx)
test %rax, %rax
lea 16(%rsi), %rsi
jnz L(CopyFrom1To16Bytes)
+# ifdef __CHKP__
+ bndcu -17(%rdx), %bnd0
+# endif
movaps %xmm6, -32(%rdx)
pcmpeqd %xmm7, %xmm0
@@ -190,11 +254,17 @@ L(Aligned64Leave):
jnz L(CopyFrom1To16Bytes)
mov $-0x40, %rsi
+# ifdef __CHKP__
+ bndcu -1(%rdx), %bnd0
+# endif
movaps %xmm7, -16(%rdx)
jmp L(Aligned64Loop)
.p2align 4
L(Shl4):
+# ifdef __CHKP__
+ bndcu 12(%rcx), %bnd1
+# endif
movaps -4(%rcx), %xmm1
movaps 12(%rcx), %xmm2
L(Shl4Start):
@@ -206,6 +276,10 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
+# ifdef __CHKP__
+ bndcu 28(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -219,6 +293,10 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm3, %xmm2
+# ifdef __CHKP__
+ bndcu 28(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -232,6 +310,10 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
+# ifdef __CHKP__
+ bndcu 28(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 28(%rcx), %xmm2
@@ -244,6 +326,9 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm3, %xmm2
+# ifdef __CHKP__
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
lea 28(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -258,6 +343,9 @@ L(Shl4Start):
.p2align 4
L(Shl4LoopStart):
+# ifdef __CHKP__
+ bndcu 12(%rcx), %bnd1
+# endif
movaps 12(%rcx), %xmm2
movaps 28(%rcx), %xmm3
movaps %xmm3, %xmm6
@@ -279,6 +367,9 @@ L(Shl4LoopStart):
lea 64(%rcx), %rcx
palignr $4, %xmm1, %xmm2
movaps %xmm7, %xmm1
+# ifdef __CHKP__
+ bndcu 63(%rdx), %bnd0
+# endif
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
@@ -287,6 +378,10 @@ L(Shl4LoopStart):
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
+# ifdef __CHKP__
+ bndcu -4(%rcx), %bnd1
+ bndcu 11(%rdx), %bnd0
+# endif
movdqu -4(%rcx), %xmm1
mov $12, %rsi
movdqu %xmm1, -4(%rdx)
@@ -294,6 +389,9 @@ L(Shl4LoopExit):
.p2align 4
L(Shl8):
+# ifdef __CHKP__
+ bndcu 8(%rcx), %bnd1
+# endif
movaps -8(%rcx), %xmm1
movaps 8(%rcx), %xmm2
L(Shl8Start):
@@ -305,6 +403,10 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
+# ifdef __CHKP__
+ bndcu 24(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -318,6 +420,10 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm3, %xmm2
+# ifdef __CHKP__
+ bndcu 24(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -331,6 +437,10 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
+# ifdef __CHKP__
+ bndcu 24(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 24(%rcx), %xmm2
@@ -343,6 +453,10 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm3, %xmm2
+# ifdef __CHKP__
+ bndcu 24(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
lea 24(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -357,6 +471,9 @@ L(Shl8Start):
.p2align 4
L(Shl8LoopStart):
+# ifdef __CHKP__
+ bndcu 8(%rcx), %bnd1
+# endif
movaps 8(%rcx), %xmm2
movaps 24(%rcx), %xmm3
movaps %xmm3, %xmm6
@@ -378,6 +495,9 @@ L(Shl8LoopStart):
lea 64(%rcx), %rcx
palignr $8, %xmm1, %xmm2
movaps %xmm7, %xmm1
+# ifdef __CHKP__
+ bndcu 63(%rdx), %bnd0
+# endif
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
@@ -386,6 +506,10 @@ L(Shl8LoopStart):
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
+# ifdef __CHKP__
+ bndcu (%rcx), %bnd1
+ bndcu 7(%rdx), %bnd0
+# endif
mov (%rcx), %r9
mov $8, %rsi
mov %r9, (%rdx)
@@ -393,6 +517,9 @@ L(Shl8LoopExit):
.p2align 4
L(Shl12):
+# ifdef __CHKP__
+ bndcu 4(%rcx), %bnd1
+# endif
movaps -12(%rcx), %xmm1
movaps 4(%rcx), %xmm2
L(Shl12Start):
@@ -404,6 +531,10 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
+# ifdef __CHKP__
+ bndcu 20(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -417,6 +548,10 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm3, %xmm2
+# ifdef __CHKP__
+ bndcu 20(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -430,6 +565,10 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
+# ifdef __CHKP__
+ bndcu 20(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
movaps 20(%rcx), %xmm2
@@ -442,6 +581,10 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm3, %xmm2
+# ifdef __CHKP__
+ bndcu 20(%rcx), %bnd1
+ bndcu 15(%rdx), %bnd0
+# endif
movaps %xmm2, (%rdx)
lea 20(%rcx), %rcx
lea 16(%rdx), %rdx
@@ -456,6 +599,9 @@ L(Shl12Start):
.p2align 4
L(Shl12LoopStart):
+# ifdef __CHKP__
+ bndcu 4(%rcx), %bnd1
+# endif
movaps 4(%rcx), %xmm2
movaps 20(%rcx), %xmm3
movaps %xmm3, %xmm6
@@ -476,6 +622,9 @@ L(Shl12LoopStart):
lea 64(%rcx), %rcx
palignr $12, %xmm1, %xmm2
movaps %xmm7, %xmm1
+# ifdef __CHKP__
+ bndcu 63(%rdx), %bnd0
+# endif
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
@@ -484,6 +633,10 @@ L(Shl12LoopStart):
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
+# ifdef __CHKP__
+ bndcu (%rcx), %bnd1
+ bndcu 3(%rdx), %bnd0
+# endif
mov (%rcx), %r9d
mov $4, %rsi
mov %r9d, (%rdx)
@@ -500,6 +653,9 @@ L(CopyFrom1To16Bytes):
jnz L(Exit4)
mov (%rcx), %rax
+# ifdef __CHKP__
+ bndcu 7(%rdx), %bnd0
+# endif
mov %rax, (%rdx)
mov %rdi, %rax
ret
@@ -510,6 +666,9 @@ L(ExitHigh):
jnz L(Exit12)
mov (%rcx), %rax
+# ifdef __CHKP__
+ bndcu 15(%rdx), %bnd0
+# endif
mov %rax, (%rdx)
mov 8(%rcx), %rax
mov %rax, 8(%rdx)
@@ -519,6 +678,9 @@ L(ExitHigh):
.p2align 4
L(Exit4):
movl (%rcx), %eax
+# ifdef __CHKP__
+ bndcu 3(%rdx), %bnd0
+# endif
movl %eax, (%rdx)
mov %rdi, %rax
ret
@@ -526,6 +688,9 @@ L(Exit4):
.p2align 4
L(Exit8):
mov (%rcx), %rax
+# ifdef __CHKP__
+ bndcu 7(%rdx), %bnd0
+# endif
mov %rax, (%rdx)
mov %rdi, %rax
ret
@@ -533,6 +698,9 @@ L(Exit8):
.p2align 4
L(Exit12):
mov (%rcx), %rax
+# ifdef __CHKP__
+ bndcu 11(%rdx), %bnd0
+# endif
mov %rax, (%rdx)
mov 8(%rcx), %eax
mov %eax, 8(%rdx)
@@ -542,6 +710,9 @@ L(Exit12):
.p2align 4
L(Exit16):
mov (%rcx), %rax
+# ifdef __CHKP__
+ bndcu 15(%rdx), %bnd0
+# endif
mov %rax, (%rdx)
mov 8(%rcx), %rax
mov %rax, 8(%rdx)
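
wcscpy operates on 4-byte wchar_t units, so its probes step by 4: the entry sequence checks 4(%rcx), 8(%rcx) and 12(%rcx) immediately before the corresponding cmpl, and the ExitN blocks probe the last byte of the 4-, 8-, 12- or 16-byte store. A sketch of the entry, with %rcx/%rdx being the copies of %rsi/%rdi the function makes first:

# ifdef __CHKP__
	bndcl	(%rdi), %bnd0	/* destination starts in bounds */
	bndcl	(%rsi), %bnd1	/* source lower bound */
	bndcu	(%rsi), %bnd1	/* first source byte vs. upper bound */
# endif
	cmpl	$0, (%rcx)	/* empty string: copy just the terminator */
	jz	L(Exit4)
# ifdef __CHKP__
	bndcu	4(%rcx), %bnd1	/* first byte of the next wchar_t */
# endif
	cmpl	$0, 4(%rcx)
	jz	L(Exit8)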