8 files changed, 806 insertions, 234 deletions
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index e65d1cd89e28..855115ace98c 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -24,6 +24,20 @@
 #include <asm/barrier.h>
 
 #define ATOMIC_INIT(i)	{ (i) }
+
+#define __atomic_op_acquire(op, args...)				\
+({									\
+	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
+	__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory");	\
+	__ret;								\
+})
+
+#define __atomic_op_release(op, args...)				\
+({									\
+	__asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory");	\
+	op##_relaxed(args);						\
+})
+
 static __always_inline int atomic_read(const atomic_t *v)
 {
 	return READ_ONCE(v->counter);
@@ -50,22 +64,23 @@ static __always_inline void atomic64_set(atomic64_t *v, long i)
  * have the AQ or RL bits set.  These don't return anything, so there's only
  * one version to worry about.
  */
-#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)				\
-static __always_inline void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)	\
-{											\
-	__asm__ __volatile__ (								\
-		"amo" #asm_op "." #asm_type " zero, %1, %0"				\
-		: "+A" (v->counter)							\
-		: "r" (I)								\
-		: "memory");								\
-}
+#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)		\
+static __always_inline							\
+void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)		\
+{									\
+	__asm__ __volatile__ (						\
+		"	amo" #asm_op "." #asm_type " zero, %1, %0"	\
+		: "+A" (v->counter)					\
+		: "r" (I)						\
+		: "memory");						\
+}									\
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, I)			\
+#define ATOMIC_OPS(op, asm_op, I)					\
         ATOMIC_OP (op, asm_op, I, w,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, I)			\
-        ATOMIC_OP (op, asm_op, I, w,  int,   )	\
+#define ATOMIC_OPS(op, asm_op, I)					\
+        ATOMIC_OP (op, asm_op, I, w,  int,   )				\
         ATOMIC_OP (op, asm_op, I, d, long, 64)
 #endif
 
@@ -79,75 +94,115 @@ ATOMIC_OPS(xor, xor,  i)
 #undef ATOMIC_OPS
 
 /*
- * Atomic ops that have ordered, relaxed, acquire, and relese variants.
+ * Atomic ops that have ordered, relaxed, acquire, and release variants.
  * There's two flavors of these: the arithmatic ops have both fetch and return
  * versions, while the logical ops only have fetch versions.
  */
-#define ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, asm_type, c_type, prefix)				\
-static __always_inline c_type atomic##prefix##_fetch_##op##c_or(c_type i, atomic##prefix##_t *v)	\
-{													\
-	register c_type ret;										\
-	__asm__ __volatile__ (										\
-		"amo" #asm_op "." #asm_type #asm_or " %1, %2, %0"					\
-		: "+A" (v->counter), "=r" (ret)								\
-		: "r" (I)										\
-		: "memory");										\
-	return ret;											\
+#define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix)	\
+static __always_inline							\
+c_type atomic##prefix##_fetch_##op##_relaxed(c_type i,			\
+					     atomic##prefix##_t *v)	\
+{									\
+	register c_type ret;						\
+	__asm__ __volatile__ (						\
+		"	amo" #asm_op "." #asm_type " %1, %2, %0"	\
+		: "+A" (v->counter), "=r" (ret)				\
+		: "r" (I)						\
+		: "memory");						\
+	return ret;							\
+}									\
+static __always_inline							\
+c_type atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v)	\
+{									\
+	register c_type ret;						\
+	__asm__ __volatile__ (						\
+		"	amo" #asm_op "." #asm_type ".aqrl  %1, %2, %0"	\
+		: "+A" (v->counter), "=r" (ret)				\
+		: "r" (I)						\
+		: "memory");						\
+	return ret;							\
 }
 
-#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, asm_type, c_type, prefix)			\
-static __always_inline c_type atomic##prefix##_##op##_return##c_or(c_type i, atomic##prefix##_t *v)	\
-{													\
-        return atomic##prefix##_fetch_##op##c_or(i, v) c_op I;						\
+#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix)	\
+static __always_inline							\
+c_type atomic##prefix##_##op##_return_relaxed(c_type i,			\
+					      atomic##prefix##_t *v)	\
+{									\
+        return atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I;	\
+}									\
+static __always_inline							\
+c_type atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v)	\
+{									\
+        return atomic##prefix##_fetch_##op(i, v) c_op I;		\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )
+#define ATOMIC_OPS(op, asm_op, c_op, I)					\
+        ATOMIC_FETCH_OP( op, asm_op,       I, w,  int,   )		\
+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, w,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, d, long, 64)	\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, d, long, 64)
+#define ATOMIC_OPS(op, asm_op, c_op, I)					\
+        ATOMIC_FETCH_OP( op, asm_op,       I, w,  int,   )		\
+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, w,  int,   )		\
+        ATOMIC_FETCH_OP( op, asm_op,       I, d, long, 64)		\
+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, d, long, 64)
 #endif
 
-ATOMIC_OPS(add, add, +,  i,      , _relaxed)
-ATOMIC_OPS(add, add, +,  i, .aq  , _acquire)
-ATOMIC_OPS(add, add, +,  i, .rl  , _release)
-ATOMIC_OPS(add, add, +,  i, .aqrl,         )
+ATOMIC_OPS(add, add, +,  i)
+ATOMIC_OPS(sub, add, +, -i)
+
+#define atomic_add_return_relaxed	atomic_add_return_relaxed
+#define atomic_sub_return_relaxed	atomic_sub_return_relaxed
+#define atomic_add_return		atomic_add_return
+#define atomic_sub_return		atomic_sub_return
 
-ATOMIC_OPS(sub, add, +, -i,      , _relaxed)
-ATOMIC_OPS(sub, add, +, -i, .aq  , _acquire)
-ATOMIC_OPS(sub, add, +, -i, .rl  , _release)
-ATOMIC_OPS(sub, add, +, -i, .aqrl,         )
+#define atomic_fetch_add_relaxed	atomic_fetch_add_relaxed
+#define atomic_fetch_sub_relaxed	atomic_fetch_sub_relaxed
+#define atomic_fetch_add		atomic_fetch_add
+#define atomic_fetch_sub		atomic_fetch_sub
+
+#ifndef CONFIG_GENERIC_ATOMIC64
+#define atomic64_add_return_relaxed	atomic64_add_return_relaxed
+#define atomic64_sub_return_relaxed	atomic64_sub_return_relaxed
+#define atomic64_add_return		atomic64_add_return
+#define atomic64_sub_return		atomic64_sub_return
+
+#define atomic64_fetch_add_relaxed	atomic64_fetch_add_relaxed
+#define atomic64_fetch_sub_relaxed	atomic64_fetch_sub_relaxed
+#define atomic64_fetch_add		atomic64_fetch_add
+#define atomic64_fetch_sub		atomic64_fetch_sub
+#endif
 
 #undef ATOMIC_OPS
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )
+#define ATOMIC_OPS(op, asm_op, I)					\
+        ATOMIC_FETCH_OP(op, asm_op, I, w,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, d, long, 64)
+#define ATOMIC_OPS(op, asm_op, I)					\
+        ATOMIC_FETCH_OP(op, asm_op, I, w,  int,   )			\
+        ATOMIC_FETCH_OP(op, asm_op, I, d, long, 64)
 #endif
 
-ATOMIC_OPS(and, and, i,      , _relaxed)
-ATOMIC_OPS(and, and, i, .aq  , _acquire)
-ATOMIC_OPS(and, and, i, .rl  , _release)
-ATOMIC_OPS(and, and, i, .aqrl,         )
+ATOMIC_OPS(and, and, i)
+ATOMIC_OPS( or,  or, i)
+ATOMIC_OPS(xor, xor, i)
 
-ATOMIC_OPS( or,  or, i,      , _relaxed)
-ATOMIC_OPS( or,  or, i, .aq  , _acquire)
-ATOMIC_OPS( or,  or, i, .rl  , _release)
-ATOMIC_OPS( or,  or, i, .aqrl,         )
+#define atomic_fetch_and_relaxed	atomic_fetch_and_relaxed
+#define atomic_fetch_or_relaxed		atomic_fetch_or_relaxed
+#define atomic_fetch_xor_relaxed	atomic_fetch_xor_relaxed
+#define atomic_fetch_and		atomic_fetch_and
+#define atomic_fetch_or			atomic_fetch_or
+#define atomic_fetch_xor		atomic_fetch_xor
 
-ATOMIC_OPS(xor, xor, i,      , _relaxed)
-ATOMIC_OPS(xor, xor, i, .aq  , _acquire)
-ATOMIC_OPS(xor, xor, i, .rl  , _release)
-ATOMIC_OPS(xor, xor, i, .aqrl,         )
+#ifndef CONFIG_GENERIC_ATOMIC64
+#define atomic64_fetch_and_relaxed	atomic64_fetch_and_relaxed
+#define atomic64_fetch_or_relaxed	atomic64_fetch_or_relaxed
+#define atomic64_fetch_xor_relaxed	atomic64_fetch_xor_relaxed
+#define atomic64_fetch_and		atomic64_fetch_and
+#define atomic64_fetch_or		atomic64_fetch_or
+#define atomic64_fetch_xor		atomic64_fetch_xor
+#endif
 
 #undef ATOMIC_OPS
 
@@ -157,22 +212,24 @@ ATOMIC_OPS(xor, xor, i, .aqrl,         )
 /*
  * The extra atomic operations that are constructed from one of the core
  * AMO-based operations above (aside from sub, which is easier to fit above).
- * These are required to perform a barrier, but they're OK this way because
- * atomic_*_return is also required to perform a barrier.
+ * These are required to perform a full barrier, but they're OK this way
+ * because atomic_*_return is also required to perform a full barrier.
+ *
  */
-#define ATOMIC_OP(op, func_op, comp_op, I, c_type, prefix)			\
-static __always_inline bool atomic##prefix##_##op(c_type i, atomic##prefix##_t *v) \
-{										\
-	return atomic##prefix##_##func_op##_return(i, v) comp_op I;		\
+#define ATOMIC_OP(op, func_op, comp_op, I, c_type, prefix)		\
+static __always_inline							\
+bool atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)		\
+{									\
+	return atomic##prefix##_##func_op##_return(i, v) comp_op I;	\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, func_op, comp_op, I)			\
-        ATOMIC_OP (op, func_op, comp_op, I,  int,   )
+#define ATOMIC_OPS(op, func_op, comp_op, I)				\
+        ATOMIC_OP(op, func_op, comp_op, I,  int,   )
 #else
-#define ATOMIC_OPS(op, func_op, comp_op, I)			\
-        ATOMIC_OP (op, func_op, comp_op, I,  int,   )		\
-        ATOMIC_OP (op, func_op, comp_op, I, long, 64)
+#define ATOMIC_OPS(op, func_op, comp_op, I)				\
+        ATOMIC_OP(op, func_op, comp_op, I,  int,   )			\
+        ATOMIC_OP(op, func_op, comp_op, I, long, 64)
 #endif
 
 ATOMIC_OPS(add_and_test, add, ==, 0)
@@ -182,51 +239,87 @@ ATOMIC_OPS(add_negative, add,  <, 0)
 #undef ATOMIC_OP
 #undef ATOMIC_OPS
 
-#define ATOMIC_OP(op, func_op, I, c_type, prefix)				\
-static __always_inline void atomic##prefix##_##op(atomic##prefix##_t *v)	\
-{										\
-	atomic##prefix##_##func_op(I, v);					\
+#define ATOMIC_OP(op, func_op, I, c_type, prefix)			\
+static __always_inline							\
+void atomic##prefix##_##op(atomic##prefix##_t *v)			\
+{									\
+	atomic##prefix##_##func_op(I, v);				\
 }
 
-#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)					\
-static __always_inline c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v)	\
-{											\
-	return atomic##prefix##_fetch_##func_op(I, v);					\
+#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)			\
+static __always_inline							\
+c_type atomic##prefix##_fetch_##op##_relaxed(atomic##prefix##_t *v)	\
+{									\
+	return atomic##prefix##_fetch_##func_op##_relaxed(I, v);	\
+}									\
+static __always_inline							\
+c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v)		\
+{									\
+	return atomic##prefix##_fetch_##func_op(I, v);			\
 }
 
-#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, c_type, prefix)				\
-static __always_inline c_type atomic##prefix##_##op##_return(atomic##prefix##_t *v)	\
-{											\
-        return atomic##prefix##_fetch_##op(v) c_op I;					\
+#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, c_type, prefix)		\
+static __always_inline							\
+c_type atomic##prefix##_##op##_return_relaxed(atomic##prefix##_t *v)	\
+{									\
+        return atomic##prefix##_fetch_##op##_relaxed(v) c_op I;		\
+}									\
+static __always_inline							\
+c_type atomic##prefix##_##op##_return(atomic##prefix##_t *v)		\
+{									\
+        return atomic##prefix##_fetch_##op(v) c_op I;			\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, c_op, I)						\
-        ATOMIC_OP       (op, asm_op,       I,  int,   )				\
-        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )				\
+#define ATOMIC_OPS(op, asm_op, c_op, I)					\
+        ATOMIC_OP(       op, asm_op,       I,  int,   )			\
+        ATOMIC_FETCH_OP( op, asm_op,       I,  int,   )			\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, c_op, I)						\
-        ATOMIC_OP       (op, asm_op,       I,  int,   )				\
-        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )				\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )				\
-        ATOMIC_OP       (op, asm_op,       I, long, 64)				\
-        ATOMIC_FETCH_OP (op, asm_op,       I, long, 64)				\
+#define ATOMIC_OPS(op, asm_op, c_op, I)					\
+        ATOMIC_OP(       op, asm_op,       I,  int,   )			\
+        ATOMIC_FETCH_OP( op, asm_op,       I,  int,   )			\
+        ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )			\
+        ATOMIC_OP(       op, asm_op,       I, long, 64)			\
+        ATOMIC_FETCH_OP( op, asm_op,       I, long, 64)			\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I, long, 64)
 #endif
 
 ATOMIC_OPS(inc, add, +,  1)
 ATOMIC_OPS(dec, add, +, -1)
 
+#define atomic_inc_return_relaxed	atomic_inc_return_relaxed
+#define atomic_dec_return_relaxed	atomic_dec_return_relaxed
+#define atomic_inc_return		atomic_inc_return
+#define atomic_dec_return		atomic_dec_return
+
+#define atomic_fetch_inc_relaxed	atomic_fetch_inc_relaxed
+#define atomic_fetch_dec_relaxed	atomic_fetch_dec_relaxed
+#define atomic_fetch_inc		atomic_fetch_inc
+#define atomic_fetch_dec		atomic_fetch_dec
+
+#ifndef CONFIG_GENERIC_ATOMIC64
+#define atomic64_inc_return_relaxed	atomic64_inc_return_relaxed
+#define atomic64_dec_return_relaxed	atomic64_dec_return_relaxed
+#define atomic64_inc_return		atomic64_inc_return
+#define atomic64_dec_return		atomic64_dec_return
+
+#define atomic64_fetch_inc_relaxed	atomic64_fetch_inc_relaxed
+#define atomic64_fetch_dec_relaxed	atomic64_fetch_dec_relaxed
+#define atomic64_fetch_inc		atomic64_fetch_inc
+#define atomic64_fetch_dec		atomic64_fetch_dec
+#endif
+
 #undef ATOMIC_OPS
 #undef ATOMIC_OP
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 
-#define ATOMIC_OP(op, func_op, comp_op, I, prefix)				\
-static __always_inline bool atomic##prefix##_##op(atomic##prefix##_t *v)	\
-{										\
-	return atomic##prefix##_##func_op##_return(v) comp_op I;		\
+#define ATOMIC_OP(op, func_op, comp_op, I, prefix)			\
+static __always_inline							\
+bool atomic##prefix##_##op(atomic##prefix##_t *v)			\
+{									\
+	return atomic##prefix##_##func_op##_return(v) comp_op I;	\
 }
 
 ATOMIC_OP(inc_and_test, inc, ==, 0,   )
@@ -238,19 +331,19 @@ ATOMIC_OP(dec_and_test, dec, ==, 0, 64)
 
 #undef ATOMIC_OP
 
-/* This is required to provide a barrier on success. */
+/* This is required to provide a full barrier on success. */
 static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
        int prev, rc;
 
 	__asm__ __volatile__ (
-		"0:\n\t"
-		"lr.w.aqrl  %[p],  %[c]\n\t"
-		"beq        %[p],  %[u], 1f\n\t"
-		"add       %[rc],  %[p], %[a]\n\t"
-		"sc.w.aqrl %[rc], %[rc], %[c]\n\t"
-		"bnez      %[rc], 0b\n\t"
-		"1:"
+		"0:	lr.w     %[p],  %[c]\n"
+		"	beq      %[p],  %[u], 1f\n"
+		"	add      %[rc], %[p], %[a]\n"
+		"	sc.w.rl  %[rc], %[rc], %[c]\n"
+		"	bnez     %[rc], 0b\n"
+		"	fence    rw, rw\n"
+		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [a]"r" (a), [u]"r" (u)
 		: "memory");
@@ -263,13 +356,13 @@ static __always_inline long __atomic64_add_unless(atomic64_t *v, long a, long u)
        long prev, rc;
 
 	__asm__ __volatile__ (
-		"0:\n\t"
-		"lr.d.aqrl  %[p],  %[c]\n\t"
-		"beq        %[p],  %[u], 1f\n\t"
-		"add       %[rc],  %[p], %[a]\n\t"
-		"sc.d.aqrl %[rc], %[rc], %[c]\n\t"
-		"bnez      %[rc], 0b\n\t"
-		"1:"
+		"0:	lr.d     %[p],  %[c]\n"
+		"	beq      %[p],  %[u], 1f\n"
+		"	add      %[rc], %[p], %[a]\n"
+		"	sc.d.rl  %[rc], %[rc], %[c]\n"
+		"	bnez     %[rc], 0b\n"
+		"	fence    rw, rw\n"
+		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [a]"r" (a), [u]"r" (u)
 		: "memory");
@@ -300,37 +393,63 @@ static __always_inline long atomic64_inc_not_zero(atomic64_t *v)
 
 /*
  * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as
- * {cmp,}xchg and the operations that return, so they need a barrier.
- */
-/*
- * FIXME: atomic_cmpxchg_{acquire,release,relaxed} are all implemented by
- * assigning the same barrier to both the LR and SC operations, but that might
- * not make any sense.  We're waiting on a memory model specification to
- * determine exactly what the right thing to do is here.
+ * {cmp,}xchg and the operations that return, so they need a full barrier.
  */
-#define ATOMIC_OP(c_t, prefix, c_or, size, asm_or)						\
-static __always_inline c_t atomic##prefix##_cmpxchg##c_or(atomic##prefix##_t *v, c_t o, c_t n) 	\
-{												\
-	return __cmpxchg(&(v->counter), o, n, size, asm_or, asm_or);				\
-}												\
-static __always_inline c_t atomic##prefix##_xchg##c_or(atomic##prefix##_t *v, c_t n) 		\
-{												\
-	return __xchg(n, &(v->counter), size, asm_or);						\
+#define ATOMIC_OP(c_t, prefix, size)					\
+static __always_inline							\
+c_t atomic##prefix##_xchg_relaxed(atomic##prefix##_t *v, c_t n)		\
+{									\
+	return __xchg_relaxed(&(v->counter), n, size);			\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_xchg_acquire(atomic##prefix##_t *v, c_t n)		\
+{									\
+	return __xchg_acquire(&(v->counter), n, size);			\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_xchg_release(atomic##prefix##_t *v, c_t n)		\
+{									\
+	return __xchg_release(&(v->counter), n, size);			\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n)			\
+{									\
+	return __xchg(&(v->counter), n, size);				\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_cmpxchg_relaxed(atomic##prefix##_t *v,		\
+				     c_t o, c_t n)			\
+{									\
+	return __cmpxchg_relaxed(&(v->counter), o, n, size);		\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_cmpxchg_acquire(atomic##prefix##_t *v,		\
+				     c_t o, c_t n)			\
+{									\
+	return __cmpxchg_acquire(&(v->counter), o, n, size);		\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_cmpxchg_release(atomic##prefix##_t *v,		\
+				     c_t o, c_t n)			\
+{									\
+	return __cmpxchg_release(&(v->counter), o, n, size);		\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n)	\
+{									\
+	return __cmpxchg(&(v->counter), o, n, size);			\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(c_or, asm_or)			\
-	ATOMIC_OP( int,   , c_or, 4, asm_or)
+#define ATOMIC_OPS()							\
+	ATOMIC_OP( int,   , 4)
 #else
-#define ATOMIC_OPS(c_or, asm_or)			\
-	ATOMIC_OP( int,   , c_or, 4, asm_or)		\
-	ATOMIC_OP(long, 64, c_or, 8, asm_or)
+#define ATOMIC_OPS()							\
+	ATOMIC_OP( int,   , 4)						\
+	ATOMIC_OP(long, 64, 8)
 #endif
 
-ATOMIC_OPS(        , .aqrl)
-ATOMIC_OPS(_acquire,   .aq)
-ATOMIC_OPS(_release,   .rl)
-ATOMIC_OPS(_relaxed,      )
+ATOMIC_OPS()
 
 #undef ATOMIC_OPS
 #undef ATOMIC_OP
@@ -340,13 +459,13 @@ static __always_inline int atomic_sub_if_positive(atomic_t *v, int offset)
        int prev, rc;
 
 	__asm__ __volatile__ (
-		"0:\n\t"
-		"lr.w.aqrl  %[p],  %[c]\n\t"
-		"sub       %[rc],  %[p], %[o]\n\t"
-		"bltz      %[rc],    1f\n\t"
-		"sc.w.aqrl %[rc], %[rc], %[c]\n\t"
-		"bnez      %[rc],    0b\n\t"
-		"1:"
+		"0:	lr.w     %[p],  %[c]\n"
+		"	sub      %[rc], %[p], %[o]\n"
+		"	bltz     %[rc], 1f\n"
+		"	sc.w.rl  %[rc], %[rc], %[c]\n"
+		"	bnez     %[rc], 0b\n"
+		"	fence    rw, rw\n"
+		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [o]"r" (offset)
 		: "memory");
@@ -361,13 +480,13 @@ static __always_inline long atomic64_sub_if_positive(atomic64_t *v, int offset)
        long prev, rc;
 
 	__asm__ __volatile__ (
-		"0:\n\t"
-		"lr.d.aqrl  %[p],  %[c]\n\t"
-		"sub       %[rc],  %[p], %[o]\n\t"
-		"bltz      %[rc],    1f\n\t"
-		"sc.d.aqrl %[rc], %[rc], %[c]\n\t"
-		"bnez      %[rc],    0b\n\t"
-		"1:"
+		"0:	lr.d     %[p],  %[c]\n"
+		"	sub      %[rc], %[p], %[o]\n"
+		"	bltz     %[rc], 1f\n"
+		"	sc.d.rl  %[rc], %[rc], %[c]\n"
+		"	bnez     %[rc], 0b\n"
+		"	fence    rw, rw\n"
+		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [o]"r" (offset)
 		: "memory");
diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h
index 5510366d169a..d4628e4b3a5e 100644
--- a/arch/riscv/include/asm/barrier.h
+++ b/arch/riscv/include/asm/barrier.h
@@ -38,6 +38,21 @@
 #define __smp_rmb()	RISCV_FENCE(r,r)
 #define __smp_wmb()	RISCV_FENCE(w,w)
 
+#define __smp_store_release(p, v)					\
+do {									\
+	compiletime_assert_atomic_type(*p);				\
+	RISCV_FENCE(rw,w);						\
+	WRITE_ONCE(*p, v);						\
+} while (0)
+
+#define __smp_load_acquire(p)						\
+({									\
+	typeof(*p) ___p1 = READ_ONCE(*p);				\
+	compiletime_assert_atomic_type(*p);				\
+	RISCV_FENCE(r,rw);						\
+	___p1;								\
+})
+
 /*
  * This is a very specific barrier: it's currently only used in two places in
  * the kernel, both in the scheduler.  See include/linux/spinlock.h for the two
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index db249dbc7b97..c12833f7b6bd 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -17,45 +17,153 @@
 #include <linux/bug.h>
 
 #include <asm/barrier.h>
+#include <asm/fence.h>
 
-#define __xchg(new, ptr, size, asm_or)				\
-({								\
-	__typeof__(ptr) __ptr = (ptr);				\
-	__typeof__(new) __new = (new);				\
-	__typeof__(*(ptr)) __ret;				\
-	switch (size) {						\
-	case 4:							\
-		__asm__ __volatile__ (				\
-			"amoswap.w" #asm_or " %0, %2, %1"	\
-			: "=r" (__ret), "+A" (*__ptr)		\
-			: "r" (__new)				\
-			: "memory");				\
-		break;						\
-	case 8:							\
-		__asm__ __volatile__ (				\
-			"amoswap.d" #asm_or " %0, %2, %1"	\
-			: "=r" (__ret), "+A" (*__ptr)		\
-			: "r" (__new)				\
-			: "memory");				\
-		break;						\
-	default:						\
-		BUILD_BUG();					\
-	}							\
-	__ret;							\
-})
-
-#define xchg(ptr, x)    (__xchg((x), (ptr), sizeof(*(ptr)), .aqrl))
-
-#define xchg32(ptr, x)				\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);	\
-	xchg((ptr), (x));			\
-})
-
-#define xchg64(ptr, x)				\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);	\
-	xchg((ptr), (x));			\
+#define __xchg_relaxed(ptr, new, size)					\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(new) __new = (new);					\
+	__typeof__(*(ptr)) __ret;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.w %0, %2, %1\n"			\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.d %0, %2, %1\n"			\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define xchg_relaxed(ptr, x)						\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg_relaxed((ptr),			\
+					    _x_, sizeof(*(ptr)));	\
+})
+
+#define __xchg_acquire(ptr, new, size)					\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(new) __new = (new);					\
+	__typeof__(*(ptr)) __ret;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.w %0, %2, %1\n"			\
+			RISCV_ACQUIRE_BARRIER				\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.d %0, %2, %1\n"			\
+			RISCV_ACQUIRE_BARRIER				\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define xchg_acquire(ptr, x)						\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg_acquire((ptr),			\
+					    _x_, sizeof(*(ptr)));	\
+})
+
+#define __xchg_release(ptr, new, size)					\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(new) __new = (new);					\
+	__typeof__(*(ptr)) __ret;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			RISCV_RELEASE_BARRIER				\
+			"	amoswap.w %0, %2, %1\n"			\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			RISCV_RELEASE_BARRIER				\
+			"	amoswap.d %0, %2, %1\n"			\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define xchg_release(ptr, x)						\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg_release((ptr),			\
+					    _x_, sizeof(*(ptr)));	\
+})
+
+#define __xchg(ptr, new, size)						\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(new) __new = (new);					\
+	__typeof__(*(ptr)) __ret;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.w.aqrl %0, %2, %1\n"		\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.d.aqrl %0, %2, %1\n"		\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define xchg(ptr, x)							\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg((ptr), _x_, sizeof(*(ptr)));	\
+})
+
+#define xchg32(ptr, x)							\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
+	xchg((ptr), (x));						\
+})
+
+#define xchg64(ptr, x)							\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	xchg((ptr), (x));						\
 })
 
 /*
@@ -63,7 +171,51 @@
  * store NEW in MEM.  Return the initial value in MEM.  Success is
  * indicated by comparing RETURN with OLD.
  */
-#define __cmpxchg(ptr, old, new, size, lrb, scb)			\
+#define __cmpxchg_relaxed(ptr, old, new, size)				\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	__typeof__(*(ptr)) __ret;					\
+	register unsigned int __rc;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"0:	lr.w %0, %2\n"				\
+			"	bne  %0, %z3, 1f\n"			\
+			"	sc.w %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"0:	lr.d %0, %2\n"				\
+			"	bne %0, %z3, 1f\n"			\
+			"	sc.d %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define cmpxchg_relaxed(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),			\
+					_o_, _n_, sizeof(*(ptr)));	\
+})
+
+#define __cmpxchg_acquire(ptr, old, new, size)				\
 ({									\
 	__typeof__(ptr) __ptr = (ptr);					\
 	__typeof__(*(ptr)) __old = (old);				\
@@ -73,24 +225,24 @@
 	switch (size) {							\
 	case 4:								\
 		__asm__ __volatile__ (					\
-		"0:"							\
-			"lr.w" #scb " %0, %2\n"				\
-			"bne         %0, %z3, 1f\n"			\
-			"sc.w" #lrb " %1, %z4, %2\n"			\
-			"bnez        %1, 0b\n"				\
-		"1:"							\
+			"0:	lr.w %0, %2\n"				\
+			"	bne  %0, %z3, 1f\n"			\
+			"	sc.w %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			RISCV_ACQUIRE_BARRIER				\
+			"1:\n"						\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" (__old), "rJ" (__new)			\
 			: "memory");					\
 		break;							\
 	case 8:								\
 		__asm__ __volatile__ (					\
-		"0:"							\
-			"lr.d" #scb " %0, %2\n"				\
-			"bne         %0, %z3, 1f\n"			\
-			"sc.d" #lrb " %1, %z4, %2\n"			\
-			"bnez        %1, 0b\n"				\
-		"1:"							\
+			"0:	lr.d %0, %2\n"				\
+			"	bne %0, %z3, 1f\n"			\
+			"	sc.d %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			RISCV_ACQUIRE_BARRIER				\
+			"1:\n"						\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" (__old), "rJ" (__new)			\
 			: "memory");					\
@@ -101,34 +253,131 @@
 	__ret;								\
 })
 
-#define cmpxchg(ptr, o, n) \
-	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr)), .aqrl, .aqrl))
+#define cmpxchg_acquire(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_acquire((ptr),			\
+					_o_, _n_, sizeof(*(ptr)));	\
+})
 
-#define cmpxchg_local(ptr, o, n) \
-	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr)), , ))
+#define __cmpxchg_release(ptr, old, new, size)				\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	__typeof__(*(ptr)) __ret;					\
+	register unsigned int __rc;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			RISCV_RELEASE_BARRIER				\
+			"0:	lr.w %0, %2\n"				\
+			"	bne  %0, %z3, 1f\n"			\
+			"	sc.w %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			RISCV_RELEASE_BARRIER				\
+			"0:	lr.d %0, %2\n"				\
+			"	bne %0, %z3, 1f\n"			\
+			"	sc.d %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define cmpxchg_release(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_release((ptr),			\
+					_o_, _n_, sizeof(*(ptr)));	\
+})
+
+#define __cmpxchg(ptr, old, new, size)					\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	__typeof__(*(ptr)) __ret;					\
+	register unsigned int __rc;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"0:	lr.w %0, %2\n"				\
+			"	bne  %0, %z3, 1f\n"			\
+			"	sc.w.rl %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"	fence rw, rw\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"0:	lr.d %0, %2\n"				\
+			"	bne %0, %z3, 1f\n"			\
+			"	sc.d.rl %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"	fence rw, rw\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
 
-#define cmpxchg32(ptr, o, n)			\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);	\
-	cmpxchg((ptr), (o), (n));		\
+#define cmpxchg(ptr, o, n)						\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg((ptr),				\
+				       _o_, _n_, sizeof(*(ptr)));	\
 })
 
-#define cmpxchg32_local(ptr, o, n)		\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);	\
-	cmpxchg_local((ptr), (o), (n));		\
+#define cmpxchg_local(ptr, o, n)					\
+	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
+
+#define cmpxchg32(ptr, o, n)						\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
+	cmpxchg((ptr), (o), (n));					\
 })
 
-#define cmpxchg64(ptr, o, n)			\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);	\
-	cmpxchg((ptr), (o), (n));		\
+#define cmpxchg32_local(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
+	cmpxchg_relaxed((ptr), (o), (n))				\
 })
 
-#define cmpxchg64_local(ptr, o, n)		\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);	\
-	cmpxchg_local((ptr), (o), (n));		\
+#define cmpxchg64(ptr, o, n)						\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg((ptr), (o), (n));					\
+})
+
+#define cmpxchg64_local(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg_relaxed((ptr), (o), (n));				\
 })
 
 #endif /* _ASM_RISCV_CMPXCHG_H */
diff --git a/arch/riscv/include/asm/fence.h b/arch/riscv/include/asm/fence.h
new file mode 100644
index 000000000000..2b443a3a487f
--- /dev/null
+++ b/arch/riscv/include/asm/fence.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_RISCV_FENCE_H
+#define _ASM_RISCV_FENCE_H
+
+#ifdef CONFIG_SMP
+#define RISCV_ACQUIRE_BARRIER		"\tfence r , rw\n"
+#define RISCV_RELEASE_BARRIER		"\tfence rw,  w\n"
+#else
+#define RISCV_ACQUIRE_BARRIER
+#define RISCV_RELEASE_BARRIER
+#endif
+
+#endif	/* _ASM_RISCV_FENCE_H */
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 66d4175eb13e..c6dcc5291f97 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -8,3 +8,59 @@
 #if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_FRAME_POINTER)
 #define HAVE_FUNCTION_GRAPH_FP_TEST
 #endif
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
+
+#define ARCH_SUPPORTS_FTRACE_OPS 1
+#ifndef __ASSEMBLY__
+void _mcount(void);
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	return addr;
+}
+
+struct dyn_arch_ftrace {
+};
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * A general call in RISC-V is a pair of insts:
+ * 1) auipc: setting high-20 pc-related bits to ra register
+ * 2) jalr: setting low-12 offset to ra, jump to ra, and set ra to
+ *          return address (original pc + 4)
+ *
+ * Dynamic ftrace generates probes to call sites, so we must deal with
+ * both auipc and jalr at the same time.
+ */
+
+#define MCOUNT_ADDR		((unsigned long)_mcount)
+#define JALR_SIGN_MASK		(0x00000800)
+#define JALR_OFFSET_MASK	(0x00000fff)
+#define AUIPC_OFFSET_MASK	(0xfffff000)
+#define AUIPC_PAD		(0x00001000)
+#define JALR_SHIFT		20
+#define JALR_BASIC		(0x000080e7)
+#define AUIPC_BASIC		(0x00000097)
+#define NOP4			(0x00000013)
+
+#define make_call(caller, callee, call)					\
+do {									\
+	call[0] = to_auipc_insn((unsigned int)((unsigned long)callee -	\
+				(unsigned long)caller));		\
+	call[1] = to_jalr_insn((unsigned int)((unsigned long)callee -	\
+			       (unsigned long)caller));			\
+} while (0)
+
+#define to_jalr_insn(offset)						\
+	(((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_BASIC)
+
+#define to_auipc_insn(offset)						\
+	((offset & JALR_SIGN_MASK) ?					\
+	(((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_BASIC) :	\
+	((offset & AUIPC_OFFSET_MASK) | AUIPC_BASIC))
+
+/*
+ * Let auipc+jalr be the basic *mcount unit*, so we make it 8 bytes here.
+ */
+#define MCOUNT_INSN_SIZE 8
+#endif
diff --git a/arch/riscv/include/asm/module.h b/arch/riscv/include/asm/module.h
new file mode 100644
index 000000000000..349df33808c4
--- /dev/null
+++ b/arch/riscv/include/asm/module.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2017 Andes Technology Corporation */
+
+#ifndef _ASM_RISCV_MODULE_H
+#define _ASM_RISCV_MODULE_H
+
+#include <asm-generic/module.h>
+
+#define MODULE_ARCH_VERMAGIC    "riscv"
+
+u64 module_emit_got_entry(struct module *mod, u64 val);
+u64 module_emit_plt_entry(struct module *mod, u64 val);
+
+#ifdef CONFIG_MODULE_SECTIONS
+struct mod_section {
+	struct elf64_shdr *shdr;
+	int num_entries;
+	int max_entries;
+};
+
+struct mod_arch_specific {
+	struct mod_section got;
+	struct mod_section plt;
+	struct mod_section got_plt;
+};
+
+struct got_entry {
+	u64 symbol_addr;	/* the real variable address */
+};
+
+static inline struct got_entry emit_got_entry(u64 val)
+{
+	return (struct got_entry) {val};
+}
+
+static inline struct got_entry *get_got_entry(u64 val,
+					      const struct mod_section *sec)
+{
+	struct got_entry *got = (struct got_entry *)sec->shdr->sh_addr;
+	int i;
+	for (i = 0; i < sec->num_entries; i++) {
+		if (got[i].symbol_addr == val)
+			return &got[i];
+	}
+	return NULL;
+}
+
+struct plt_entry {
+	/*
+	 * Trampoline code to real target address. The return address
+	 * should be the original (pc+4) before entring plt entry.
+	 */
+	u32 insn_auipc;		/* auipc t0, 0x0                       */
+	u32 insn_ld;		/* ld    t1, 0x10(t0)                  */
+	u32 insn_jr;		/* jr    t1                            */
+};
+
+#define OPC_AUIPC  0x0017
+#define OPC_LD     0x3003
+#define OPC_JALR   0x0067
+#define REG_T0     0x5
+#define REG_T1     0x6
+
+static inline struct plt_entry emit_plt_entry(u64 val, u64 plt, u64 got_plt)
+{
+	/*
+	 * U-Type encoding:
+	 * +------------+----------+----------+
+	 * | imm[31:12] | rd[11:7] | opc[6:0] |
+	 * +------------+----------+----------+
+	 *
+	 * I-Type encoding:
+	 * +------------+------------+--------+----------+----------+
+	 * | imm[31:20] | rs1[19:15] | funct3 | rd[11:7] | opc[6:0] |
+	 * +------------+------------+--------+----------+----------+
+	 *
+	 */
+	u64 offset = got_plt - plt;
+	u32 hi20 = (offset + 0x800) & 0xfffff000;
+	u32 lo12 = (offset - hi20);
+	return (struct plt_entry) {
+		OPC_AUIPC | (REG_T0 << 7) | hi20,
+		OPC_LD | (lo12 << 20) | (REG_T0 << 15) | (REG_T1 << 7),
+		OPC_JALR | (REG_T1 << 15)
+	};
+}
+
+static inline int get_got_plt_idx(u64 val, const struct mod_section *sec)
+{
+	struct got_entry *got_plt = (struct got_entry *)sec->shdr->sh_addr;
+	int i;
+	for (i = 0; i < sec->num_entries; i++) {
+		if (got_plt[i].symbol_addr == val)
+			return i;
+	}
+	return -1;
+}
+
+static inline struct plt_entry *get_plt_entry(u64 val,
+				      const struct mod_section *sec_plt,
+				      const struct mod_section *sec_got_plt)
+{
+	struct plt_entry *plt = (struct plt_entry *)sec_plt->shdr->sh_addr;
+	int got_plt_idx = get_got_plt_idx(val, sec_got_plt);
+	if (got_plt_idx >= 0)
+		return plt + got_plt_idx;
+	else
+		return NULL;
+}
+
+#endif /* CONFIG_MODULE_SECTIONS */
+
+#endif /* _ASM_RISCV_MODULE_H */
diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
index 2fd27e8ef1fd..8eb26d1ede81 100644
--- a/arch/riscv/include/asm/spinlock.h
+++ b/arch/riscv/include/asm/spinlock.h
@@ -17,6 +17,7 @@
 
 #include <linux/kernel.h>
 #include <asm/current.h>
+#include <asm/fence.h>
 
 /*
  * Simple spin lock operations.  These provide no fairness guarantees.
@@ -28,10 +29,7 @@
 
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
-	__asm__ __volatile__ (
-		"amoswap.w.rl x0, x0, %0"
-		: "=A" (lock->lock)
-		:: "memory");
+	smp_store_release(&lock->lock, 0);
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
@@ -39,7 +37,8 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 	int tmp = 1, busy;
 
 	__asm__ __volatile__ (
-		"amoswap.w.aq %0, %2, %1"
+		"	amoswap.w %0, %2, %1\n"
+		RISCV_ACQUIRE_BARRIER
 		: "=r" (busy), "+A" (lock->lock)
 		: "r" (tmp)
 		: "memory");
@@ -68,8 +67,9 @@ static inline void arch_read_lock(arch_rwlock_t *lock)
 		"1:	lr.w	%1, %0\n"
 		"	bltz	%1, 1b\n"
 		"	addi	%1, %1, 1\n"
-		"	sc.w.aq	%1, %1, %0\n"
+		"	sc.w	%1, %1, %0\n"
 		"	bnez	%1, 1b\n"
+		RISCV_ACQUIRE_BARRIER
 		: "+A" (lock->lock), "=&r" (tmp)
 		:: "memory");
 }
@@ -82,8 +82,9 @@ static inline void arch_write_lock(arch_rwlock_t *lock)
 		"1:	lr.w	%1, %0\n"
 		"	bnez	%1, 1b\n"
 		"	li	%1, -1\n"
-		"	sc.w.aq	%1, %1, %0\n"
+		"	sc.w	%1, %1, %0\n"
 		"	bnez	%1, 1b\n"
+		RISCV_ACQUIRE_BARRIER
 		: "+A" (lock->lock), "=&r" (tmp)
 		:: "memory");
 }
@@ -96,8 +97,9 @@ static inline int arch_read_trylock(arch_rwlock_t *lock)
 		"1:	lr.w	%1, %0\n"
 		"	bltz	%1, 1f\n"
 		"	addi	%1, %1, 1\n"
-		"	sc.w.aq	%1, %1, %0\n"
+		"	sc.w	%1, %1, %0\n"
 		"	bnez	%1, 1b\n"
+		RISCV_ACQUIRE_BARRIER
 		"1:\n"
 		: "+A" (lock->lock), "=&r" (busy)
 		:: "memory");
@@ -113,8 +115,9 @@ static inline int arch_write_trylock(arch_rwlock_t *lock)
 		"1:	lr.w	%1, %0\n"
 		"	bnez	%1, 1f\n"
 		"	li	%1, -1\n"
-		"	sc.w.aq	%1, %1, %0\n"
+		"	sc.w	%1, %1, %0\n"
 		"	bnez	%1, 1b\n"
+		RISCV_ACQUIRE_BARRIER
 		"1:\n"
 		: "+A" (lock->lock), "=&r" (busy)
 		:: "memory");
@@ -125,7 +128,8 @@ static inline int arch_write_trylock(arch_rwlock_t *lock)
 static inline void arch_read_unlock(arch_rwlock_t *lock)
 {
 	__asm__ __volatile__(
-		"amoadd.w.rl x0, %1, %0"
+		RISCV_RELEASE_BARRIER
+		"	amoadd.w x0, %1, %0\n"
 		: "+A" (lock->lock)
 		: "r" (-1)
 		: "memory");
@@ -133,10 +137,7 @@ static inline void arch_read_unlock(arch_rwlock_t *lock)
 
 static inline void arch_write_unlock(arch_rwlock_t *lock)
 {
-	__asm__ __volatile__ (
-		"amoswap.w.rl x0, x0, %0"
-		: "=A" (lock->lock)
-		:: "memory");
+	smp_store_release(&lock->lock, 0);
 }
 
 #endif /* _ASM_RISCV_SPINLOCK_H */
diff --git a/arch/riscv/include/uapi/asm/elf.h b/arch/riscv/include/uapi/asm/elf.h
index a510edfa8226..5cae4c30cd8e 100644
--- a/arch/riscv/include/uapi/asm/elf.h
+++ b/arch/riscv/include/uapi/asm/elf.h
@@ -79,5 +79,12 @@ typedef union __riscv_fp_state elf_fpregset_t;
 #define R_RISCV_TPREL_I		49
 #define R_RISCV_TPREL_S		50
 #define R_RISCV_RELAX		51
+#define R_RISCV_SUB6		52
+#define R_RISCV_SET6		53
+#define R_RISCV_SET8		54
+#define R_RISCV_SET16		55
+#define R_RISCV_SET32		56
+#define R_RISCV_32_PCREL	57
+
 
 #endif /* _UAPI_ASM_ELF_H */