Diffstat (limited to 'libitm/config/linux/rwlock.cc')
-rw-r--r-- | libitm/config/linux/rwlock.cc | 92
1 file changed, 52 insertions, 40 deletions
diff --git a/libitm/config/linux/rwlock.cc b/libitm/config/linux/rwlock.cc
index 3471049083e..f87be2e880a 100644
--- a/libitm/config/linux/rwlock.cc
+++ b/libitm/config/linux/rwlock.cc
@@ -36,10 +36,11 @@ gtm_rwlock::read_lock (gtm_thread *tx)
   for (;;)
     {
       // Fast path: first announce our intent to read, then check for
-      // conflicting intents to write.  Note that direct assignment to
-      // an atomic object is memory_order_seq_cst.
-      tx->shared_state = 0;
-      if (likely(writers == 0))
+      // conflicting intents to write.  The fence ensures that this happens
+      // in exactly this order.
+      tx->shared_state.store (0, memory_order_relaxed);
+      atomic_thread_fence (memory_order_seq_cst);
+      if (likely (writers.load (memory_order_relaxed) == 0))
        return;

       // There seems to be an active, waiting, or confirmed writer, so enter
@@ -50,10 +51,11 @@ gtm_rwlock::read_lock (gtm_thread *tx)
       // We need the barrier here for the same reason that we need it in
       // read_unlock().
       // TODO Potentially too many wake-ups.  See comments in read_unlock().
-      tx->shared_state = -1;
-      if (writer_readers > 0)
+      tx->shared_state.store (-1, memory_order_relaxed);
+      atomic_thread_fence (memory_order_seq_cst);
+      if (writer_readers.load (memory_order_relaxed) > 0)
        {
-         writer_readers = 0;
+         writer_readers.store (0, memory_order_relaxed);
          futex_wake(&writer_readers, 1);
        }

@@ -61,16 +63,16 @@ gtm_rwlock::read_lock (gtm_thread *tx)
       // writer anymore.
       // TODO Spin here on writers for a while. Consider whether we woke
       // any writers before?
-      while (writers)
+      while (writers.load (memory_order_relaxed))
        {
          // An active writer.  Wait until it has finished.  To avoid lost
          // wake-ups, we need to use Dekker-like synchronization.
          // Note that we cannot reset readers to zero when we see that there
          // are no writers anymore after the barrier because this pending
          // store could then lead to lost wake-ups at other readers.
-         readers = 1;
-         atomic_thread_fence(memory_order_acq_rel);
-         if (writers)
+         readers.store (1, memory_order_relaxed);
+         atomic_thread_fence (memory_order_seq_cst);
+         if (writers.load (memory_order_relaxed))
            futex_wait(&readers, 1);
        }

@@ -95,8 +97,8 @@ bool
 gtm_rwlock::write_lock_generic (gtm_thread *tx)
 {
   // Try to acquire the write lock.
-  unsigned int w;
-  if (unlikely((w = __sync_val_compare_and_swap(&writers, 0, 1)) != 0))
+  int w = 0;
+  if (unlikely (!writers.compare_exchange_strong (w, 1)))
     {
       // If this is an upgrade, we must not wait for other writers or
       // upgrades.
@@ -104,19 +106,14 @@ gtm_rwlock::write_lock_generic (gtm_thread *tx)
        return false;

       // There is already a writer.  If there are no other waiting writers,
-      // switch to contended mode.
-      // Note that this is actually an atomic exchange, not a TAS.  Also,
-      // it's only guaranteed to have acquire semantics, whereas we need a
-      // full barrier to make the Dekker-style synchronization work.  However,
-      // we rely on the xchg being a full barrier on the architectures that we
-      // consider here.
-      // ??? Use C++0x atomics as soon as they are available.
+      // switch to contended mode.  We need seq_cst memory order to make the
+      // Dekker-style synchronization work.
       if (w != 2)
-       w = __sync_lock_test_and_set(&writers, 2);
+       w = writers.exchange (2);
       while (w != 0)
        {
          futex_wait(&writers, 2);
-         w = __sync_lock_test_and_set(&writers, 2);
+         w = writers.exchange (2);
        }
     }

@@ -130,13 +127,14 @@ gtm_rwlock::write_lock_generic (gtm_thread *tx)
   // TODO In the worst case, this requires one wait/wake pair for each
   // active reader.  Reduce this!
   if (tx != 0)
-    tx->shared_state = ~(typeof tx->shared_state)0;
+    tx->shared_state.store (-1, memory_order_relaxed);

   for (gtm_thread *it = gtm_thread::list_of_threads; it != 0;
       it = it->next_thread)
     {
       // Use a loop here to check reader flags again after waiting.
-      while (it->shared_state != ~(typeof it->shared_state)0)
+      while (it->shared_state.load (memory_order_relaxed)
+            != ~(typeof it->shared_state)0)
        {
          // An active reader.  Wait until it has finished.  To avoid lost
          // wake-ups, we need to use Dekker-like synchronization.
@@ -144,12 +142,13 @@ gtm_rwlock::write_lock_generic (gtm_thread *tx)
          // the barrier that the reader has finished in the meantime;
          // however, this is only possible because we are the only writer.
          // TODO Spin for a while on this reader flag.
-         writer_readers = 1;
-         __sync_synchronize();
-         if (it->shared_state != ~(typeof it->shared_state)0)
+         writer_readers.store (1, memory_order_relaxed);
+         atomic_thread_fence (memory_order_seq_cst);
+         if (it->shared_state.load (memory_order_relaxed)
+             != ~(typeof it->shared_state)0)
            futex_wait(&writer_readers, 1);
          else
-           writer_readers = 0;
+           writer_readers.store (0, memory_order_relaxed);
        }
     }

@@ -181,19 +180,28 @@ gtm_rwlock::write_upgrade (gtm_thread *tx)
 void
 gtm_rwlock::read_unlock (gtm_thread *tx)
 {
-  tx->shared_state = ~(typeof tx->shared_state)0;
-
-  // If there is a writer waiting for readers, wake it up. We need the barrier
-  // to avoid lost wake-ups.
+  // We only need release memory order here because of privatization safety
+  // (this ensures that marking the transaction as inactive happens after
+  // any prior data accesses by this transaction, and that neither the
+  // compiler nor the hardware order this store earlier).
+  // ??? We might be able to avoid this release here if the compiler can't
+  // merge the release fence with the subsequent seq_cst fence.
+  tx->shared_state.store (-1, memory_order_release);
+
+  // If there is a writer waiting for readers, wake it up.  We need the fence
+  // to avoid lost wake-ups.  Furthermore, the privatization safety
+  // implementation in gtm_thread::try_commit() relies on the existence of
+  // this seq_cst fence.
   // ??? We might not be the last active reader, so the wake-up might happen
   // too early.  How do we avoid this without slowing down readers too much?
   // Each reader could scan the list of txns for other active readers but
   // this can result in many cache misses.  Use combining instead?
   // TODO Sends out one wake-up for each reader in the worst case.
-  __sync_synchronize();
-  if (unlikely(writer_readers > 0))
+  atomic_thread_fence (memory_order_seq_cst);
+  if (unlikely (writer_readers.load (memory_order_relaxed) > 0))
     {
-      writer_readers = 0;
+      // No additional barrier needed here (see write_unlock()).
+      writer_readers.store (0, memory_order_relaxed);
       futex_wake(&writer_readers, 1);
     }
 }
@@ -204,11 +212,11 @@ gtm_rwlock::read_unlock (gtm_thread *tx)
 void
 gtm_rwlock::write_unlock ()
 {
-  // This is supposed to be a full barrier.
-  if (__sync_fetch_and_sub(&writers, 1) == 2)
+  // This needs to have seq_cst memory order.
+  if (writers.fetch_sub (1) == 2)
     {
       // There might be waiting writers, so wake them.
-      writers = 0;
+      writers.store (0, memory_order_relaxed);
       if (futex_wake(&writers, 1) == 0)
        {
          // If we did not wake any waiting writers, we might indeed be the
@@ -223,9 +231,13 @@ gtm_rwlock::write_unlock ()
       // No waiting writers, so wake up all waiting readers.
       // Because the fetch_and_sub is a full barrier already, we don't need
       // another barrier here (as in read_unlock()).
-      if (readers > 0)
+      if (readers.load (memory_order_relaxed) > 0)
        {
-         readers = 0;
+         // No additional barrier needed here.  The previous load must be in
+         // modification order because of the coherency constraints.  Late stores
+         // by a reader are not a problem because readers do Dekker-style
+         // synchronization on writers.
+         readers.store (0, memory_order_relaxed);
          futex_wake(&readers, INT_MAX);
        }
     }
 }
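
The conversion above repeatedly replaces plain assignments and __sync_synchronize() with a relaxed store, a memory_order_seq_cst fence, and a relaxed load. The following is a minimal, self-contained sketch of why that Dekker-style pattern works; it is not libitm code, and the flag and counter names are invented for illustration. The two seq_cst fences are totally ordered, so at least one side must observe the other's announcement, and a reader and a writer can never both take their fast paths.

// Sketch only: assumed names, not libitm's rwlock implementation.
#include <atomic>
#include <cassert>
#include <thread>

std::atomic<int> reader_active{0};   // stands in for tx->shared_state
std::atomic<int> writer_active{0};   // stands in for the writers count
std::atomic<int> fast_paths_taken{0};

void reader ()
{
  // Announce intent with a relaxed store, then fence, then check the writer.
  reader_active.store (1, std::memory_order_relaxed);
  std::atomic_thread_fence (std::memory_order_seq_cst);
  if (writer_active.load (std::memory_order_relaxed) == 0)
    // Fast path: no writer had announced itself before our fence.
    fast_paths_taken.fetch_add (1);
}

void writer ()
{
  // Symmetric Dekker-style announcement on the writer side.
  writer_active.store (1, std::memory_order_relaxed);
  std::atomic_thread_fence (std::memory_order_seq_cst);
  if (reader_active.load (std::memory_order_relaxed) == 0)
    // Fast path: no reader had announced itself before our fence.
    fast_paths_taken.fetch_add (1);
}

int main ()
{
  std::thread r (reader), w (writer);
  r.join ();
  w.join ();
  // With the fences, at most one side can take its fast path; with plain
  // relaxed accesses and no fences, both could.
  assert (fast_paths_taken.load () <= 1);
}

The writer side of the lock uses the common three-state futex protocol (0 = free, 1 = acquired and uncontended, 2 = acquired and contended). Below is a simplified stand-alone sketch under stated assumptions: the futex_wait/futex_wake wrappers here are hypothetical stand-ins for libitm's futex helpers, and the real write_unlock() additionally wakes waiting readers when no writer was woken.

// Sketch only: writer half of the protocol, Linux-specific.
#include <atomic>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static std::atomic<int> writers{0};   // 0 = free, 1 = uncontended, 2 = contended

static void futex_wait (std::atomic<int> *addr, int expected)
{
  // Sleeps only if *addr still equals expected; spurious wake-ups are fine.
  syscall (SYS_futex, addr, FUTEX_WAIT, expected, nullptr, nullptr, 0);
}

static void futex_wake (std::atomic<int> *addr, int count)
{
  syscall (SYS_futex, addr, FUTEX_WAKE, count, nullptr, nullptr, 0);
}

void write_lock ()
{
  int w = 0;
  // Uncontended fast path: 0 -> 1 (seq_cst by default, as in the patch).
  if (writers.compare_exchange_strong (w, 1))
    return;
  // Contended: advertise a waiter by forcing the state to 2, then sleep
  // until the previous owner hands the lock back as 0.
  if (w != 2)
    w = writers.exchange (2);
  while (w != 0)
    {
      futex_wait (&writers, 2);
      w = writers.exchange (2);
    }
}

void write_unlock ()
{
  // fetch_sub is seq_cst; an old value of 2 means someone may be waiting.
  if (writers.fetch_sub (1) == 2)
    {
      writers.store (0, std::memory_order_relaxed);
      futex_wake (&writers, 1);
    }
}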