Many more comments. Use a vec<bool> when we only care about 0/1.devel/jlaw/crc

author: Jeff Law <jlaw@ventanamicro> 2023-03-21 22:45:51 -0600
committer: Jeff Law <jlaw@ventanamicro> 2023-03-21 22:46:50 -0600
commit: 4953a04ee6e956898c3330e70b6043db2f55aeac (patch)
tree: e320974c64944755fd100033ff9c6ecb4ecdc0b0
parent: 42a2a810cd72195c8ece37405336bb5e3795de85 (diff)
download: gcc-devel/jlaw/crc.tar.gz
2 files changed, 79 insertions, 8 deletions
diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
index bcd5302f740..fffd8c6ca50 100644
--- a/gcc/config/riscv/bitmanip.md
+++ b/gcc/config/riscv/bitmanip.md
@@ -702,7 +702,19 @@
       rtx t1 = force_reg (word_mode, operands[3]);
       a0 = force_reg (word_mode, gen_rtx_XOR (word_mode, a0, a1));
       a0 = force_reg (word_mode, gen_rtx_CLMUL (word_mode, a0, q_reg));
+
+      /* XXX By adjusting Q we may be able to eliminate this shift.  The size
+         of this shift seems to be dependent on the size of the CRC
+         output (aka N in N-bit CRC).  */
       a0 = force_reg (word_mode, gen_rtx_ASHIFT (word_mode, a0, GEN_INT (16)));
+
+      /* CCC By adjusting operands[3] (which should be a constant) we may
+         be able to utilize CLMULH to get the bits in the right place and
+         avoid the shifts to extract the bitfield.   If that is not possible
+         the shifts will still be needed and are dependent on input/output
+         sizes as well.   Does adjusting the constant and shift counts
+         result in a constant that is more likely to bt synthesized in a
+         single instruction?  */
       a0 = force_reg (word_mode, gen_rtx_CLMUL (word_mode, a0, t1));
       a0 = force_reg (word_mode, gen_rtx_LSHIFTRT (word_mode, a0, GEN_INT (24)));
       a0 = force_reg (word_mode, gen_rtx_ASHIFT (word_mode, a0, GEN_INT (24)));
@@ -718,7 +730,13 @@
   else
     {
       /* If we do not have the ZBC extension (ie, no clmul), then
-         use a table based algorithm to implement the CRC.  */
+         use a table based algorithm to implement the CRC. 
+
+         XXX What is the size of each element in this table and
+         how many entries are in the table?  Does the element
+         size or number of elements vary based on the input or
+         output types for the CRC function?   If so, do we need
+         to restrict it to only be used for certain modes?  */
       expand_crc_table_based (operands, QImode);
     }
 
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 842fe3f7902..3fc6b9c54be 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -6902,19 +6902,21 @@ riscv_shamt_matches_mask_p (int shamt, HOST_WIDE_INT mask)
   return shamt == ctz_hwi (mask);
 }
 
-/* Calculate the quotient of polynomial long division of x^2n by the polynomial
+/* Return the quotient of polynomial long division of x^2n by POLYNOMIAL
    in GF (2^n).  */
 
 unsigned HOST_WIDE_INT
 gf2n_poly_long_div_quotient (unsigned HOST_WIDE_INT polynomial)
 {
-  vec<short> x2n, pol, q;
+  vec<bool>pol;
+  vec<short> x2n, q;
+
   // Create vector of bits, for the polynomial.
   pol.create (sizeof (polynomial) * BITS_PER_UNIT + 1);
   size_t n = 1;
   for ( ; polynomial; n++)
     {
-      pol.quick_push (polynomial&1);
+      pol.quick_push (polynomial & 1);
       polynomial >>= 1;
     }
   pol.quick_push (1);
@@ -6923,6 +6925,8 @@ gf2n_poly_long_div_quotient (unsigned HOST_WIDE_INT polynomial)
   x2n.create (2 * n - 1);
   for (size_t i = 0; i < 2 * (n - 1); i++)
     x2n.safe_push (0);
+
+  /* Is this the implicit bit on at the top of the poly?  */
   x2n.safe_push (1);
 
   q.create (n);
@@ -6952,6 +6956,9 @@ gf2n_poly_long_div_quotient (unsigned HOST_WIDE_INT polynomial)
 }
 
 /* Calculates reciprocal CRC for initial CRC and given polynomial.  */
+/* XXX Is this needed?  It's not referenced anywhere. 
+   If it is needed, it needs to be generalized rather than only
+   working on uint16_t.  */
 static uint16_t
 generate_crc_reciprocal (uint16_t crc,
 			 uint16_t polynomial)
@@ -6967,6 +6974,8 @@ generate_crc_reciprocal (uint16_t crc,
 }
 
 /* Calculates CRC for initial CRC and given polynomial.  */
+/* XXX This needs to be generalized rather than only working
+   on uint16_t.  */
 static uint16_t
 generate_crc (uint16_t crc,
 	      uint16_t polynomial)
@@ -6983,6 +6992,19 @@ generate_crc (uint16_t crc,
 }
 
 /* Generates 16-bit CRC table.  */
+/* XXX This needs to be generalized rather than only working
+   on uint16_t.
+
+   This looks like it tries to share tables which is good, but
+   don't we have to verify that the polynomial and sizes are the
+   same for sharing to be safe?  Doesn't that in turn argue that
+   the polynomial and size should be encoded into the table
+   name? 
+
+   Presumably the table should be going into a readonly data
+   section.  It doesn't look like we make any attempt to switch
+   sections.  Mixing code and data is exceedingly bad on
+   modern processors.  */
 rtx
 generate_crc16_table (uint16_t polynom)
 {
@@ -7016,27 +7038,43 @@ generate_crc16_table (uint16_t polynom)
   return lab;
 }
 
-void reflect (rtx op1, machine_mode mode)
+/* XXX Is this needed?  It's not referenced anywhere.   */
+void
+reflect (rtx op1, machine_mode mode)
 {
   // Reflect the bits
   op1 = gen_rtx_BSWAP (mode, op1);
 
-// Adjust the position of the reflected bits
+  // Adjust the position of the reflected bits
+  /* XXX I don't understand the comment.  Under what
+     conditions does mode != Pmode?  */
   if (mode != Pmode)
     op1 = gen_rtx_SUBREG (Pmode, op1, 0);
 
-// Shift the reflected bits to the least significant end
+  // Shift the reflected bits to the least significant end
+  /* XXX This seems to assume we're always dealing with
+     a 16bit quantity.  */
   rtx shift_amt = gen_rtx_CONST_INT (Pmode, 8);
   op1 = gen_rtx_LSHIFTRT (Pmode, op1, shift_amt);
+
+  /* XXX This routine is going to have no impact if it was
+     ever used.  Changing OP1 above isn't reflected into
+     the caller.  */
 }
 
 /* Generate table based CRC code.  */
+/* XXX This doesn't seem to be used.  Is it the case that we're
+   eventually going to need to distinguish between a bit-reflected
+   CRC and a normal CRC for table based variants?  If so, doesn't
+   that need to be in the operands for the .CRC IFN?  */
 void
-expand_crc_table_based_reflected (rtx *operands,  machine_mode data_mode)
+expand_crc_table_based_reflected (rtx *operands, machine_mode data_mode)
 {
   machine_mode mode = GET_MODE (operands[0]);
   rtx in = force_reg (mode, gen_rtx_XOR (mode, operands[1], operands[2]));
   rtx ix = gen_rtx_AND (mode, in, GEN_INT (GET_MODE_MASK (data_mode)));
+  /* XXX Under what conditions will mode != Pmode be true?  Is this an
+     artifact of having the modes wrong for the crc expander?  */
   if (mode != Pmode)
     ix = gen_rtx_SUBREG (Pmode, ix, 0);
   ix = gen_rtx_ASHIFT (Pmode, ix, GEN_INT (exact_log2 (GET_MODE_SIZE (mode)
@@ -7048,6 +7086,15 @@ expand_crc_table_based_reflected (rtx *operands,  machine_mode data_mode)
   rtx high = gen_rtx_LSHIFTRT (mode, operands[1],
 			       GEN_INT (data_mode));
   rtx crc = force_reg (mode, gen_rtx_XOR (mode, tab, high));
+
+  /* XXX In general we prefer to avoid SUBREGs, especially
+     paradoxical subregs (outer mode is wider than inner mode).
+
+     It should be possible to replace a paradoxical subreg with
+     a sign or zero extension.
+
+     If this is a narrowing subreg, then gen_lowpart might be
+     better.  */
   riscv_emit_move (operands[0], gen_rtx_SUBREG (mode, crc, 0));
 }
 
@@ -7060,6 +7107,8 @@ expand_crc_table_based (rtx *operands,  machine_mode data_mode)
 			       GEN_INT (8));
   rtx in = force_reg (mode, gen_rtx_XOR (mode, op1, operands[2]));
   rtx ix = gen_rtx_AND (mode, in, GEN_INT (GET_MODE_MASK (data_mode)));
+  /* XXX Under what conditions will mode != Pmode be true?  Is this an
+     artifact of having the modes wrong for the crc expander?  */
   if (mode != Pmode)
     ix = gen_rtx_SUBREG (Pmode, ix, 0);
   ix = gen_rtx_ASHIFT (Pmode, ix, GEN_INT (exact_log2 (GET_MODE_SIZE (mode)
@@ -7072,6 +7121,10 @@ expand_crc_table_based (rtx *operands,  machine_mode data_mode)
 			       GEN_INT (8));
   high = force_reg (mode, gen_rtx_AND (mode, high, GEN_INT (65535)));
   rtx crc = force_reg (mode, gen_rtx_XOR (mode, tab, high));
+
+  /* Why is this different than the reflected version above?  Doesn't
+     it have the same potential concers WRT mismatched modes between
+     these two objects?  */
   riscv_emit_move (operands[0], crc);
 }
author	Jeff Law <jlaw@ventanamicro>	2023-03-21 22:45:51 -0600
committer	Jeff Law <jlaw@ventanamicro>	2023-03-21 22:46:50 -0600
commit	4953a04ee6e956898c3330e70b6043db2f55aeac (patch)
tree	e320974c64944755fd100033ff9c6ecb4ecdc0b0
parent	42a2a810cd72195c8ece37405336bb5e3795de85 (diff)
download	gcc-devel/jlaw/crc.tar.gz