summaryrefslogtreecommitdiff
path: root/tune/speed-ext.c
blob: 239c9dd40b244f326e1e73c5bcd96c5333951f1a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
/* An example of extending the speed program to measure routines not in GMP. */

/*
Copyright 1999, 2000 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA.
*/


/* The extension here is three versions of an mpn arithmetic mean.  These
   aren't meant to be particularly useful, just examples.

   You can run something like the following to compare their speeds.

           ./speed-ext -s 1-20 -c mean_calls mean_open mean_open2

   On RISC chips, mean_open() might be fastest if the compiler is doing a
   good job.  On the register starved x86s, mean_calls will be fastest.


   Notes:

   SPEED_EXTRA_PROTOS and SPEED_EXTRA_ROUTINES are macros that get expanded
   by speed.c in useful places.  SPEED_EXTRA_PROTOS goes after the header
   files, and SPEED_EXTRA_ROUTINES goes in the array of available routines.

   The advantage of this #include "speed.c" scheme is that there's no
   editing of a copy of that file, and new features in new versions of it
   will be immediately available.

   In a real program the routines mean_calls() etc would probably be in
   separate C or assembler source files, and just the measuring
   speed_mean_calls() etc would be here.  Linking against other libraries
   for things to measure is perfectly possible too.

   When attempting to compare two versions of the same named routine, say
   like the generic and assembler versions of mpn_add_n(), creative use of
   cc -D or #define is suggested, so one or both can be renamed and linked
   into the same program.  It'll be much easier to compare them side by side
   than with separate programs for each.

   common.c has notes on writing speed measuring routines.

   Remember to link against tune/libspeed.la (or tune/.libs/libspeed.a if
   not using libtool) to get common.o and other objects needed by speed.c.  */


#define SPEED_EXTRA_PROTOS                              \
  double speed_mean_calls (struct speed_params *s);     \
  double speed_mean_open  (struct speed_params *s);     \
  double speed_mean_open2 (struct speed_params *s);

#define SPEED_EXTRA_ROUTINES            \
  { "mean_calls",  speed_mean_calls  }, \
  { "mean_open",   speed_mean_open   }, \
  { "mean_open2",  speed_mean_open2  },

#include "speed.c"


/* A straightforward implementation calling mpn subroutines.

   wp,size is set to (xp,size + yp,size) / 2.  The return value is the
   remainder from the division.  The other versions are the same.  */

mp_limb_t
mean_calls (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size)
{
  mp_limb_t  c, ret;

  ASSERT (size >= 1);

  c = mpn_add_n (wp, xp, yp, size);
  ret = mpn_rshift (wp, wp, size, 1) >> (BITS_PER_MP_LIMB-1);
  wp[size-1] |= (c << (BITS_PER_MP_LIMB-1));
  return ret;
}


/* An open-coded version, making one pass over the data.  The right shift is
   done as the added limbs are produced.  The addition code follows
   mpn/generic/add_n.c. */

mp_limb_t
mean_open (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size)
{
  mp_limb_t  w, wprev, x, y, c, ret;
  mp_size_t  i;

  ASSERT (size >= 1);

  x = xp[0];
  y = yp[0];

  wprev = x + y;
  c = (wprev < x);
  ret = (wprev & 1);

#define RSHIFT(hi,lo)   (((lo) >> 1) | ((hi) << (BITS_PER_MP_LIMB-1)))

  for (i = 1; i < size; i++)
    {
      x = xp[i];
      y = yp[i];

      w = x + c;
      c = (w < x);
      w += y;
      c += (w < y);

      wp[i-1] = RSHIFT (w, wprev);
      wprev = w;
    }

  wp[i-1] = RSHIFT (c, wprev);

  return ret;
}


/* Another one-pass version, but right shifting the source limbs rather than
   the result limbs.  There's not much chance of this being better than the
   above, but it's an alternative at least. */

mp_limb_t
mean_open2 (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size)
{
  mp_limb_t  w, x, y, xnext, ynext, c, ret;
  mp_size_t  i;

  ASSERT (size >= 1);

  x = xp[0];
  y = yp[0];

  /* ret is the low bit of x+y, c is the carry out of that low bit add */
  ret = (x ^ y) & 1;
  c   = (x & y) & 1;

  for (i = 0; i < size-1; i++)
    {
      xnext = xp[i+1];
      ynext = yp[i+1];
      x = RSHIFT (xnext, x);
      y = RSHIFT (ynext, y);

      w = x + c;
      c = (w < x);
      w += y;
      c += (w < y);
      wp[i] = w;

      x = xnext;
      y = ynext;
    }

  wp[i] = (x >> 1) + (y >> 1) + c;

  return ret;
}


/* The speed measuring routines are the same apart from which function they
   run, so a macro is used.  Actually this macro is the same as
   SPEED_ROUTINE_MPN_BINARY_N.  */

#define SPEED_ROUTINE_MEAN(mean_fun)                    \
  {                                                     \
    unsigned  i;                                        \
    mp_ptr    wp;                                       \
    double    t;                                        \
    TMP_DECL (marker);                                  \
                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                 \
                                                        \
    TMP_MARK (marker);                                  \
    wp = SPEED_TMP_ALLOC_LIMBS (s->size, s->align_wp);  \
                                                        \
    speed_operand_src (s, s->xp, s->size);              \
    speed_operand_src (s, s->yp, s->size);              \
    speed_operand_dst (s, wp, s->size);                 \
    speed_cache_fill (s);                               \
                                                        \
    speed_starttime ();                                 \
    i = s->reps;                                        \
    do                                                  \
      mean_fun (wp, s->xp, s->yp, s->size);             \
    while (--i != 0);                                   \
    t = speed_endtime ();                               \
                                                        \
    TMP_FREE (marker);                                  \
    return t;                                           \
  }

double
speed_mean_calls (struct speed_params *s)
{
  SPEED_ROUTINE_MEAN (mean_calls);
}

double
speed_mean_open (struct speed_params *s)
{
  SPEED_ROUTINE_MEAN (mean_open);
}

double
speed_mean_open2 (struct speed_params *s)
{
  SPEED_ROUTINE_MEAN (mean_open2);
}