summaryrefslogtreecommitdiff
path: root/includes/TailCalls.h
blob: 670da9546f75de9df447311bef7f9edbfd293f0e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/* -----------------------------------------------------------------------------
 *
 * (c) The GHC Team, 1998-1999
 *
 * Stuff for implementing proper tail jumps.
 *
 * ---------------------------------------------------------------------------*/

#ifndef TAILCALLS_H
#define TAILCALLS_H

/* -----------------------------------------------------------------------------
   Unmangled tail-jumping: use the mini interpreter.
   -------------------------------------------------------------------------- */

#ifdef USE_MINIINTERPRETER

/* No begin/end markers are emitted in this configuration: both expand
   to nothing. */
#define FB_
#define FE_

/* JMP_(cont): instead of jumping, return `cont` (cast to StgFunPtr) from
   the enclosing function.  Presumably the mini-interpreter loop then
   calls the returned address -- confirm against the interpreter itself.
   Must be followed by a ';' at the use site. */
#define JMP_(cont)  return ((StgFunPtr) (cont))

#else

/* Dummy function, defined elsewhere, taking no arguments and returning
   nothing.  The JMP_ and FB_ macros in this header call it; per the notes
   attached to those macros it acts as a barrier / workaround for gcc
   code-generation quirks rather than doing any real work. */
extern void __DISCARD__(void);

/* -----------------------------------------------------------------------------
   Tail calling on x86
   -------------------------------------------------------------------------- */

#if i386_HOST_ARCH

/* Note about discard: possibly there to fool GCC into clearing up
   before we do the jump eg. if there are some arguments left on the C
   stack that GCC hasn't popped yet.  Also possibly to fool any
   optimisations (a function call often acts as a barrier).  Not sure
   if any of this is necessary now -- SDM

   Comment to above note: I don't think the __DISCARD__() in JMP_ is 
   necessary.  Arguments should be popped from the C stack immediately
   after returning from a function, as long as we pass -fno-defer-pop
   to gcc.  Moreover, a goto to a first-class label acts as a barrier 
   for optimisations in the same way a function call does. 
   -= chak
   */

/* The goto here seems to cause gcc -O2 to delete all the code after
   it - including the FE_ marker and the epilogue code - exactly what
   we want! -- SDM
   */

/*
 * JMP_(cont): tail-jump to the code at address `cont` (i386).
 *
 * Calls __DISCARD__() first, then evaluates `cont` exactly once into a
 * local and transfers control with a GCC computed goto.
 *
 * The body is wrapped in do { } while (0) so that `JMP_(x);` forms a
 * single statement and is safe as the unbraced arm of an if/else; a bare
 * { } block followed by ';' would orphan the `else`.  The use site must
 * supply the trailing ';'.
 */
#define JMP_(cont)			\
    do {				\
      void *__target;			\
      __DISCARD__();			\
      __target = (void *)(cont);	\
      goto *__target;			\
    } while (0)

#endif /* i386_HOST_ARCH */

/* -----------------------------------------------------------------------------
   Tail calling on x86_64
   -------------------------------------------------------------------------- */

#if x86_64_HOST_ARCH

/*
  NOTE about __DISCARD__():

  On x86_64 this is necessary to work around bugs in the register
  variable support in gcc.  Without the __DISCARD__() call, gcc will
  silently throw away assignments to global register variables that
  happen before the jump.

  Here's the example:

  extern void g(void);
  static void f(void) {
    R1 = g;
    __DISCARD__();
    goto *R1;
  }

  without the dummy function call, gcc throws away the assignment to R1
  (gcc 3.4.3) gcc bug #20359.
*/

/*
 * JMP_(cont): tail-jump to the code at address `cont` (x86_64).
 *
 * __DISCARD__() is called before the computed goto; the call must stay
 * between any preceding global-register assignment and the jump (see the
 * note above about gcc bug #20359).
 *
 * The body is wrapped in do { } while (0) so that `JMP_(x);` forms a
 * single statement and is safe as the unbraced arm of an if/else; a bare
 * { } block followed by ';' would orphan the `else`.  The use site must
 * supply the trailing ';'.
 */
#define JMP_(cont)			\
    do {				\
      __DISCARD__();			\
      goto *(void *)(cont);		\
    } while (0)

#endif /* x86_64_HOST_ARCH */

/* -----------------------------------------------------------------------------
   Tail calling on Sparc
   -------------------------------------------------------------------------- */

#ifdef sparc_HOST_ARCH

/* JMP_(cont): cast `cont` to the function-pointer type F_ and call it
   with no arguments.  This is an expression, so the use site supplies
   the trailing ';'. */
#define JMP_(cont)	((F_) (cont))()
	/* Oh so happily, the above turns into a "call" instruction,
	   which, on a SPARC, is nothing but a "jmpl" with the
	   return address in %o7 [which we don't care about].
	*/

/* Don't need these for sparc mangling: both markers expand to nothing. */
#define FB_
#define FE_

#endif /* sparc_HOST_ARCH */

/* -----------------------------------------------------------------------------
   Tail calling on Alpha
   -------------------------------------------------------------------------- */

#ifdef alpha_HOST_ARCH

/* When IN_STG_CODE is non-zero, declare `_procedure` as a global register
   variable bound to register $27.
   NOTE(review): presumably $27 is the Alpha procedure-value register that
   the callee expects to hold its own address -- confirm. */
#if IN_STG_CODE
register void *_procedure __asm__("$27");
#endif

/* JMP_(cont): store `cont` in `_procedure`, call __DISCARD__(), then jump
   to `_procedure` with a computed goto.
   NOTE(review): the macro does not declare `_procedure` itself, so it can
   only be expanded where a declaration is in scope; the one above exists
   only under IN_STG_CODE. */
#define JMP_(cont)				\
    do { _procedure = (void *)(cont);		\
         __DISCARD__();				\
         goto *_procedure;			\
       } while(0)

/* Don't need these for alpha mangling: both markers expand to nothing. */
#define FB_
#define FE_

#endif /* alpha_HOST_ARCH */

/* -----------------------------------------------------------------------------
   Tail calling on HP

Description of HP's weird procedure linkage, many thanks to Andy Bennet
<andy_bennett@hp.com>:

I've been digging a little further into the problem of how HP-UX does
dynamic procedure calls. My solution in the last e-mail inserting an extra
'if' statement into the JMP_ I think is probably the best general solution I
can come up with. There are still a few problems with it however: It won't
work, if JMP_ ever has to call anything in a shared library, if this is
likely to be required it'll need something more elaborate. It also won't work
with PA-RISC 2.0 wide mode (64-bit) which uses a different format PLT.

I had some feedback from someone in HP's compiler lab and the problem
relates to the linker on HP-UX, not gcc as I first suspected. The reason the
'hsc' executable works is most likely due to a change in 'ld's behaviour for
performance reasons between your revision and mine.

The major issue relating to this is shared libraries and how they are
implemented under HP-UX. The whole point of the Procedure Label Table (PLT) is
to allow a function pointer to hold the address of the function and a
pointer to the library's global data lookup table (DLT) used by position
independent code (PIC). This makes the PLT absolutely essential for shared
library calls. HP has two linker introduced assembly functions for dealing
with dynamic calls, $$dyncall and $$dyncall_external. The former does a
check to see if the address is a PLT pointer and dereferences if necessary
or just calls the address otherwise; the latter skips the check and just
does the indirect jump no matter what.

Since $$dyncall_external runs faster due to its not having the test, the
linker nowadays prefers to generate calls to that, rather than $$dyncall. It
makes this decision based on the presence of any shared library. If it even
smells an sl's existence at link time, it rigs the runtime system to
generate PLT references for everything on the assumption that the result
will be slightly more efficient. This is what is crashing GHC since the
calls it is generating have no understanding of the procedure label proper.
The only way to get real addresses is to link everything archive, including
system libraries, at which point it assumes you probably are going to be
using calls similar to GHC's (it's rigged for HP's +ESfic compiler option)
but uses $$dyncall if necessary to cope, just in case you aren't.

   -------------------------------------------------------------------------- */

#ifdef hppa1_1_hp_hpux_TARGET

/* JMP_(cont): jump to `cont`, first unwrapping it if it looks like a
   tagged pointer rather than a plain code address.

   `cont` is evaluated once into a local.  If bit 1 of the address (mask
   value 2) is set, the macro subtracts 2, reads an int from that
   location and uses the value read as the real jump target; otherwise
   the address is used as-is.  Presumably the tagged form is the PLT
   pointer described in the note above -- confirm.

   The code casts the pointer to int and does arithmetic on a void
   pointer (a GNU C extension), so it assumes pointers fit in an int. */
#define JMP_(cont)                              \
    do { void *_procedure = (void *)(cont);     \
         if (((int) _procedure) & 2)            \
            _procedure = (void *)(*((int *) (_procedure - 2))); \
         goto *_procedure;                      \
       } while(0)

#endif /* hppa1_1_hp_hpux_TARGET */

/* -----------------------------------------------------------------------------
   Tail calling on PowerPC
   -------------------------------------------------------------------------- */

#ifdef powerpc_HOST_ARCH

/*
 * JMP_(cont): tail-jump to the code at address `cont` (PowerPC).
 *
 * `cont` is evaluated exactly once into a local, then __DISCARD__() is
 * called, then control transfers with a GCC computed goto.  The
 * __DISCARD__() call must stay between the evaluation and the jump.
 *
 * The local is named `__target` so that an argument which itself mentions
 * a caller variable called `target` is not captured by the macro's own
 * declaration.
 *
 * The body is wrapped in do { } while (0) so that `JMP_(x);` forms a
 * single statement and is safe as the unbraced arm of an if/else; a bare
 * { } block followed by ';' would orphan the `else`.  The use site must
 * supply the trailing ';'.
 */
#define JMP_(cont)			\
    do {				\
      void *__target;			\
      __target = (void *)(cont);	\
      __DISCARD__();			\
      goto *__target;			\
    } while (0)

/*
	The __DISCARD__ is there because Apple's April 2002 Beta of GCC 3.1
	sometimes generates incorrect code otherwise.
	It tends to "forget" to update global register variables in the presence
	of decrement/increment operators:
	JMP_(*(--Sp)) is wrongly compiled as JMP_(Sp[-1]).
	Calling __DISCARD__ in between works around this problem.
*/

/*
	I would _love_ to use the following instead,
	but some versions of Apple's GCC fail to generate code for it
	if it is called for a casted data pointer - which is exactly what
	we are going to do...

	#define JMP_(cont)	((F_) (cont))()
*/

#endif /* powerpc_HOST_ARCH */

#ifdef powerpc64_HOST_ARCH
/* JMP_(cont): cast `cont` to the function-pointer type F_ and call it
   with no arguments.  This is an expression, so the use site supplies
   the trailing ';'.  FB_ and FE_ are not defined here. */
#define JMP_(cont) ((F_) (cont))()
#endif

/* -----------------------------------------------------------------------------
   Tail calling on IA64
   -------------------------------------------------------------------------- */

#ifdef ia64_HOST_ARCH

/* The compiler can more intelligently decide how to do this.  We therefore
 * implement it as a call and optimise to a jump at mangle time.
 *
 * JMP_(cont) expands to two statements: a call through `cont` cast to F_,
 * followed by an asm statement that emits the literal marker text
 * "--- TAILCALL ---" into the assembly output.  The expansion already ends
 * in ';'.
 *
 * NOTE(review): because the expansion is two unguarded statements, using
 * it as the unbraced body of an `if` would make only the call conditional
 * and always emit the marker.  Confirm that every use site is a plain
 * statement before relying on it elsewhere. */
#define JMP_(cont)	((F_) (cont))(); __asm__ volatile ("--- TAILCALL ---");

/* Don't emit calls to __DISCARD__ as this causes hassles.  From this point
   on, __DISCARD__() is a function-like macro that expands to nothing. */
#define __DISCARD__()

#endif

/* -----------------------------------------------------------------------------
  FUNBEGIN and FUNEND.

  These are markers indicating the start and end of Real Code in a
  function.  All instructions between the actual start and end of the
  function and these markers are shredded by the mangler.
  -------------------------------------------------------------------------- */

/*  The following __DISCARD__() has become necessary with gcc 2.96 on x86.
 *  It prevents gcc from moving stack manipulation code from the function
 *  body (aka the Real Code) into the function prologue, ie, from moving it
 *  over the --- BEGIN --- marker.  It should be noted that (like some
 *  other black magic in GHC's code), there is no essential reason why gcc
 *  could not move some stack manipulation code across the __DISCARD__() -
 *  it just doesn't choose to do it at the moment.
 *  -= chak
 */
 
/* FB_: emits the literal text "--- BEGIN ---" into the assembly output and
   then calls __DISCARD__().  Only defined here if no earlier section of
   this header has already defined FB_.  The expansion carries its own
   trailing ';' after each statement. */
#ifndef FB_
#define FB_    __asm__ volatile ("--- BEGIN ---"); __DISCARD__ ();
#endif

/* FE_: emits the literal text "--- END ---" into the assembly output.
   Only defined here if no earlier section has already defined FE_.  The
   expansion carries its own trailing ';'. */
#ifndef FE_
#define FE_    __asm__ volatile ("--- END ---");
#endif

#endif /* !USE_MINIINTERPRETER */

#endif /* TAILCALLS_H */