1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
|
%
% (c) The University of Glasgow 2001
%
\section[ByteCodeGen]{Generate machine-code sequences for foreign import}
\begin{code}
module ByteCodeFFI ( mkMarshalCode, moan64 ) where
#include "HsVersions.h"
import Outputable
import SMRep ( CgRep(..), cgRepSizeW )
import ForeignCall ( CCallConv(..) )
import Panic
-- DON'T remove apparently unused imports here ..
-- there is ifdeffery below
import Control.Exception ( throwDyn )
import DATA_BITS ( Bits(..), shiftR, shiftL )
import Foreign ( newArray )
import Data.List ( mapAccumL )
import DATA_WORD ( Word8, Word32 )
import Foreign ( Ptr )
import System.IO.Unsafe ( unsafePerformIO )
import IO ( hPutStrLn, stderr )
import Debug.Trace ( trace )
\end{code}
%************************************************************************
%* *
\subsection{The platform-dependent marshall-code-generator.}
%* *
%************************************************************************
\begin{code}
moan64 :: String -> SDoc -> a
moan64 msg pp_rep
= unsafePerformIO (
hPutStrLn stderr (
"\nGHCi's bytecode generation machinery can't handle 64-bit\n" ++
"code properly yet. You can work around this for the time being\n" ++
"by compiling this module and all those it imports to object code,\n" ++
"and re-starting your GHCi session. The panic below contains information,\n" ++
"intended for the GHC implementors, about the exact place where GHC gave up.\n"
)
)
`seq`
pprPanic msg pp_rep
-- For sparc_TARGET_ARCH, i386_TARGET_ARCH, etc.
#include "nativeGen/NCG.h"
{-
Make a piece of code which expects to see the Haskell stack
looking like this. It is given a pointer to the lowest word in
the stack -- presumably the tag of the placeholder.
<arg_n>
...
<arg_1>
Addr# address_of_C_fn
<placeholder-for-result#> (must be an unboxed type)
We cope with both ccall and stdcall for the C fn. However, this code
itself expects only to be called using the ccall convention -- that is,
we don't clear our own (single) arg off the C stack.
-}
mkMarshalCode :: CCallConv
-> (Int, CgRep) -> Int -> [(Int, CgRep)]
-> IO (Ptr Word8)
mkMarshalCode cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
= let bytes = mkMarshalCode_wrk cconv (r_offW, r_rep)
addr_offW arg_offs_n_reps
in trace (show bytes) $ Foreign.newArray bytes
mkMarshalCode_wrk :: CCallConv
-> (Int, CgRep) -> Int -> [(Int, CgRep)]
-> [Word8]
mkMarshalCode_wrk cconv (r_offW, r_rep) addr_offW arg_offs_n_reps
#if i386_TARGET_ARCH
= let -- Don't change this without first consulting Intel Corp :-)
bytes_per_word = 4
offsets_to_pushW
= concat
[ -- reversed because x86 is little-endian
reverse [a_offW .. a_offW + cgRepSizeW a_rep - 1]
-- reversed because args are pushed L -> R onto C stack
| (a_offW, a_rep) <- reverse arg_offs_n_reps
]
arguments_size = bytes_per_word * length offsets_to_pushW
#if darwin_TARGET_OS
-- Darwin: align stack frame size to a multiple of 16 bytes
stack_frame_size = (arguments_size + 15) .&. complement 15
stack_frame_pad = stack_frame_size - arguments_size
#else
stack_frame_size = arguments_size
#endif
-- some helpers to assemble x86 insns.
movl_offespmem_esi offB -- movl offB(%esp), %esi
= [0x8B, 0xB4, 0x24] ++ lit32 offB
movl_offesimem_ecx offB -- movl offB(%esi), %ecx
= [0x8B, 0x8E] ++ lit32 offB
save_regs -- pushl all intregs except %esp
= [0x50, 0x53, 0x51, 0x52, 0x56, 0x57, 0x55]
restore_regs -- popl ditto
= [0x5D, 0x5F, 0x5E, 0x5A, 0x59, 0x5B, 0x58]
pushl_ecx -- pushl %ecx
= [0x51]
call_star_ecx -- call * %ecx
= [0xFF, 0xD1]
add_lit_esp lit -- addl $lit, %esp
= [0x81, 0xC4] ++ lit32 lit
movl_eax_offesimem offB -- movl %eax, offB(%esi)
= [0x89, 0x86] ++ lit32 offB
movl_edx_offesimem offB -- movl %edx, offB(%esi)
= [0x89, 0x96] ++ lit32 offB
ret -- ret
= [0xC3]
fstpl_offesimem offB -- fstpl offB(%esi)
= [0xDD, 0x9E] ++ lit32 offB
fstps_offesimem offB -- fstps offB(%esi)
= [0xD9, 0x9E] ++ lit32 offB
{-
2 0000 8BB42478 movl 0x12345678(%esp), %esi
2 563412
3 0007 8B8E7856 movl 0x12345678(%esi), %ecx
3 3412
4
5 000d 50535152 pushl %eax ; pushl %ebx ; pushl %ecx ; pushl %edx
6 0011 565755 pushl %esi ; pushl %edi ; pushl %ebp
7
8 0014 5D5F5E popl %ebp ; popl %edi ; popl %esi
9 0017 5A595B58 popl %edx ; popl %ecx ; popl %ebx ; popl %eax
10
11 001b 51 pushl %ecx
12 001c FFD1 call * %ecx
13
14 001e 81C47856 addl $0x12345678, %esp
14 3412
15 0024 89867856 movl %eax, 0x12345678(%esi)
15 3412
16 002a 89967856 movl %edx, 0x12345678(%esi)
16 3412
17
18 0030 DD967856 fstl 0x12345678(%esi)
18 3412
19 0036 DD9E7856 fstpl 0x12345678(%esi)
19 3412
20 003c D9967856 fsts 0x12345678(%esi)
20 3412
21 0042 D99E7856 fstps 0x12345678(%esi)
18
19 0030 C3 ret
20
-}
in
--trace (show (map fst arg_offs_n_reps))
(
{- On entry, top of C stack 0(%esp) is the RA and 4(%esp) is
arg passed from the interpreter.
Push all callee saved regs. Push all of them anyway ...
pushl %eax
pushl %ebx
pushl %ecx
pushl %edx
pushl %esi
pushl %edi
pushl %ebp
-}
save_regs
{- Now 28+0(%esp) is RA and 28+4(%esp) is the arg (the H stack ptr).
We'll use %esi as a temporary to point at the H stack, and
%ecx as a temporary to copy via.
movl 28+4(%esp), %esi
-}
++ movl_offespmem_esi 32
#if darwin_TARGET_OS
{- On Darwin, add some padding so that the stack stays aligned. -}
++ (if stack_frame_pad /= 0
then add_lit_esp (-stack_frame_pad)
else [])
#endif
{- For each arg in args_offs_n_reps, examine the associated
CgRep to determine how many words there are. This gives a
bunch of offsets on the H stack to copy to the C stack:
movl off1(%esi), %ecx
pushl %ecx
-}
++ concatMap (\offW -> movl_offesimem_ecx (bytes_per_word * offW)
++ pushl_ecx)
offsets_to_pushW
{- Get the addr to call into %ecx, bearing in mind that there's
an Addr# tag at the indicated location, and do the call:
movl 4*(1 /*tag*/ +addr_offW)(%esi), %ecx
call * %ecx
-}
++ movl_offesimem_ecx (bytes_per_word * addr_offW)
++ call_star_ecx
{- Nuke the args just pushed and re-establish %esi at the
H-stack ptr:
addl $4*number_of_args_pushed, %esp (ccall only)
movl 28+4(%esp), %esi
-}
++ (if cconv /= StdCallConv
then add_lit_esp stack_frame_size
else [])
++ movl_offespmem_esi 32
{- Depending on what the return type is, get the result
from %eax or %edx:%eax or %st(0).
movl %eax, 4(%esi) -- assuming tagged result
or
movl %edx, 4(%esi)
movl %eax, 8(%esi)
or
fstpl 4(%esi)
or
fstps 4(%esi)
-}
++ let i32 = movl_eax_offesimem 0
i64 = movl_eax_offesimem 0 ++ movl_edx_offesimem 4
f32 = fstps_offesimem 0
f64 = fstpl_offesimem 0
in
case r_rep of
NonPtrArg -> i32
DoubleArg -> f64
FloatArg -> f32
-- LongArg -> i64
VoidArg -> []
other -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(x86)"
(ppr r_rep)
{- Restore all the pushed regs and go home.
pushl %ebp
pushl %edi
pushl %esi
pushl %edx
pushl %ecx
pushl %ebx
pushl %eax
ret
-}
++ restore_regs
++ ret
)
#elif x86_64_TARGET_ARCH
=
-- the address of the H stack is in %rdi. We need to move it out, so
-- we can use %rdi as an arg reg for the following call:
pushq_rbp ++
movq_rdi_rbp ++
-- ####### load / push the args
let
(stack_args, fregs_unused, reg_loads) =
load_arg_regs arg_offs_n_reps int_loads float_loads []
tot_arg_size = bytes_per_word * length stack_args
-- On entry to the called function, %rsp should be aligned
-- on a 16-byte boundary +8 (i.e. the first stack arg after
-- the return address is 16-byte aligned). In STG land
-- %rsp is kept 16-byte aligned (see StgCRun.c), so we just
-- need to make sure we push a multiple of 16-bytes of args,
-- plus the return address, to get the correct alignment.
(real_size, adjust_rsp)
| tot_arg_size `rem` 16 == 0 = (tot_arg_size, [])
| otherwise = (tot_arg_size + 8, subq_lit_rsp 8)
(stack_pushes, stack_words) =
push_args stack_args [] 0
-- we need to know the number of SSE regs used in the call, see later
n_sse_regs_used = length float_loads - length fregs_unused
in
concat reg_loads
++ adjust_rsp
++ concat stack_pushes -- push in reverse order
-- ####### make the call
-- use %r10 to make the call, because we don't have to save it.
-- movq 8*addr_offW(%rbp), %r10
++ movq_rbpoff_r10 (bytes_per_word * addr_offW)
-- The x86_64 ABI requires us to set %al to the number of SSE
-- registers that contain arguments, if the called routine
-- is a varargs function. We don't know whether it's a
-- varargs function or not, so we have to assume it is.
--
-- It's not safe to omit this assignment, even if the number
-- of SSE regs in use is zero. If %al is larger than 8
-- on entry to a varargs function, seg faults ensue.
++ movq_lit_rax n_sse_regs_used
++ call_star_r10
-- pop the args from the stack, only in ccall mode
-- (in stdcall the callee does it).
++ (if cconv /= StdCallConv
then addq_lit_rsp real_size
else [])
-- ####### place the result in the right place and return
++ assign_result
++ popq_rbp
++ ret
where
bytes_per_word = 8
-- int arg regs: rdi,rsi,rdx,rcx,r8,r9
-- flt arg regs: xmm0..xmm7
int_loads = [ movq_rbpoff_rdi, movq_rbpoff_rsi, movq_rbpoff_rdx,
movq_rbpoff_rcx, movq_rbpoff_r8, movq_rbpoff_r9 ]
float_loads = [ (mov_f32_rbpoff_xmm n, mov_f64_rbpoff_xmm n) | n <- [0..7] ]
load_arg_regs args [] [] code = (args, [], code)
load_arg_regs [] iregs fregs code = ([], fregs, code)
load_arg_regs ((off,rep):args) iregs fregs code
| FloatArg <- rep, ((mov_f32,_):frest) <- fregs =
load_arg_regs args iregs frest (mov_f32 (bytes_per_word * off) : code)
| DoubleArg <- rep, ((_,mov_f64):frest) <- fregs =
load_arg_regs args iregs frest (mov_f64 (bytes_per_word * off) : code)
| (mov_reg:irest) <- iregs =
load_arg_regs args irest fregs (mov_reg (bytes_per_word * off) : code)
| otherwise =
push_this_arg
where
push_this_arg = ((off,rep):args',fregs', code')
where (args',fregs',code') = load_arg_regs args iregs fregs code
push_args [] code pushed_words = (code, pushed_words)
push_args ((off,rep):args) code pushed_words
| FloatArg <- rep =
push_args args (push_f32_rbpoff (bytes_per_word * off) : code)
(pushed_words+1)
| DoubleArg <- rep =
push_args args (push_f64_rbpoff (bytes_per_word * off) : code)
(pushed_words+1)
| otherwise =
push_args args (pushq_rbpoff (bytes_per_word * off) : code)
(pushed_words+1)
assign_result =
case r_rep of
DoubleArg -> f64
FloatArg -> f32
VoidArg -> []
_other -> i64
where
i64 = movq_rax_rbpoff 0
f32 = mov_f32_xmm0_rbpoff 0
f64 = mov_f64_xmm0_rbpoff 0
-- ######### x86_64 machine code:
-- 0: 48 89 fd mov %rdi,%rbp
-- 3: 48 8b bd 78 56 34 12 mov 0x12345678(%rbp),%rdi
-- a: 48 8b b5 78 56 34 12 mov 0x12345678(%rbp),%rsi
-- 11: 48 8b 95 78 56 34 12 mov 0x12345678(%rbp),%rdx
-- 18: 48 8b 8d 78 56 34 12 mov 0x12345678(%rbp),%rcx
-- 1f: 4c 8b 85 78 56 34 12 mov 0x12345678(%rbp),%r8
-- 26: 4c 8b 8d 78 56 34 12 mov 0x12345678(%rbp),%r9
-- 2d: 4c 8b 95 78 56 34 12 mov 0x12345678(%rbp),%r10
-- 34: 48 c7 c0 78 56 34 12 mov $0x12345678,%rax
-- 3b: 48 89 85 78 56 34 12 mov %rax,0x12345678(%rbp)
-- 42: f3 0f 10 85 78 56 34 12 movss 0x12345678(%rbp),%xmm0
-- 4a: f2 0f 10 85 78 56 34 12 movsd 0x12345678(%rbp),%xmm0
-- 52: f3 0f 11 85 78 56 34 12 movss %xmm0,0x12345678(%rbp)
-- 5a: f2 0f 11 85 78 56 34 12 movsd %xmm0,0x12345678(%rbp)
-- 62: ff b5 78 56 34 12 pushq 0x12345678(%rbp)
-- 68: f3 44 0f 11 04 24 movss %xmm8,(%rsp)
-- 6e: f2 44 0f 11 04 24 movsd %xmm8,(%rsp)
-- 74: 48 81 ec 78 56 34 12 sub $0x12345678,%rsp
-- 7b: 48 81 c4 78 56 34 12 add $0x12345678,%rsp
-- 82: 41 ff d2 callq *%r10
-- 85: c3 retq
movq_rdi_rbp = [0x48,0x89,0xfd]
movq_rbpoff_rdi off = [0x48, 0x8b, 0xbd] ++ lit32 off
movq_rbpoff_rsi off = [0x48, 0x8b, 0xb5] ++ lit32 off
movq_rbpoff_rdx off = [0x48, 0x8b, 0x95] ++ lit32 off
movq_rbpoff_rcx off = [0x48, 0x8b, 0x8d] ++ lit32 off
movq_rbpoff_r8 off = [0x4c, 0x8b, 0x85] ++ lit32 off
movq_rbpoff_r9 off = [0x4c, 0x8b, 0x8d] ++ lit32 off
movq_rbpoff_r10 off = [0x4c, 0x8b, 0x95] ++ lit32 off
movq_lit_rax lit = [0x48, 0xc7, 0xc0] ++ lit32 lit
movq_rax_rbpoff off = [0x48, 0x89, 0x85] ++ lit32 off
mov_f32_rbpoff_xmm n off = [0xf3, 0x0f, 0x10, 0x85 + n`shiftL`3] ++ lit32 off
mov_f64_rbpoff_xmm n off = [0xf2, 0x0f, 0x10, 0x85 + n`shiftL`3] ++ lit32 off
mov_f32_xmm0_rbpoff off = [0xf3, 0x0f, 0x11, 0x85] ++ lit32 off
mov_f64_xmm0_rbpoff off = [0xf2, 0x0f, 0x11, 0x85] ++ lit32 off
pushq_rbpoff off = [0xff, 0xb5] ++ lit32 off
push_f32_rbpoff off =
mov_f32_rbpoff_xmm 8 off ++ -- movss off(%rbp), %xmm8
[0xf3, 0x44, 0x0f, 0x11, 0x04, 0x24] ++ -- movss %xmm8, (%rsp)
subq_lit_rsp 8 -- subq $8, %rsp
push_f64_rbpoff off =
mov_f64_rbpoff_xmm 8 off ++ -- movsd off(%rbp), %xmm8
[0xf2, 0x44, 0x0f, 0x11, 0x04, 0x24] ++ -- movsd %xmm8, (%rsp)
subq_lit_rsp 8 -- subq $8, %rsp
subq_lit_rsp lit = [0x48, 0x81, 0xec] ++ lit32 lit
addq_lit_rsp lit = [0x48, 0x81, 0xc4] ++ lit32 lit
call_star_r10 = [0x41,0xff,0xd2]
ret = [0xc3]
pushq_rbp = [0x55]
popq_rbp = [0x5d]
#elif sparc_TARGET_ARCH
= let -- At least for sparc V8
bytes_per_word = 4
-- speaks for itself
w32_to_w8s_bigEndian :: Word32 -> [Word8]
w32_to_w8s_bigEndian w
= [fromIntegral (0xFF .&. (w `shiftR` 24)),
fromIntegral (0xFF .&. (w `shiftR` 16)),
fromIntegral (0xFF .&. (w `shiftR` 8)),
fromIntegral (0xFF .&. w)]
offsets_to_pushW
= concat
[ [a_offW .. a_offW + cgRepSizeW a_rep - 1]
| (a_offW, a_rep) <- arg_offs_n_reps
]
total_argWs = length offsets_to_pushW
argWs_on_stack = if total_argWs > 6 then total_argWs - 6
else 0
-- The stack pointer must be kept 8-byte aligned, which means
-- we need to calculate this quantity too
argWs_on_stack_ROUNDED_UP
| odd argWs_on_stack = 1 + argWs_on_stack
| otherwise = argWs_on_stack
-- some helpers to assemble sparc insns.
-- REGS
iReg, oReg, gReg, fReg :: Int -> Word32
iReg = fromIntegral . (+ 24)
oReg = fromIntegral . (+ 8)
gReg = fromIntegral . (+ 0)
fReg = fromIntegral
sp = oReg 6
i0 = iReg 0
i7 = iReg 7
o0 = oReg 0
o1 = oReg 1
o7 = oReg 7
g0 = gReg 0
g1 = gReg 1
f0 = fReg 0
f1 = fReg 1
-- INSN templates
insn_r_r_i :: Word32 -> Word32 -> Word32 -> Int -> Word32
insn_r_r_i op3 rs1 rd imm13
= (3 `shiftL` 30)
.|. (rs1 `shiftL` 25)
.|. (op3 `shiftL` 19)
.|. (rd `shiftL` 14)
.|. (1 `shiftL` 13)
.|. mkSimm13 imm13
insn_r_i_r :: Word32 -> Word32 -> Int -> Word32 -> Word32
insn_r_i_r op3 rs1 imm13 rd
= (2 `shiftL` 30)
.|. (rd `shiftL` 25)
.|. (op3 `shiftL` 19)
.|. (rs1 `shiftL` 14)
.|. (1 `shiftL` 13)
.|. mkSimm13 imm13
mkSimm13 :: Int -> Word32
mkSimm13 imm13
= let imm13w = (fromIntegral imm13) :: Word32
in imm13w .&. 0x1FFF
-- REAL (non-synthetic) insns
-- or %rs1, %rs2, %rd
mkOR :: Word32 -> Word32 -> Word32 -> Word32
mkOR rs1 rs2 rd
= (2 `shiftL` 30)
.|. (rd `shiftL` 25)
.|. (op3_OR `shiftL` 19)
.|. (rs1 `shiftL` 14)
.|. (0 `shiftL` 13)
.|. rs2
where op3_OR = 2 :: Word32
-- ld(int) [%rs + imm13], %rd
mkLD rs1 imm13 rd = insn_r_r_i 0x00{-op3_LD-} rd rs1 imm13
-- st(int) %rs, [%rd + imm13]
mkST = insn_r_r_i 0x04 -- op3_ST
-- st(float) %rs, [%rd + imm13]
mkSTF = insn_r_r_i 0x24 -- op3_STF
-- jmpl %rs + imm13, %rd
mkJMPL = insn_r_i_r 0x38 -- op3_JMPL
-- save %rs + imm13, %rd
mkSAVE = insn_r_i_r 0x3C -- op3_SAVE
-- restore %rs + imm13, %rd
mkRESTORE = insn_r_i_r 0x3D -- op3_RESTORE
-- SYNTHETIC insns
mkNOP = mkOR g0 g0 g0
mkCALL reg = mkJMPL reg 0 o7
mkRET = mkJMPL i7 8 g0
mkRESTORE_TRIVIAL = mkRESTORE g0 0 g0
in
--trace (show (map fst arg_offs_n_reps))
concatMap w32_to_w8s_bigEndian (
{- On entry, %o0 is the arg passed from the interpreter. After
the initial save insn, it will be in %i0. Studying the sparc
docs one would have thought that the minimum frame size is 92
bytes, but gcc always uses at least 112, and indeed there are
segfaults a-plenty with 92. So I use 112 here as well. I
don't understand why, tho.
-}
[mkSAVE sp (- ({-92-}112 + 4*argWs_on_stack_ROUNDED_UP)) sp]
{- For each arg in args_offs_n_reps, examine the associated
CgRep to determine how many words there are. This gives a
bunch of offsets on the H stack. Move the first 6 words into
%o0 .. %o5 and the rest on the stack, starting at [%sp+92].
Use %g1 as a temp.
-}
++ let doArgW (offW, wordNo)
| wordNo < 6
= [mkLD i0 (bytes_per_word * offW) (oReg wordNo)]
| otherwise
= [mkLD i0 (bytes_per_word * offW) g1,
mkST g1 sp (92 + bytes_per_word * (wordNo - 6))]
in
concatMap doArgW (zip offsets_to_pushW [0 ..])
{- Get the addr to call into %g1, bearing in mind that there's
an Addr# tag at the indicated location, and do the call:
ld [4*(1 /*tag*/ +addr_offW) + %i0], %g1
call %g1
-}
++ [mkLD i0 (bytes_per_word * addr_offW) g1,
mkCALL g1,
mkNOP]
{- Depending on what the return type is, get the result
from %o0 or %o1:%o0 or %f0 or %f1:%f0.
st %o0, [%i0 + 4] -- 32 bit int
or
st %o0, [%i0 + 4] -- 64 bit int
st %o1, [%i0 + 8] -- or the other way round?
or
st %f0, [%i0 + 4] -- 32 bit float
or
st %f0, [%i0 + 4] -- 64 bit float
st %f1, [%i0 + 8] -- or the other way round?
-}
++ let i32 = [mkST o0 i0 0]
i64 = [mkST o0 i0 0, mkST o1 i0 4]
f32 = [mkSTF f0 i0 0]
f64 = [mkSTF f0 i0 0, mkSTF f1 i0 4]
in
case r_rep of
NonPtrArg -> i32
DoubleArg -> f64
FloatArg -> f32
VoidArg -> []
other -> moan64 "ByteCodeFFI.mkMarshalCode_wrk(sparc)"
(ppr r_rep)
++ [mkRET,
mkRESTORE_TRIVIAL] -- this is in the delay slot of the RET
)
#elif powerpc_TARGET_ARCH && darwin_TARGET_OS
= let
bytes_per_word = 4
-- speaks for itself
w32_to_w8s_bigEndian :: Word32 -> [Word8]
w32_to_w8s_bigEndian w
= [fromIntegral (0xFF .&. (w `shiftR` 24)),
fromIntegral (0xFF .&. (w `shiftR` 16)),
fromIntegral (0xFF .&. (w `shiftR` 8)),
fromIntegral (0xFF .&. w)]
-- addr and result bits offsetsW
a_off = addr_offW * bytes_per_word
result_off = r_offW * bytes_per_word
linkageArea = 24
parameterArea = sum [ cgRepSizeW a_rep * bytes_per_word
| (_, a_rep) <- arg_offs_n_reps ]
savedRegisterArea = 4
frameSize = padTo16 (linkageArea + max parameterArea 32 + savedRegisterArea)
padTo16 x = case x `mod` 16 of
0 -> x
y -> x - y + 16
pass_parameters [] _ _ = []
pass_parameters ((a_offW, a_rep):args) nextFPR offsetW =
let
haskellArgOffset = a_offW * bytes_per_word
offsetW' = offsetW + cgRepSizeW a_rep
pass_word w
| offsetW + w < 8 =
[0x801f0000 -- lwz rX, src(r31)
.|. (fromIntegral src .&. 0xFFFF)
.|. (fromIntegral (offsetW+w+3) `shiftL` 21)]
| otherwise =
[0x801f0000 -- lwz r0, src(r31)
.|. (fromIntegral src .&. 0xFFFF),
0x90010000 -- stw r0, dst(r1)
.|. (fromIntegral dst .&. 0xFFFF)]
where
src = haskellArgOffset + w*bytes_per_word
dst = linkageArea + (offsetW+w) * bytes_per_word
in
case a_rep of
FloatArg | nextFPR < 14 ->
(0xc01f0000 -- lfs fX, haskellArgOffset(r31)
.|. (fromIntegral haskellArgOffset .&. 0xFFFF)
.|. (fromIntegral nextFPR `shiftL` 21))
: pass_parameters args (nextFPR+1) offsetW'
DoubleArg | nextFPR < 14 ->
(0xc81f0000 -- lfd fX, haskellArgOffset(r31)
.|. (fromIntegral haskellArgOffset .&. 0xFFFF)
.|. (fromIntegral nextFPR `shiftL` 21))
: pass_parameters args (nextFPR+1) offsetW'
_ ->
concatMap pass_word [0 .. cgRepSizeW a_rep - 1]
++ pass_parameters args nextFPR offsetW'
gather_result = case r_rep of
VoidArg -> []
FloatArg ->
[0xd03f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
-- stfs f1, result_off(r31)
DoubleArg ->
[0xd83f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
-- stfd f1, result_off(r31)
_ | cgRepSizeW r_rep == 2 ->
[0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF),
0x909f0000 .|. (fromIntegral (result_off+4) .&. 0xFFFF)]
-- stw r3, result_off(r31)
-- stw r4, result_off+4(r31)
_ | cgRepSizeW r_rep == 1 ->
[0x907f0000 .|. (fromIntegral result_off .&. 0xFFFF)]
-- stw r3, result_off(r31)
in
concatMap w32_to_w8s_bigEndian $ [
0x7c0802a6, -- mflr r0
0x93e1fffc, -- stw r31,-4(r1)
0x90010008, -- stw r0,8(r1)
0x94210000 .|. (fromIntegral (-frameSize) .&. 0xFFFF),
-- stwu r1, -frameSize(r1)
0x7c7f1b78 -- mr r31, r3
] ++ pass_parameters arg_offs_n_reps 1 0 ++ [
0x819f0000 .|. (fromIntegral a_off .&. 0xFFFF),
-- lwz r12, a_off(r31)
0x7d8903a6, -- mtctr r12
0x4e800421 -- bctrl
] ++ gather_result ++ [
0x80210000, -- lwz r1, 0(r1)
0x83e1fffc, -- lwz r31, -4(r1)
0x80010008, -- lwz r0, 8(r1)
0x7c0803a6, -- mtlr r0
0x4e800020 -- blr
]
#elif powerpc_TARGET_ARCH && linux_TARGET_OS
-- All offsets here are measured in Words (not bytes). This includes
-- arguments to the load/store machine code generators, alignment numbers
-- and the final 'framesize' among others.
= concatMap w32_to_w8s_bigEndian $ [
0x7c0802a6, -- mflr r0
0x93e1fffc, -- stw r31,-4(r1)
0x90010008, -- stw r0,8(r1)
0x94210000 .|. offset (-framesize), -- stwu r1, -frameSize(r1)
0x7c7f1b78 -- mr r31, r3
] ++ pass_parameters ++ -- pass the parameters
loadWord 12 addr_offW ++ [ -- lwz r12, a_off(r31)
0x7d8903a6, -- mtctr r12
0x4e800421 -- bctrl
] ++ gather_result ++ [ -- save the return value
0x80210000, -- lwz r1, 0(r1)
0x83e1fffc, -- lwz r31, -4(r1)
0x80010008, -- lwz r0, 8(r1)
0x7c0803a6, -- mtlr r0
0x4e800020 -- blr
]
where
gather_result :: [Word32]
gather_result = case r_rep of
VoidArg -> []
FloatArg -> storeFloat 1 r_offW
DoubleArg -> storeDouble 1 r_offW
LongArg -> storeLong 3 r_offW
_ -> storeWord 3 r_offW
pass_parameters :: [Word32]
pass_parameters = concat params
-- vector aligned (4 word = 16 bytes) with 8 extra words of buffer space
framesize = alignedTo 4 (argsize + 8)
((_,_,argsize), params) = mapAccumL loadparam (3,1,2) arg_offs_n_reps
-- handle one argument, returning machine code and the updated state
loadparam :: (Int, Int, Int) -> (Int, CgRep) ->
((Int, Int, Int), [Word32])
loadparam (gpr, fpr, stack) (ofs, rep) = case rep of
FloatArg | fpr <= 8 -> ( (gpr, fpr + 1, stack), loadFloat fpr ofs )
FloatArg -> ( (gpr, fpr, stack + 1), stackWord stack ofs )
DoubleArg | fpr <= 8 -> ( (gpr, fpr + 1, stack), loadDouble fpr ofs )
DoubleArg -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
LongArg | even gpr -> loadparam (gpr + 1, fpr, stack) (ofs, rep)
LongArg | gpr <= 9 -> ( (gpr + 2, fpr, stack), loadLong gpr ofs )
LongArg -> ( (gpr, fpr, astack + 2), stackLong astack ofs )
_ | gpr <= 10 -> ( (gpr + 1, fpr, stack), loadWord gpr ofs )
_ -> ( (gpr, fpr, stack + 1), stackWord stack ofs )
where astack = alignedTo 2 stack
alignedTo :: Int -> Int -> Int
alignedTo alignment x = case x `mod` alignment of
0 -> x
y -> x - y + alignment
-- convenience macros to do multiple-instruction data moves
stackWord dst src = loadWord 0 src ++ storeWordC 0 dst
stackLong dst src = stackWord dst src ++ stackWord (dst + 1) (src + 1)
loadLong dst src = loadWord dst src ++ loadWord (dst + 1) (src + 1)
storeLong dst src = storeWord dst src ++ storeWord (dst + 1) (src + 1)
-- load data from the Haskell stack (relative to r31)
loadFloat = loadstoreInstr 0xc01f0000 -- lfs fpr, ofs(r31)
loadDouble = loadstoreInstr 0xc81f0000 -- lfd fpr, ofs(r31)
loadWord = loadstoreInstr 0x801f0000 -- lwz gpr, ofs(r31)
-- store data to the Haskell stack (relative to r31)
storeFloat = loadstoreInstr 0xd01f0000 -- stfs fpr, ofs(r31)
storeDouble = loadstoreInstr 0xd81f0000 -- stfd fpr, ofs(r31)
storeWord = loadstoreInstr 0x901f0000 -- stw gpr, ofs(r31)
-- store data to the C stack (relative to r1)
storeWordC = loadstoreInstr 0x90010000 -- stw gpr, ofs(r1)
-- machine code building blocks
loadstoreInstr :: Word32 -> Int -> Int -> [Word32]
loadstoreInstr code reg ofs = [ code .|. register reg .|. offset ofs ]
register :: Int -> Word32
register reg = fromIntegral reg `shiftL` 21
offset :: Int -> Word32
offset ofs = fromIntegral (ofs * 4) .&. 0xFFFF
-- speaks for itself
w32_to_w8s_bigEndian :: Word32 -> [Word8]
w32_to_w8s_bigEndian w = [fromIntegral (0xFF .&. (w `shiftR` 24)),
fromIntegral (0xFF .&. (w `shiftR` 16)),
fromIntegral (0xFF .&. (w `shiftR` 8)),
fromIntegral (0xFF .&. w)]
#else
= throwDyn (InstallationError "foreign import is not implemented for GHCi on this platform.")
#endif
#if i386_TARGET_ARCH || x86_64_TARGET_ARCH
lit32 :: Int -> [Word8]
lit32 i = let w32 = (fromIntegral i) :: Word32
in map (fromIntegral . ( .&. 0xFF))
[w32, w32 `shiftR` 8,
w32 `shiftR` 16, w32 `shiftR` 24]
#endif
\end{code}
|