diff options
Diffstat (limited to 'x64masm.asm')
-rwxr-xr-x | x64masm.asm | 69 |
1 files changed, 22 insertions, 47 deletions
diff --git a/x64masm.asm b/x64masm.asm index 4102c6a..76676a7 100755 --- a/x64masm.asm +++ b/x64masm.asm @@ -1,80 +1,55 @@ PUBLIC Baseline_Add PUBLIC Baseline_Sub .CODE - ALIGN 8 + ALIGN 8 Baseline_Add PROC - lea rdx, [rdx+8*rcx] lea r8, [r8+8*rcx] lea r9, [r9+8*rcx] - neg rcx ; rcx is negative index - test rcx, 2 ; this clears carry flag - jz $0@Baseline_Add - sub rcx, 2 - jmp $1@Baseline_Add - -$0@Baseline_Add: - jrcxz $2@Baseline_Add ; loop until rcx overflows and becomes zero + jz $1@Baseline_Add mov rax,[r8+8*rcx] - adc rax,[r9+8*rcx] + add rax,[r9+8*rcx] mov [rdx+8*rcx],rax +$0@Baseline_Add: mov rax,[r8+8*rcx+8] adc rax,[r9+8*rcx+8] mov [rdx+8*rcx+8],rax -$1@Baseline_Add: - mov rax,[r8+8*rcx+16] - adc rax,[r9+8*rcx+16] - mov [rdx+8*rcx+16],rax - mov rax,[r8+8*rcx+24] - adc rax,[r9+8*rcx+24] - mov [rdx+8*rcx+24],rax - - lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2 + lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2 + jrcxz $1@Baseline_Add ; loop until rcx overflows and becomes zero + mov rax,[r8+8*rcx] + adc rax,[r9+8*rcx] + mov [rdx+8*rcx],rax jmp $0@Baseline_Add - -$2@Baseline_Add: +$1@Baseline_Add: mov rax, 0 - setc al ; store carry into rax (return result register) - + adc rax, rax ; store carry into rax (return result register) ret Baseline_Add ENDP - ALIGN 8 + ALIGN 8 Baseline_Sub PROC - lea rdx, [rdx+8*rcx] lea r8, [r8+8*rcx] lea r9, [r9+8*rcx] - neg rcx ; rcx is negative index - test rcx, 2 ; this clears carry flag - jz $0@Baseline_Sub - sub rcx, 2 - jmp $1@Baseline_Sub - -$0@Baseline_Sub: - jrcxz $2@Baseline_Sub ; loop until rcx overflows and becomes zero + jz $1@Baseline_Sub mov rax,[r8+8*rcx] - sbb rax,[r9+8*rcx] + sub rax,[r9+8*rcx] mov [rdx+8*rcx],rax +$0@Baseline_Sub: mov rax,[r8+8*rcx+8] sbb rax,[r9+8*rcx+8] mov [rdx+8*rcx+8],rax -$1@Baseline_Sub: - mov rax,[r8+8*rcx+16] - sbb rax,[r9+8*rcx+16] - mov [rdx+8*rcx+16],rax - mov rax,[r8+8*rcx+24] - sbb rax,[r9+8*rcx+24] - mov [rdx+8*rcx+24],rax - - lea rcx,[rcx+4] ; advance index, avoid inc which causes slowdown on Intel Core 2 + lea rcx,[rcx+2] ; advance index, avoid inc which causes slowdown on Intel Core 2 + jrcxz $1@Baseline_Sub ; loop until rcx overflows and becomes zero + mov rax,[r8+8*rcx] + sbb rax,[r9+8*rcx] + mov [rdx+8*rcx],rax jmp $0@Baseline_Sub - -$2@Baseline_Sub: +$1@Baseline_Sub: mov rax, 0 - setc al ; store carry into rax (return result register) + adc rax, rax ; store carry into rax (return result register) ret Baseline_Sub ENDP |