From 50633f8f4e8d629e65d96f83abd9f99ca2cad463 Mon Sep 17 00:00:00 2001 From: Lorry Tar Creator Date: Tue, 26 Apr 2016 20:55:49 +0000 Subject: gzip-1.8 --- lib/match.c | 774 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 774 insertions(+) create mode 100644 lib/match.c (limited to 'lib/match.c') diff --git a/lib/match.c b/lib/match.c new file mode 100644 index 0000000..41fb03e --- /dev/null +++ b/lib/match.c @@ -0,0 +1,774 @@ +/* match.s -- optional optimized asm version of longest match in deflate.c + + Copyright (C) 2002, 2006, 2009-2016 Free Software Foundation, Inc. + Copyright (C) 1992-1993 Jean-loup Gailly + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* + * The 68020 version has been written by Francesco Potorti` + * with adaptations by Carsten Steger , + * Andreas Schwab and + * Kristoffer Eriksson + * + * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002. + * Unwind directives and some reformatting for better readability added by + * David Mosberger-Tang . + */ + +/* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix + * external symbols with an underline character '_'. + */ +#ifdef NO_UNDERLINE +# define _prev prev +# define _window window +# define _match_start match_start +# define _prev_length prev_length +# define _good_match good_match +# define _nice_match nice_match +# define _strstart strstart +# define _max_chain_length max_chain_length + +# define _match_init match_init +# define _longest_match longest_match +#endif + +#ifdef DYN_ALLOC + error: DYN_ALLOC not yet supported in match.s +#endif + +/* On x86-64, Sun C 5.13 (Oracle Solaris Studio 12.4) 'cc -E -m64' + defines i386 when compiling .s or .S files! Luckily it also + defines __x86_64__. See Bug#23133. */ +#if ((defined i386 || defined _I386 || defined __i386 || defined __i386__) \ + && !defined __x86_64__) + +/* This version is for 386 Unix or OS/2 in 32 bit mode. + * Warning: it uses the AT&T syntax: mov source,dest + * This file is only optional. If you want to force the C version, + * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string. + * If you have reduced WSIZE in gzip.h, then change its value below. + * This version assumes static allocation of the arrays (-DDYN_ALLOC not used). + */ + + .file "match.S" + +#define MAX_MATCH 258 +#define MAX_MATCH2 $128 /* MAX_MATCH/2-1 */ +#define MIN_MATCH 3 +#define WSIZE $32768 +#define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1 + + .globl _match_init + .globl _longest_match + + .text + +_match_init: + ret + +/*----------------------------------------------------------------------- + * Set match_start to the longest match starting at the given string and + * return its length. Matches shorter or equal to prev_length are discarded, + * in which case the result is equal to prev_length and match_start is + * garbage. + * IN assertions: cur_match is the head of the hash chain for the current + * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 + */ + +_longest_match: /* int longest_match(cur_match) */ + +#define cur_match 20(%esp) + /* return address */ /* esp+16 */ + push %ebp /* esp+12 */ + push %edi /* esp+8 */ + push %esi /* esp+4 */ + push %ebx /* esp */ + +/* + * match equ esi + * scan equ edi + * chain_length equ ebp + * best_len equ ebx + * limit equ edx + */ + mov cur_match,%esi + mov _max_chain_length,%ebp /* chain_length = max_chain_length */ + mov _strstart,%edi + mov %edi,%edx + sub MAX_DIST,%edx /* limit = strstart-MAX_DIST */ + jae limit_ok + sub %edx,%edx /* limit = NIL */ +limit_ok: + add $2+_window,%edi /* edi = offset(window+strstart+2) */ + mov _prev_length,%ebx /* best_len = prev_length */ + movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */ + movw -2(%edi),%cx /* cx = scan[0..1] */ + cmp _good_match,%ebx /* do we have a good match already? */ + jb do_scan + shr $2,%ebp /* chain_length >>= 2 */ + jmp do_scan + + .align 4 +long_loop: +/* at this point, edi == scan+2, esi == cur_match */ + movw -3(%ebx,%edi),%ax /* ax = scan[best_len-1..best_len] */ + movw -2(%edi),%cx /* cx = scan[0..1] */ +short_loop: +/* + * at this point, di == scan+2, si == cur_match, + * ax = scan[best_len-1..best_len] and cx = scan[0..1] + */ + and WSIZE-1, %esi + movw _prev(%esi,%esi),%si /* cur_match = prev[cur_match] */ + /* top word of esi is still 0 */ + cmp %edx,%esi /* cur_match <= limit ? */ + jbe the_end + dec %ebp /* --chain_length */ + jz the_end +do_scan: + cmpw _window-1(%ebx,%esi),%ax/* check match at best_len-1 */ + jne short_loop + cmpw _window(%esi),%cx /* check min_match_length match */ + jne short_loop + + lea _window+2(%esi),%esi /* si = match */ + mov %edi,%eax /* ax = scan+2 */ + mov MAX_MATCH2,%ecx /* scan for at most MAX_MATCH bytes */ + rep; cmpsw /* loop until mismatch */ + je maxmatch /* match of length MAX_MATCH? */ +mismatch: + movb -2(%edi),%cl /* mismatch on first or second byte? */ + subb -2(%esi),%cl /* cl = 0 if first bytes equal */ + xchg %edi,%eax /* edi = scan+2, eax = end of scan */ + sub %edi,%eax /* eax = len */ + sub %eax,%esi /* esi = cur_match + 2 + offset(window) */ + sub $2+_window,%esi /* esi = cur_match */ + subb $1,%cl /* set carry if cl == 0 (cannot use DEC) */ + adc $0,%eax /* eax = carry ? len+1 : len */ + cmp %ebx,%eax /* len > best_len ? */ + jle long_loop + mov %esi,_match_start /* match_start = cur_match */ + mov %eax,%ebx /* ebx = best_len = len */ + cmp _nice_match,%eax /* len >= nice_match ? */ + jl long_loop +the_end: + mov %ebx,%eax /* result = eax = best_len */ + pop %ebx + pop %esi + pop %edi + pop %ebp + ret +maxmatch: + cmpsb + jmp mismatch + +#else + +/* ======================== 680x0 version ================================= */ + +#if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__) +# ifndef mc68000 +# define mc68000 +# endif +#endif + +#if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68) +# ifndef mc68020 +# define mc68020 +# endif +#endif + +#if defined(mc68020) || defined(mc68000) + +#if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK) +# define UNALIGNED_OK +#endif + +#ifdef sysV68 /* Try Motorola Delta style */ + +# define GLOBAL(symbol) global symbol +# define TEXT text +# define FILE(filename) file filename +# define invert_maybe(src,dst) dst,src +# define imm(data) &data +# define reg(register) %register + +# define addl add.l +# define addql addq.l +# define blos blo.b +# define bhis bhi.b +# define bras bra.b +# define clrl clr.l +# define cmpmb cmpm.b +# define cmpw cmp.w +# define cmpl cmp.l +# define lslw lsl.w +# define lsrl lsr.l +# define movel move.l +# define movew move.w +# define moveb move.b +# define moveml movem.l +# define subl sub.l +# define subw sub.w +# define subql subq.l + +# define IndBase(bd,An) (bd,An) +# define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l) +# define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w) +# define predec(An) -(An) +# define postinc(An) (An)+ + +#else /* default style (Sun 3, NeXT, Amiga, Atari) */ + +# define GLOBAL(symbol) .globl symbol +# define TEXT .text +# define FILE(filename) .even +# define invert_maybe(src,dst) src,dst +# if defined(sun) || defined(mc68k) +# define imm(data) #data +# else +# define imm(data) \#data +# endif +# define reg(register) register + +# define blos bcss +# if defined(sun) || defined(mc68k) +# define movel movl +# define movew movw +# define moveb movb +# endif +# define IndBase(bd,An) An@(bd) +# define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l) +# define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w) +# define predec(An) An@- +# define postinc(An) An@+ + +#endif /* styles */ + +#define Best_Len reg(d0) /* unsigned */ +#define Cur_Match reg(d1) /* Ipos */ +#define Loop_Counter reg(d2) /* int */ +#define Scan_Start reg(d3) /* unsigned short */ +#define Scan_End reg(d4) /* unsigned short */ +#define Limit reg(d5) /* IPos */ +#define Chain_Length reg(d6) /* unsigned */ +#define Scan_Test reg(d7) +#define Scan reg(a0) /* *uch */ +#define Match reg(a1) /* *uch */ +#define Prev_Address reg(a2) /* *Pos */ +#define Scan_Ini reg(a3) /* *uch */ +#define Match_Ini reg(a4) /* *uch */ +#define Stack_Pointer reg(sp) + +#define MAX_MATCH 258 +#define MIN_MATCH 3 +#define WSIZE 32768 +#define MAX_DIST (WSIZE - MAX_MATCH - MIN_MATCH - 1) + + GLOBAL (_match_init) + GLOBAL (_longest_match) + + TEXT + + FILE ("match.S") + +_match_init: + rts + +/*----------------------------------------------------------------------- + * Set match_start to the longest match starting at the given string and + * return its length. Matches shorter or equal to prev_length are discarded, + * in which case the result is equal to prev_length and match_start is + * garbage. + * IN assertions: cur_match is the head of the hash chain for the current + * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 + */ + +/* int longest_match (cur_match) */ + +#ifdef UNALIGNED_OK +# define pushreg 15928 /* d2-d6/a2-a4 */ +# define popreg 7292 +#else +# define pushreg 16184 /* d2-d7/a2-a4 */ +# define popreg 7420 +#endif + +_longest_match: + movel IndBase(4,Stack_Pointer),Cur_Match + moveml imm(pushreg),predec(Stack_Pointer) + movel _max_chain_length,Chain_Length + movel _prev_length,Best_Len + movel imm(_prev),Prev_Address + movel imm(_window+MIN_MATCH),Match_Ini + movel _strstart,Limit + movel Match_Ini,Scan_Ini + addl Limit,Scan_Ini + subw imm(MAX_DIST),Limit + bhis L__limit_ok + clrl Limit +L__limit_ok: + cmpl invert_maybe(_good_match,Best_Len) + blos L__length_ok + lsrl imm(2),Chain_Length +L__length_ok: + subql imm(1),Chain_Length +#ifdef UNALIGNED_OK + movew IndBase(-MIN_MATCH,Scan_Ini),Scan_Start + movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End +#else + moveb IndBase(-MIN_MATCH,Scan_Ini),Scan_Start + lslw imm(8),Scan_Start + moveb IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start + moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End + lslw imm(8),Scan_End + moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End +#endif + bras L__do_scan + +L__long_loop: +#ifdef UNALIGNED_OK + movew IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End +#else + moveb IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End + lslw imm(8),Scan_End + moveb IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End +#endif + +L__short_loop: + lslw imm(1),Cur_Match + movew IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match + cmpw invert_maybe(Limit,Cur_Match) + dbls Chain_Length,L__do_scan + bras L__return + +L__do_scan: + movel Match_Ini,Match + addl Cur_Match,Match +#ifdef UNALIGNED_OK + cmpw invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End) + bne L__short_loop + cmpw invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start) + bne L__short_loop +#else + moveb IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test + lslw imm(8),Scan_Test + moveb IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test + cmpw invert_maybe(Scan_Test,Scan_End) + bne L__short_loop + moveb IndBase(-MIN_MATCH,Match),Scan_Test + lslw imm(8),Scan_Test + moveb IndBase(-MIN_MATCH+1,Match),Scan_Test + cmpw invert_maybe(Scan_Test,Scan_Start) + bne L__short_loop +#endif + + movew imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter + movel Scan_Ini,Scan +L__scan_loop: + cmpmb postinc(Match),postinc(Scan) + dbne Loop_Counter,L__scan_loop + + subl Scan_Ini,Scan + addql imm(MIN_MATCH-1),Scan + cmpl invert_maybe(Best_Len,Scan) + bls L__short_loop + movel Scan,Best_Len + movel Cur_Match,_match_start + cmpl invert_maybe(_nice_match,Best_Len) + blos L__long_loop +L__return: + moveml postinc(Stack_Pointer),imm(popreg) + rts + +#else + +# if defined (__ia64__) + +/* ======================== ia64 version ================================= */ + +/* + * 'longest_match.S' (assembly program for gzip for the IA-64 architecture) + * + * Optimised for McKinley, but with Merced-compatibility, such as MIB+MIB, used wherever + * possible. + * + * Copyright: Sverre Jarp (HP Labs) 2001-2002 + * + * See deflate.c for c-version + * Version 2 - Optimize the outer loop + */ + +#include + +#if __BYTE_ORDER == ____BIG_ENDIAN +#define first shl +#define second shr.u +#define count czx1.l +#else +#define first shr.u +#define second shl +#define count czx1.r +#endif + +// 24 rotating register (r32 - r55) + +#define s_vmatch0 r32 +#define s_vmatch1 r33 +#define s_vmatbst r34 +#define s_vmatbst1 r35 +#define s_amatblen r36 + +#define s_tm1 r56 +#define s_tm2 r57 +#define s_tm3 r58 +#define s_tm4 r59 +#define s_tm5 r60 +#define s_tm6 r61 +#define s_tm7 r62 +#define s_tm8 r63 + +#define s_vlen r31 +#define s_vstrstart r30 +#define s_vchainlen r29 +#define s_awinbest r28 +#define s_vcurmatch r27 +#define s_vlimit r26 +#define s_vscanend r25 +#define s_vscanend1 r24 +#define s_anicematch r23 +#define s_vscan0 r22 +#define s_vscan1 r21 + +#define s_aprev r20 +#define s_awindow r19 +#define s_amatchstart r18 +#define s_ascan r17 +#define s_amatch r16 +#define s_wmask r15 +#define s_ascanend r14 + +#define s_vspec_cmatch r11 // next iteration +#define s_lcsave r10 +#define s_prsave r9 +#define s_vbestlen r8 // return register + +#define s_vscan3 r3 +#define s_vmatch3 r2 + +#define p_no p2 +#define p_yes p3 +#define p_shf p4 // +#define p_bn2 p5 // Use in loop (indicating bestlen != 2) + +#define p_nbs p9 // not new best_len +#define p_nnc p10 // not nice_length +#define p_ll p11 +#define p_end p12 + +#define MAX_MATCH 258 +#define MIN_MATCH 4 +#define WSIZE 32768 +#define MAX_DIST WSIZE - MAX_MATCH - MIN_MATCH - 1 + +#define R_INPUT 1 +#define R_LOCAL 31 +#define R_OUTPUT 0 +#define R_ROTATING 24 +#define MLAT 3 +#define SHLAT 2 + +#define mova mov +#define movi0 mov +#define cgtu cmp.gt.unc +#define cgeu cmp.ge.unc +#define cneu cmp.ne.unc + + .global longest_match + .proc longest_match + .align 32 +longest_match: +// -- Cycle: 0 + .prologue +{.mmi + alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING + .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \ + shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1] + .rotp lc[MLAT+SHLAT+2] + mova s_vspec_cmatch=in0 // cur_match from input register + add s_tm1=@gprel(strstart),gp // a(a(strstart)) +}{.mmi + add s_tm3=@gprel(prev_length),gp // a(a(prev_length)) + add s_tm5=@ltoff(window),gp // a(a(window)) + add s_tm6=@ltoff(prev),gp // a(a(prev)) + ;; +}{.mmb // Cycle: 1 + ld4 s_vstrstart=[s_tm1] // strstart + ld4 s_vbestlen=[s_tm3] // best_len = prev_length + brp.loop.imp .cmploop,.cmploop+48 +}{.mli + add s_tm2=@gprel(max_chain_length),gp // a(a(max_chain_length)) + movl s_wmask=WSIZE-1 + ;; +}{.mmi // Cycle: 2 + ld8 s_aprev=[s_tm6] // a(prev) + ld8 s_awindow=[s_tm5] // a(window) + .save pr, s_prsave + movi0 s_prsave=pr // save predicates +}{.mmi + add s_tm4=@gprel(good_match),gp // a(a(good_match)) + add s_tm7=@ltoff(nice_match),gp // a(a(nice_match)) + add s_tm8=@ltoff(match_start),gp // a(match_start) + ;; +}{.mmi // Cycle: 3 + ld8 s_anicematch=[s_tm7] // a(nice_match) + ld8 s_amatchstart=[s_tm8] // a(match_start) + .save ar.lc, s_lcsave + movi0 s_lcsave=ar.lc // save loop count register +}{.mmi + .body + add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist + cmp.eq p_ll,p0=r0,r0 // parallel compare initialized as 'true' + mova s_vcurmatch=s_vspec_cmatch + ;; +}{.mmi // Cycle: 4 + ld4 s_vchainlen=[s_tm2] // chain_length=max_chain_length + ld4 s_tm4=[s_tm4] // v(good_match) + add s_ascan=s_awindow,s_vstrstart // scan=window + strstart +}{.mmi + sub s_vlimit=s_vstrstart, s_tm1 // limit=strstart - MAX_DIST + add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match + and s_vspec_cmatch =s_vspec_cmatch,s_wmask + ;; +}{.mmi // Cycle: 5 + add s_amatblen=s_amatch,s_vbestlen // + cneu p_bn2,p0=2,s_vbestlen // set if bestlen != 2 + add s_ascanend=s_ascan,s_vbestlen // compute a(scan) + best_len +}{.mmi + ld1 s_vscan0=[s_ascan],1 // NB: s_ascan++ + ld1 s_vmatch0=[s_amatch],1 + cgtu p0,p_no=s_vlimit,r0 // is result positive ? + ;; +}{.mmi // Cycle: 6 + ld1.nt1 s_vscan1=[s_ascan],2 // NB: s_ascan+3 in total + ld1.nt1 s_vmatch1=[s_amatch],2 + add s_awinbest=s_awindow,s_vbestlen // + ;; +}{.mmi // Cycle: 7 + ld1.nt1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len] + ld1.nt1 s_vmatbst=[s_amatblen],-1 +(p_no) mova s_vlimit=r0 + ;; +}{.mmi // Cycle: 8 +(p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1 // scan_end1=scan[best_len-1] +(p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] + shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev +}{.mmi + cgeu p_shf,p0=s_vbestlen,s_tm4 // is (prev_length >= good_match) ? + ;; +}{.mmi // Cycle: 9 + ld1.nt1 s_vscan3=[s_ascan] + ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] + mova s_vlen=3 +}{.mmi +(p_shf) shr.u s_vchainlen=s_vchainlen,2 // (cur_len) >> 2 + ;; +}{.mmi // Cycle: 10 + ld1.nt1 s_vmatch3=[s_amatch] + // p_ll switched on as soon as we get a mismatch: + cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0 + cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend +}{.mib + cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1 +(p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1 +(p_ll) br.cond.dpnt.many .test_more + ;; +} + +.next_iter: +{.mmi // Cycle 0 + add s_amatch=s_awindow,s_vspec_cmatch // match=window + cur_match + mov s_vcurmatch=s_vspec_cmatch // current value + add s_vchainlen=-1,s_vchainlen // --chain_length +}{.mib + cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit + and s_vspec_cmatch=s_vspec_cmatch,s_wmask +(p_end) br.cond.dptk.many .terminate + ;; +}{.mmi // Cycle 1 + ld1 s_vmatch0=[s_amatch],1 // load match[0] + // compute prev[cur_match]: + shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev + cmp.eq.unc p_end,p0=s_vchainlen,r0 +} {.mib + nop.m 0 + add s_amatblen=s_awinbest,s_vcurmatch // match=window + cur_match +(p_end) br.cond.dptk.many .terminate + ;; +}{.mmi // Cycle 2 (short) + ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match + ;; +}{.mmi // Cycle 3 (short) + ld1.nt1 s_vmatbst=[s_amatblen],-1 // load match[best_len] + cmp.ne.unc p_ll,p0=r0,r0 // parallel compare initialized as 'false' + ;; +}{.mmi // Cycle 4 (short) + // load match[1] - - note: match += 3 (in total): + ld1.nt1 s_vmatch1=[s_amatch],2 + ;; + // Cycle 5 (short) +(p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen] // load match[best_len-1] +}{.mib // Here we (MOST LIKELY) pay a L2-fetch stall + // p_ll switched on as soon as we get a mismatch: + cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0 + cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend +(p_ll) br.cond.dptk.many .next_iter + ;; +}{.mmi // Cycle 6 + ld1.nt1 s_vmatch3=[s_amatch] + mova s_vlen=3 + nop.i 0 +}{.mib + cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1 +(p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1 +(p_ll) br.cond.dptk.many .next_iter + ;; +} + +// We have passed the first hurdle - Are there additional matches ??? + +.test_more: +{.mmi // Cycle 0 + and s_tm3=7,s_ascan // get byte offset + and s_tm4=7,s_amatch // get byte offset + movi0 ar.ec=MLAT+SHLAT+2 // NB: One trip more than usual +}{.mib + cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3 // does not next one differ? +(p_no) br.cond.dptk.many .only3 + ;; +}{.mmi // Cycle 1 + and s_tm1=-8,s_ascan // get aligned address + shladd s_tm3=s_tm3,3,r0 + movi0 ar.lc=31 // 32 times around the loop (8B at a time) +}{.mib + and s_tm2=-8,s_amatch // get aligned address + shladd s_tm4=s_tm4,3,r0 + nop.b 0 + ;; +}{.mmi // Cycle 2 + ld8.nt1 scan[1]=[s_tm1],8 // load first chunk + sub s_tm5=64,s_tm3 // 64 - amount + movi0 pr.rot=1<<16 +}{.mmi + ld8.nt1 match[1]=[s_tm2],8 // load first chunk + sub s_tm6=64,s_tm4 // 64 - amount + add s_vlen=-8,s_vlen // will be updated at least once + ;; +} + .align 32 +.cmploop: +{.mmi // Cycle 0 +(lc[0]) ld8 scan[0]=[s_tm1],8 // next scan chunk +(lc[MLAT+SHLAT+1]) add s_vlen=8,s_vlen +(lc[MLAT]) first shscan0[0]=scan[MLAT+1],s_tm3 +}{.mib +(lc[MLAT+SHLAT+1]) cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if != +(lc[MLAT]) first shmatch0[0]=match[MLAT+1],s_tm4 +(p_no) br.cond.dpnt.many .mismatch + ;; +}{.mii // Cycle 1 +(lc[0]) ld8 match[0]=[s_tm2],8 + // shift left(le) or right(be): +(lc[MLAT]) second shscan1[0]=scan[MLAT],s_tm5 +(lc[MLAT]) second shmatch1[0]=match[MLAT],s_tm6 +}{.mmb +(lc[MLAT+SHLAT]) or s_tm7=shscan0[SHLAT],shscan1[SHLAT] +(lc[MLAT+SHLAT]) or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT] + br.ctop.dptk.many .cmploop + ;; +}{.mfi + mov s_vlen=258 + nop.f 0 +}{.mfi + nop.f 0 // realign + ;; +} +.mismatch: +{.mii // Cycle 0 (short) +(p_no) pcmp1.eq s_tm2=s_tm7,s_tm8 // find first non-matching character + nop.i 0 + ;; + // Cycle 1 (short) +(p_no) count s_tm1=s_tm2 + ;; +}{.mib // Cycle 2 (short) +(p_no) add s_vlen=s_vlen,s_tm1 // effective length + nop.i 0 + clrrrb + ;; +} + +.only3: +{.mib // Cycle 0 (short) + cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen // (len > best_len) ? +(p_nbs) br.cond.dpnt.many .next_iter // if not, re-iternate + ;; +}{.mmi // Cycle 1 (short) + ld4 s_tm7=[s_anicematch] // nice_match + st4 [s_amatchstart]= s_vcurmatch + add s_ascanend=s_ascan,s_vlen // reset with best_len + ;; +}{.mmi // Cycle 2 (short) + mova s_vbestlen=s_vlen + add s_ascanend=-3,s_ascanend // remember extra offset + ;; +}{.mmi // Cycle 3 (short) + ld1 s_vscanend=[s_ascanend],-1 // scan_end=scan[best_len] + add s_awinbest=s_awindow,s_vbestlen // update with new best_len + cmp.ne.unc p_bn2,p0=2,s_vbestlen // set if bestlen != 2 + ;; +}{.mib // Cycle 4 (short) + // scan_end1=scan[best_len-1] NB: s_ascanend reset: + ld1.nt1 s_vscanend1=[s_ascanend],1 + cmp.lt.unc p_nnc,p0=s_vlen,s_tm7 // compare with nice_match +(p_nnc) br.cond.dptk.many .next_iter + ;; +} +.terminate: +{.mii // Cycle 0/1 + nop.m 0 + movi0 ar.lc=s_lcsave + movi0 pr=s_prsave,-1 +}{.mbb + nop.m 0 + nop.b 0 + br.ret.sptk.many rp // ret0 is identical to best_len + ;; +} + .endp + + .global match_init + .proc match_init +match_init: + sub ret0=ret0,ret0 + br.ret.sptk.many rp + .endp + +# else + error: this asm version is for 386 or 680x0 or ia64 only +# endif /* __ia64__ */ +#endif /* mc68000 || mc68020 */ +#endif /* i386 || _I386 */ -- cgit v1.2.1