/* match.s -- optional optimized asm version of longest match in deflate.c

   Copyright (C) 2002, 2006, 2009-2016 Free Software Foundation, Inc.
   Copyright (C) 1992-1993 Jean-loup Gailly

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */

/*
 * The 68020 version has been written by Francesco Potorti`
 * with adaptations by Carsten Steger, Andreas Schwab and
 * Kristoffer Eriksson.
 *
 * The ia64 version has been written by Sverre Jarp (HP Labs) 2001-2002.
 * Unwind directives and some reformatting for better readability added by
 * David Mosberger-Tang.
 */

/* Preprocess with -DNO_UNDERLINE if your C compiler does not prefix
 * external symbols with an underline character '_'.
 */
#ifdef NO_UNDERLINE
#  define _prev             prev
#  define _window           window
#  define _match_start      match_start
#  define _prev_length      prev_length
#  define _good_match       good_match
#  define _nice_match       nice_match
#  define _strstart         strstart
#  define _max_chain_length max_chain_length

#  define _match_init       match_init
#  define _longest_match    longest_match
#endif

#ifdef DYN_ALLOC
  error: DYN_ALLOC not yet supported in match.s
#endif

/* On x86-64, Sun C 5.13 (Oracle Solaris Studio 12.4) 'cc -E -m64'
   defines i386 when compiling .s or .S files!  Luckily it also
   defines __x86_64__.  See Bug#23133.  */
#if ((defined i386 || defined _I386 || defined __i386 || defined __i386__) \
     && !defined __x86_64__)

/* This version is for 386 Unix or OS/2 in 32 bit mode.
 * Warning: it uses the AT&T syntax: mov source,dest
 * This file is only optional. If you want to force the C version,
 * add -DNO_ASM to CFLAGS in Makefile and set OBJA to an empty string.
 * If you have reduced WSIZE in gzip.h, then change its value below.
 * This version assumes static allocation of the arrays (-DDYN_ALLOC not used).
 */

        .file   "match.S"

#define MAX_MATCH       258
#define MAX_MATCH2      $128 /* MAX_MATCH/2-1 */
#define MIN_MATCH       3
#define WSIZE           $32768
#define MAX_DIST        WSIZE - MAX_MATCH - MIN_MATCH - 1

        .globl  _match_init
        .globl  _longest_match

        .text

/* Nothing to initialize in this version: return immediately. */
_match_init:
        ret

/*-----------------------------------------------------------------------
 * Set match_start to the longest match starting at the given string and
 * return its length. Matches shorter or equal to prev_length are discarded,
 * in which case the result is equal to prev_length and match_start is
 * garbage.
 * IN assertions: cur_match is the head of the hash chain for the current
 *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
 *
 * In:    cur_match on the stack (4(%esp) on entry, 20(%esp) after the
 *        four pushes below)
 * Out:   %eax = best_len
 * Saved: %ebp, %edi, %esi, %ebx are pushed on entry and popped on exit
 * Uses:  %ecx, %edx and the flags without saving them
 *
 * NOTE(review): no cld is issued here, so rep cmpsw/cmpsb rely on the
 * caller leaving the direction flag clear -- confirm against the ABI.
 */

_longest_match: /* int longest_match(cur_match) */

#define cur_match   20(%esp)
        /* return address */            /* esp+16 */
        push    %ebp                    /* esp+12 */
        push    %edi                    /* esp+8  */
        push    %esi                    /* esp+4  */
        push    %ebx                    /* esp    */

/*
 *      match        equ esi
 *      scan         equ edi
 *      chain_length equ ebp
 *      best_len     equ ebx
 *      limit        equ edx
 */
        mov     cur_match,%esi
        mov     _max_chain_length,%ebp  /* chain_length = max_chain_length */
        mov     _strstart,%edi
        mov     %edi,%edx
        sub     MAX_DIST,%edx           /* limit = strstart-MAX_DIST */
        jae     limit_ok
        sub     %edx,%edx               /* limit = NIL */
limit_ok:
        add     $2+_window,%edi         /* edi = offset(window+strstart+2) */
        mov     _prev_length,%ebx       /* best_len = prev_length */
        movw    -3(%ebx,%edi),%ax       /* ax = scan[best_len-1..best_len] */
        movw    -2(%edi),%cx            /* cx = scan[0..1] */
        cmp     _good_match,%ebx        /* do we have a good match already? */
        jb      do_scan
        shr     $2,%ebp                 /* chain_length >>= 2 */
        jmp     do_scan

        .align  4
long_loop:
/* at this point, edi == scan+2, esi == cur_match */
        movw    -3(%ebx,%edi),%ax       /* ax = scan[best_len-1..best_len] */
        movw    -2(%edi),%cx            /* cx = scan[0..1] */
short_loop:
/*
 * at this point, di == scan+2, si == cur_match,
 * ax = scan[best_len-1..best_len] and cx = scan[0..1]
 */
        and     WSIZE-1, %esi
        movw    _prev(%esi,%esi),%si    /* cur_match = prev[cur_match] */
                                        /* top word of esi is still 0 */
        cmp     %edx,%esi               /* cur_match <= limit ? */
        jbe     the_end
        dec     %ebp                    /* --chain_length */
        jz      the_end
do_scan:
        cmpw    _window-1(%ebx,%esi),%ax /* check match at best_len-1 */
        jne     short_loop
        cmpw    _window(%esi),%cx       /* check min_match_length match */
        jne     short_loop

        lea     _window+2(%esi),%esi    /* si = match */
        mov     %edi,%eax               /* ax = scan+2 */
        mov     MAX_MATCH2,%ecx         /* scan for at most MAX_MATCH bytes */
        rep;    cmpsw                   /* loop until mismatch */
        je      maxmatch                /* match of length MAX_MATCH? */
mismatch:
        movb    -2(%edi),%cl            /* mismatch on first or second byte? */
        subb    -2(%esi),%cl            /* cl = 0 if first bytes equal */
        xchg    %edi,%eax               /* edi = scan+2, eax = end of scan */
        sub     %edi,%eax               /* eax = len */
        sub     %eax,%esi               /* esi = cur_match + 2 + offset(window) */
        sub     $2+_window,%esi         /* esi = cur_match */
        subb    $1,%cl                  /* set carry if cl == 0 (cannot use DEC) */
        adc     $0,%eax                 /* eax = carry ? len+1 : len */
        cmp     %ebx,%eax               /* len > best_len ? */
        jle     long_loop
        mov     %esi,_match_start       /* match_start = cur_match */
        mov     %eax,%ebx               /* ebx = best_len = len */
        cmp     _nice_match,%eax        /* len >= nice_match ? */
        jl      long_loop
the_end:
        mov     %ebx,%eax               /* result = eax = best_len */
        pop     %ebx
        pop     %esi
        pop     %edi
        pop     %ebp
        ret
maxmatch:
        /* all MAX_MATCH2 words matched: compare one more byte so that the
         * length computation at mismatch: sees the same pointer layout */
        cmpsb
        jmp     mismatch

#else

/* ======================== 680x0 version ================================= */

#if defined(m68k)||defined(mc68k)||defined(__mc68000__)||defined(__MC68000__)
#  ifndef mc68000
#    define mc68000
#  endif
#endif

#if defined(__mc68020__) || defined(__MC68020__) || defined(sysV68)
#  ifndef mc68020
#    define mc68020
#  endif
#endif

#if defined(mc68020) || defined(mc68000)

#if (defined(mc68020) || defined(NeXT)) && !defined(UNALIGNED_OK)
#  define UNALIGNED_OK
#endif

#ifdef sysV68  /* Try Motorola Delta style */

#  define GLOBAL(symbol)        global  symbol
#  define TEXT                  text
#  define FILE(filename)        file    filename
#  define invert_maybe(src,dst) dst,src
#  define imm(data)             &data
#  define reg(register)         %register

#  define addl                  add.l
#  define addql                 addq.l
#  define blos                  blo.b
#  define bhis                  bhi.b
#  define bras                  bra.b
#  define clrl                  clr.l
#  define cmpmb                 cmpm.b
#  define cmpw                  cmp.w
#  define cmpl                  cmp.l
#  define lslw                  lsl.w
#  define lsrl                  lsr.l
#  define movel                 move.l
#  define movew                 move.w
#  define moveb                 move.b
#  define moveml                movem.l
#  define subl                  sub.l
#  define subw                  sub.w
#  define subql                 subq.l

#  define IndBase(bd,An)        (bd,An)
#  define IndBaseNdxl(bd,An,Xn) (bd,An,Xn.l)
#  define IndBaseNdxw(bd,An,Xn) (bd,An,Xn.w)
#  define predec(An)            -(An)
#  define postinc(An)           (An)+

#else /* default style (Sun 3, NeXT, Amiga, Atari) */

#  define GLOBAL(symbol)        .globl  symbol
#  define TEXT                  .text
#  define FILE(filename)        .even
#  define invert_maybe(src,dst) src,dst
#  if defined(sun) || defined(mc68k)
#    define imm(data)           #data
#  else
#    define imm(data)           \#data
#  endif
#  define reg(register)         register

#  define blos                  bcss
#  if defined(sun) || defined(mc68k)
#    define movel               movl
#    define movew               movw
#    define moveb               movb
#  endif
#  define IndBase(bd,An)        An@(bd)
#  define IndBaseNdxl(bd,An,Xn) An@(bd,Xn:l)
#  define IndBaseNdxw(bd,An,Xn) An@(bd,Xn:w)
#  define predec(An)            An@-
#  define postinc(An)           An@+

#endif  /* styles */

#define Best_Len        reg(d0)         /* unsigned */
#define Cur_Match       reg(d1)         /* IPos */
#define Loop_Counter    reg(d2)         /* int */
#define Scan_Start      reg(d3)         /* unsigned short */
#define Scan_End        reg(d4)         /* unsigned short */
#define Limit           reg(d5)         /* IPos */
#define Chain_Length    reg(d6)         /* unsigned */
#define Scan_Test       reg(d7)
#define Scan            reg(a0)         /* *uch */
#define Match           reg(a1)         /* *uch */
#define Prev_Address    reg(a2)         /* *Pos */
#define Scan_Ini        reg(a3)         /* *uch */
#define Match_Ini       reg(a4)         /* *uch */
#define Stack_Pointer   reg(sp)

#define MAX_MATCH       258
#define MIN_MATCH       3
#define WSIZE           32768
#define MAX_DIST        (WSIZE - MAX_MATCH - MIN_MATCH - 1)

        GLOBAL  (_match_init)
        GLOBAL  (_longest_match)

        TEXT

        FILE    ("match.S")

/* Nothing to initialize in this version: return immediately. */
_match_init:
        rts

/*-----------------------------------------------------------------------
 * Set match_start to the longest match starting at the given string and
 * return its length. Matches shorter or equal to prev_length are discarded,
 * in which case the result is equal to prev_length and match_start is
 * garbage.
 * IN assertions: cur_match is the head of the hash chain for the current
 *   string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
 *
 * In:    cur_match at 4(sp) on entry
 * Out:   Best_Len (d0) holds the result at the rts
 * Saved: the registers named by pushreg/popreg below
 * Uses:  d1, a0 and a1 without saving them
 */

/* int longest_match (cur_match) */

/* movem register masks; the predecrement form uses the reversed bit order,
 * hence the separate push and pop values for the same register set. */
#ifdef UNALIGNED_OK
#  define pushreg       15928           /* d2-d6/a2-a4 */
#  define popreg        7292
#else
#  define pushreg       16184           /* d2-d7/a2-a4 */
#  define popreg        7420
#endif

_longest_match:
        movel   IndBase(4,Stack_Pointer),Cur_Match
        moveml  imm(pushreg),predec(Stack_Pointer)
        movel   _max_chain_length,Chain_Length
        movel   _prev_length,Best_Len
        movel   imm(_prev),Prev_Address
        movel   imm(_window+MIN_MATCH),Match_Ini
        movel   _strstart,Limit
        movel   Match_Ini,Scan_Ini
        addl    Limit,Scan_Ini
        subw    imm(MAX_DIST),Limit
        bhis    L__limit_ok
        clrl    Limit
L__limit_ok:
        cmpl    invert_maybe(_good_match,Best_Len)
        blos    L__length_ok
        lsrl    imm(2),Chain_Length
L__length_ok:
        subql   imm(1),Chain_Length
#ifdef UNALIGNED_OK
        movew   IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
        movew   IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
#else
        /* assemble the two 16-bit probes one byte at a time */
        moveb   IndBase(-MIN_MATCH,Scan_Ini),Scan_Start
        lslw    imm(8),Scan_Start
        moveb   IndBase(-MIN_MATCH+1,Scan_Ini),Scan_Start
        moveb   IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
        lslw    imm(8),Scan_End
        moveb   IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
#endif
        bras    L__do_scan

L__long_loop:
        /* Best_Len changed: reload Scan_End for the new length */
#ifdef UNALIGNED_OK
        movew   IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
#else
        moveb   IndBaseNdxw(-MIN_MATCH-1,Scan_Ini,Best_Len),Scan_End
        lslw    imm(8),Scan_End
        moveb   IndBaseNdxw(-MIN_MATCH,Scan_Ini,Best_Len),Scan_End
#endif

L__short_loop:
        lslw    imm(1),Cur_Match
        movew   IndBaseNdxl(0,Prev_Address,Cur_Match),Cur_Match
        cmpw    invert_maybe(Limit,Cur_Match)
        dbls    Chain_Length,L__do_scan
        bras    L__return

L__do_scan:
        movel   Match_Ini,Match
        addl    Cur_Match,Match
#ifdef UNALIGNED_OK
        cmpw    invert_maybe(IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_End)
        bne     L__short_loop
        cmpw    invert_maybe(IndBase(-MIN_MATCH,Match),Scan_Start)
        bne     L__short_loop
#else
        moveb   IndBaseNdxw(-MIN_MATCH-1,Match,Best_Len),Scan_Test
        lslw    imm(8),Scan_Test
        moveb   IndBaseNdxw(-MIN_MATCH,Match,Best_Len),Scan_Test
        cmpw    invert_maybe(Scan_Test,Scan_End)
        bne     L__short_loop
        moveb   IndBase(-MIN_MATCH,Match),Scan_Test
        lslw    imm(8),Scan_Test
        moveb   IndBase(-MIN_MATCH+1,Match),Scan_Test
        cmpw    invert_maybe(Scan_Test,Scan_Start)
        bne     L__short_loop
#endif

        movew   imm((MAX_MATCH-MIN_MATCH+1)-1),Loop_Counter
        movel   Scan_Ini,Scan
L__scan_loop:
        cmpmb   postinc(Match),postinc(Scan)
        dbne    Loop_Counter,L__scan_loop

        subl    Scan_Ini,Scan
        addql   imm(MIN_MATCH-1),Scan
        cmpl    invert_maybe(Best_Len,Scan)
        bls     L__short_loop
        movel   Scan,Best_Len
        movel   Cur_Match,_match_start
        cmpl    invert_maybe(_nice_match,Best_Len)
        blos    L__long_loop
L__return:
        moveml  postinc(Stack_Pointer),imm(popreg)
        rts

#else

# if defined (__ia64__)

/* ======================== ia64 version ================================= */

/*
 * 'longest_match.S' (assembly program for gzip for the IA-64 architecture)
 *
 * Optimised for McKinley, but with Merced-compatibility, such as MIB+MIB,
 * used wherever possible.
 *
 * Copyright: Sverre Jarp (HP Labs) 2001-2002
 *
 * See deflate.c for c-version
 * Version 2 - Optimize the outer loop
 */

/* Provides __BYTE_ORDER and __BIG_ENDIAN for the test below. */
#include <endian.h>

/* The 'first'/'second' shifts and the 'count' instruction realign and
 * inspect 8-byte chunks; their direction depends on the byte order. */
#if __BYTE_ORDER == __BIG_ENDIAN
#define  first  shl
#define  second shr.u
#define  count  czx1.l
#else
#define  first  shr.u
#define  second shl
#define  count  czx1.r
#endif

// 24 rotating register (r32 - r55)

#define s_vmatch0               r32
#define s_vmatch1               r33
#define s_vmatbst               r34
#define s_vmatbst1              r35
#define s_amatblen              r36

#define s_tm1                   r56
#define s_tm2                   r57
#define s_tm3                   r58
#define s_tm4                   r59
#define s_tm5                   r60
#define s_tm6                   r61
#define s_tm7                   r62
#define s_tm8                   r63

#define s_vlen                  r31
#define s_vstrstart             r30
#define s_vchainlen             r29
#define s_awinbest              r28
#define s_vcurmatch             r27
#define s_vlimit                r26
#define s_vscanend              r25
#define s_vscanend1             r24
#define s_anicematch            r23
#define s_vscan0                r22
#define s_vscan1                r21

#define s_aprev                 r20
#define s_awindow               r19
#define s_amatchstart           r18
#define s_ascan                 r17
#define s_amatch                r16
#define s_wmask                 r15
#define s_ascanend              r14

#define s_vspec_cmatch          r11     // next iteration
#define s_lcsave                r10
#define s_prsave                r9
#define s_vbestlen              r8      // return register

#define s_vscan3                r3
#define s_vmatch3               r2

#define p_no                    p2
#define p_yes                   p3
#define p_shf                   p4      //
#define p_bn2                   p5      // Use in loop (indicating bestlen != 2)

#define p_nbs                   p9      // not new best_len
#define p_nnc                   p10     // not nice_length
#define p_ll                    p11
#define p_end                   p12

/* NOTE(review): MIN_MATCH is 3 in the i386 and 680x0 sections of this file;
 * 4 here makes the maxdist computed in cycle 3 one smaller -- confirm that
 * this is intended. */
#define MAX_MATCH               258
#define MIN_MATCH               4
#define WSIZE                   32768
#define MAX_DIST                WSIZE - MAX_MATCH - MIN_MATCH - 1

#define R_INPUT                 1
#define R_LOCAL                 31
#define R_OUTPUT                0
#define R_ROTATING              24
#define MLAT                    3
#define SHLAT                   2

#define mova                    mov
#define movi0                   mov
#define cgtu                    cmp.gt.unc
#define cgeu                    cmp.ge.unc
#define cneu                    cmp.ne.unc

/*
 * longest_match (cur_match)
 *
 * In:    in0 = cur_match
 * Out:   s_vbestlen (r8, the return register) = best_len
 * Saved: pr and ar.lc are copied to s_prsave/s_lcsave in the prologue and
 *        restored at .terminate
 */
        .global longest_match
        .proc longest_match
        .align 32
longest_match:
// --  Cycle: 0
        .prologue
{.mmi
        alloc r2=ar.pfs,R_INPUT,R_LOCAL,R_OUTPUT,R_ROTATING
        .rotr scan[MLAT+2], match[MLAT+2], shscan0[SHLAT+1], \
              shscan1[SHLAT+1], shmatch0[SHLAT+1], shmatch1[SHLAT+1]
        .rotp lc[MLAT+SHLAT+2]
        mova s_vspec_cmatch=in0                 // cur_match from input register
        add s_tm1=@gprel(strstart),gp           // a(a(strstart))
}{.mmi
        add s_tm3=@gprel(prev_length),gp        // a(a(prev_length))
        add s_tm5=@ltoff(window),gp             // a(a(window))
        add s_tm6=@ltoff(prev),gp               // a(a(prev))
        ;;
}{.mmb  //  Cycle: 1
        ld4 s_vstrstart=[s_tm1]                 // strstart
        ld4 s_vbestlen=[s_tm3]                  // best_len = prev_length
        brp.loop.imp .cmploop,.cmploop+48
}{.mli
        add s_tm2=@gprel(max_chain_length),gp   // a(a(max_chain_length))
        movl s_wmask=WSIZE-1
        ;;
}{.mmi  //  Cycle: 2
        ld8 s_aprev=[s_tm6]                     // a(prev)
        ld8 s_awindow=[s_tm5]                   // a(window)
        .save pr, s_prsave
        movi0 s_prsave=pr                       // save predicates
}{.mmi
        add s_tm4=@gprel(good_match),gp         // a(a(good_match))
        add s_tm7=@ltoff(nice_match),gp         // a(a(nice_match))
        add s_tm8=@ltoff(match_start),gp        // a(match_start)
        ;;
}{.mmi  //  Cycle: 3
        ld8 s_anicematch=[s_tm7]                // a(nice_match)
        ld8 s_amatchstart=[s_tm8]               // a(match_start)
        .save ar.lc, s_lcsave
        movi0 s_lcsave=ar.lc                    // save loop count register
}{.mmi
        .body
        add s_tm1=-(MAX_MATCH + MIN_MATCH),s_wmask // maxdist
        cmp.eq p_ll,p0=r0,r0                    // parallel compare initialized as 'true'
        mova s_vcurmatch=s_vspec_cmatch
        ;;
}{.mmi  //  Cycle: 4
        ld4 s_vchainlen=[s_tm2]                 // chain_length=max_chain_length
        ld4 s_tm4=[s_tm4]                       // v(good_match)
        add s_ascan=s_awindow,s_vstrstart       // scan=window + strstart
}{.mmi
        sub s_vlimit=s_vstrstart, s_tm1         // limit=strstart - MAX_DIST
        add s_amatch=s_awindow,s_vspec_cmatch   // match=window + cur_match
        and s_vspec_cmatch =s_vspec_cmatch,s_wmask
        ;;
}{.mmi  //  Cycle: 5
        add s_amatblen=s_amatch,s_vbestlen      //
        cneu p_bn2,p0=2,s_vbestlen              // set if bestlen != 2
        add s_ascanend=s_ascan,s_vbestlen       // compute a(scan) + best_len
}{.mmi
        ld1 s_vscan0=[s_ascan],1                // NB: s_ascan++
        ld1 s_vmatch0=[s_amatch],1
        cgtu p0,p_no=s_vlimit,r0                // is result positive ?
        ;;
}{.mmi  //  Cycle: 6
        ld1.nt1 s_vscan1=[s_ascan],2            // NB: s_ascan+3 in total
        ld1.nt1 s_vmatch1=[s_amatch],2
        add s_awinbest=s_awindow,s_vbestlen     //
        ;;
}{.mmi  //  Cycle: 7
        ld1.nt1 s_vscanend=[s_ascanend],-1      // scan_end=scan[best_len]
        ld1.nt1 s_vmatbst=[s_amatblen],-1
(p_no)  mova s_vlimit=r0
        ;;
}{.mmi  //  Cycle: 8
(p_bn2) ld1.nt1 s_vscanend1=[s_ascanend],1      // scan_end1=scan[best_len-1]
(p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]
        shladd s_vspec_cmatch =s_vspec_cmatch,1,s_aprev
}{.mmi
        cgeu p_shf,p0=s_vbestlen,s_tm4          // is (prev_length >= good_match) ?
        ;;
}{.mmi  //  Cycle: 9
        ld1.nt1 s_vscan3=[s_ascan]
        ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch]
        mova s_vlen=3
}{.mmi
(p_shf) shr.u s_vchainlen=s_vchainlen,2         // (cur_len) >> 2
        ;;
}{.mmi  //  Cycle: 10
        ld1.nt1 s_vmatch3=[s_amatch]
        // p_ll switched on as soon as we get a mismatch:
        cmp.eq.and p_ll,p0=s_vmatch0,s_vscan0
        cmp.eq.and p_ll,p0=s_vmatbst,s_vscanend
}{.mib
        cmp.eq.and p_ll,p0=s_vmatch1,s_vscan1
(p_bn2) cmp.eq.and p_ll,p0=s_vmatbst1,s_vscanend1
(p_ll)  br.cond.dpnt.many .test_more
        ;;
}

.next_iter:
{.mmi   // Cycle 0
        add s_amatch=s_awindow,s_vspec_cmatch   // match=window + cur_match
        mov s_vcurmatch=s_vspec_cmatch          // current value
        add s_vchainlen=-1,s_vchainlen          // --chain_length
}{.mib
        cmp.le.unc p_end,p0=s_vspec_cmatch,s_vlimit
        and s_vspec_cmatch=s_vspec_cmatch,s_wmask
(p_end) br.cond.dptk.many .terminate
        ;;
}{.mmi  // Cycle 1
        ld1 s_vmatch0=[s_amatch],1              // load match[0]
        // compute prev[cur_match]:
        shladd s_vspec_cmatch=s_vspec_cmatch,1,s_aprev
        cmp.eq.unc p_end,p0=s_vchainlen,r0
} {.mib
        nop.m 0
        add s_amatblen=s_awinbest,s_vcurmatch   // match=window + cur_match
(p_end) br.cond.dptk.many .terminate
        ;;
}{.mmi  // Cycle 2 (short)
        ld2.nt1 s_vspec_cmatch=[s_vspec_cmatch] // get next cur_match
        ;;
}{.mmi  // Cycle 3 (short)
        ld1.nt1 s_vmatbst=[s_amatblen],-1       // load match[best_len]
        cmp.ne.unc p_ll,p0=r0,r0                // parallel compare initialized as 'false'
        ;;
}{.mmi  // Cycle 4 (short)
        // load match[1] - - note: match += 3 (in total):
        ld1.nt1 s_vmatch1=[s_amatch],2
        ;;
        // Cycle 5  (short)
(p_bn2) ld1.nt1 s_vmatbst1=[s_amatblen]         // load match[best_len-1]
}{.mib  // Here we (MOST LIKELY) pay a L2-fetch stall
        // p_ll switched on as soon as we get a mismatch:
        cmp.ne.or p_ll,p0=s_vmatch0,s_vscan0
        cmp.ne.or p_ll,p0=s_vmatbst,s_vscanend
(p_ll)  br.cond.dptk.many .next_iter
        ;;
}{.mmi  // Cycle 6
        ld1.nt1 s_vmatch3=[s_amatch]
        mova s_vlen=3
        nop.i 0
}{.mib
        cmp.ne.or p_ll,p0=s_vmatch1,s_vscan1
(p_bn2) cmp.ne.or p_ll,p0=s_vmatbst1,s_vscanend1
(p_ll)  br.cond.dptk.many .next_iter
        ;;
}

// We have passed the first hurdle - Are there additional matches ???

.test_more:
{.mmi   // Cycle 0
        and s_tm3=7,s_ascan                     // get byte offset
        and s_tm4=7,s_amatch                    // get byte offset
        movi0 ar.ec=MLAT+SHLAT+2                // NB: One trip more than usual
}{.mib
        cmp.ne.unc p_no,p0=s_vscan3,s_vmatch3   // does the next byte differ?
(p_no)  br.cond.dptk.many .only3
        ;;
}{.mmi  // Cycle 1
        and s_tm1=-8,s_ascan                    // get aligned address
        shladd s_tm3=s_tm3,3,r0
        movi0 ar.lc=31                          // 32 times around the loop (8B at a time)
}{.mib
        and s_tm2=-8,s_amatch                   // get aligned address
        shladd s_tm4=s_tm4,3,r0
        nop.b 0
        ;;
}{.mmi  // Cycle 2
        ld8.nt1 scan[1]=[s_tm1],8               // load first chunk
        sub s_tm5=64,s_tm3                      // 64 - amount
        movi0 pr.rot=1<<16
}{.mmi
        ld8.nt1 match[1]=[s_tm2],8              // load first chunk
        sub s_tm6=64,s_tm4                      // 64 - amount
        add s_vlen=-8,s_vlen                    // will be updated at least once
        ;;
}
        .align 32
.cmploop:
{.mmi   // Cycle 0
(lc[0])                 ld8 scan[0]=[s_tm1],8   // next scan chunk
(lc[MLAT+SHLAT+1])      add s_vlen=8,s_vlen
(lc[MLAT])              first shscan0[0]=scan[MLAT+1],s_tm3
}{.mib
(lc[MLAT+SHLAT+1])      cmp.ne.unc p_no,p0=s_tm7,s_tm8 // break search if !=
(lc[MLAT])              first shmatch0[0]=match[MLAT+1],s_tm4
(p_no)                  br.cond.dpnt.many .mismatch
        ;;
}{.mii  // Cycle 1
(lc[0])                 ld8 match[0]=[s_tm2],8
        // shift left(le) or right(be):
(lc[MLAT])              second shscan1[0]=scan[MLAT],s_tm5
(lc[MLAT])              second shmatch1[0]=match[MLAT],s_tm6
}{.mmb
(lc[MLAT+SHLAT])        or s_tm7=shscan0[SHLAT],shscan1[SHLAT]
(lc[MLAT+SHLAT])        or s_tm8=shmatch0[SHLAT],shmatch1[SHLAT]
                        br.ctop.dptk.many .cmploop
        ;;
}{.mfi
        mov s_vlen=258
        nop.f 0
}{.mfi
        nop.f 0                                 // realign
        ;;
}
.mismatch:
{.mii   // Cycle 0 (short)
(p_no)  pcmp1.eq s_tm2=s_tm7,s_tm8              // find first non-matching character
        nop.i 0
        ;;
        // Cycle 1 (short)
(p_no)  count s_tm1=s_tm2
        ;;
}{.mib  // Cycle 2 (short)
(p_no)  add s_vlen=s_vlen,s_tm1                 // effective length
        nop.i 0
        clrrrb
        ;;
}

.only3:
{.mib   // Cycle 0 (short)
        cmp.gt.unc p0,p_nbs=s_vlen,s_vbestlen   // (len > best_len) ?
(p_nbs) br.cond.dpnt.many .next_iter            // if not, re-iterate
        ;;
}{.mmi  // Cycle 1 (short)
        ld4 s_tm7=[s_anicematch]                // nice_match
        st4 [s_amatchstart]= s_vcurmatch
        add s_ascanend=s_ascan,s_vlen           // reset with best_len
        ;;
}{.mmi  // Cycle 2 (short)
        mova s_vbestlen=s_vlen
        add s_ascanend=-3,s_ascanend            // remember extra offset
        ;;
}{.mmi  // Cycle 3 (short)
        ld1 s_vscanend=[s_ascanend],-1          // scan_end=scan[best_len]
        add s_awinbest=s_awindow,s_vbestlen     // update with new best_len
        cmp.ne.unc p_bn2,p0=2,s_vbestlen        // set if bestlen != 2
        ;;
}{.mib  // Cycle 4 (short)
        // scan_end1=scan[best_len-1] NB: s_ascanend reset:
        ld1.nt1 s_vscanend1=[s_ascanend],1
        cmp.lt.unc p_nnc,p0=s_vlen,s_tm7        // compare with nice_match
(p_nnc) br.cond.dptk.many .next_iter
        ;;
}
.terminate:
{.mii   // Cycle 0/1
        nop.m 0
        movi0 ar.lc=s_lcsave
        movi0 pr=s_prsave,-1
}{.mbb
        nop.m 0
        nop.b 0
        br.ret.sptk.many rp                     // ret0 is identical to best_len
        ;;
}
        .endp

/* match_init: nothing to initialize; clears ret0 and returns. */
        .global match_init
        .proc match_init
match_init:
        sub ret0=ret0,ret0
        br.ret.sptk.many rp
        .endp

# else
 error: this asm version is for 386 or 680x0 or ia64 only
# endif /* __ia64__ */
#endif /* mc68000 || mc68020 */
#endif /* i386 || _I386  */