diff options
author | Timothy B. Terriberry <tterribe@xiph.org> | 2013-11-18 13:30:13 -0500 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2013-11-18 13:41:17 -0500 |
commit | 39386e0b85ec0f978aa104d312604badb9047d58 (patch) | |
tree | e1171628bb638ec1b770b049e39609d7a268c584 /celt/arm | |
parent | 530198f955e49571b3f890b4da4d933a4cd5df4e (diff) | |
download | opus-39386e0b85ec0f978aa104d312604badb9047d58.tar.gz |
Adds Neon assembly for correlation/convolution
Optimizing celt_pitch_xcorr()/xcorr_kernel() which also speeds up
FIRs, IIRs and auto-correlations
Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
Diffstat (limited to 'celt/arm')
-rw-r--r-- | celt/arm/arm2gnu.pl | 316 | ||||
-rw-r--r-- | celt/arm/arm_celt_map.c | 49 | ||||
-rw-r--r-- | celt/arm/armcpu.c | 14 | ||||
-rw-r--r-- | celt/arm/armcpu.h | 42 | ||||
-rw-r--r-- | celt/arm/armopts.s.in | 37 | ||||
-rw-r--r-- | celt/arm/celt_pitch_xcorr_arm.s | 598 | ||||
-rw-r--r-- | celt/arm/pitch_arm.h | 57 |
7 files changed, 1107 insertions, 6 deletions
diff --git a/celt/arm/arm2gnu.pl b/celt/arm/arm2gnu.pl new file mode 100644 index 00000000..eab42efa --- /dev/null +++ b/celt/arm/arm2gnu.pl @@ -0,0 +1,316 @@ +#!/usr/bin/perl + +my $bigend; # little/big endian +my $nxstack; + +$nxstack = 0; + +eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}' + if $running_under_some_shell; + +while ($ARGV[0] =~ /^-/) { + $_ = shift; + last if /^--/; + if (/^-n/) { + $nflag++; + next; + } + # End the message with a real newline so die() does not append "at ... line N". + die "I don't recognize this switch: $_\n"; +} +$printit++ unless $nflag; + +$\ = "\n"; # automatically add newline on print +$n=0; + +$thumb = 0; # ARM mode by default, not Thumb. +@proc_stack = (); + +LINE: +while (<>) { + + # For ADRLs we need to add a new line after the substituted one. + $addPadding = 0; + + # First, we do not dare to touch *anything* inside double quotes, do we? + # Second, if you want a dollar character in the string, + # insert two of them -- that's how ARM C and assembler treat strings. + s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next }; + s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next }; + s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next }; + # If there's nothing on a line but a comment, don't try to apply any further + # substitutions (this is a cheap hack to avoid mucking up the license header) + s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next }; + # If substituted -- leave immediately ! 
+ + s/@/,:/; + s/;/@/; + while ( /@.*'/ ) { + s/(@.*)'/$1/g; + } + s/\{FALSE\}/0/g; + s/\{TRUE\}/1/g; + s/\{(\w\w\w\w+)\}/$1/g; + s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/; + s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/; + s/\bIMPORT\b/.extern/; + s/\bEXPORT\b/.global/; + s/^(\s+)\[/$1IF/; + s/^(\s+)\|/$1ELSE/; + s/^(\s+)\]/$1ENDIF/; + s/IF *:DEF:/ .ifdef/; + s/IF *:LNOT: *:DEF:/ .ifndef/; + s/ELSE/ .else/; + s/ENDIF/ .endif/; + + if( /\bIF\b/ ) { + s/\bIF\b/ .if/; + s/=/==/; + } + if ( $n == 2) { + s/\$/\\/g; + } + if ($n == 1) { + s/\$//g; + s/label//g; + $n = 2; + } + if ( /MACRO/ ) { + s/MACRO *\n/.macro/; + $n=1; + } + if ( /\bMEND\b/ ) { + s/\bMEND\b/.endm/; + $n=0; + } + + # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there. + # + if ( /\bAREA\b/ ) { + my $align; + $align = "2"; + if ( /ALIGN=(\d+)/ ) { + $align = $1; + } + if ( /CODE/ ) { + $nxstack = 1; + } + s/^(.+)CODE(.+)READONLY(.*)/ .text/; + s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata/; + s/^(.+)\|\|\.data\|\|(.+)/ .data/; + s/^(.+)\|\|\.bss\|\|(.+)/ .bss/; + s/$/; .p2align $align/; + # Enable NEON instructions but don't produce a binary that requires + # ARMv7. RVCT does not have equivalent directives, so we just do this + # for all CODE areas. + if ( /.text/ ) { + # Separating .arch, .fpu, etc., by semicolons does not work (gas + # thinks the semicolon is part of the arch name, even when there's + # whitespace separating them). Sadly this means our line numbers + # won't match the original source file (we could use the .line + # directive, which is documented to be obsolete, but then gdb will + # show the wrong line in the translated source file). 
+ s/$/; .arch armv7-a\n .fpu neon\n .object_arch armv4t/; + } + } + + s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3|| + s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2|| + s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2|| + s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/; + s/^(\s+)\%(\s)/ .space $1/; + + s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123 + s/\bCODE32\b/.code 32/ && do {$thumb = 0}; + s/\bCODE16\b/.code 16/ && do {$thumb = 1}; + if (/\bPROC\b/) + { + my $prefix; + my $proc; + /^([A-Za-z_\.]\w+)\b/; + $proc = $1; + $prefix = ""; + if ($proc) + { + $prefix = $prefix.sprintf("\t.type\t%s, %%function; ",$proc); + push(@proc_stack, $proc); + s/^[A-Za-z_\.]\w+/$&:/; + } + $prefix = $prefix."\t.thumb_func; " if ($thumb); + s/\bPROC\b/@ $&/; + $_ = $prefix.$_; + } + s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/; + s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/; + if (/\bENDP\b/) + { + my $proc; + s/\bENDP\b/@ $&/; + $proc = pop(@proc_stack); + $_ = "\t.size $proc, .-$proc".$_ if ($proc); + } + s/\bSUBT\b/@ $&/; + s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25 + s/\bKEEP\b/@ $&/; + s/\bEXPORTAS\b/@ $&/; + s/\|\|(.)+\bEQU\b/@ $&/; + s/\|\|([\w\$]+)\|\|/$1/; + s/\bENTRY\b/@ $&/; + s/\bASSERT\b/@ $&/; + s/\bGBLL\b/@ $&/; + s/\bGBLA\b/@ $&/; + s/^\W+OPT\b/@ $&/; + s/:OR:/|/g; + s/:SHL:/<</g; + s/:SHR:/>>/g; + s/:AND:/&/g; + s/:LAND:/&&/g; + s/CPSR/cpsr/; + s/SPSR/spsr/; + s/ALIGN$/.balign 4/; + s/ALIGN\s+([0-9x]+)$/.balign $1/; + s/psr_cxsf/psr_all/; + s/LTORG/.ltorg/; + s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/; + s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/; + + # {PC} + 0xdeadfeed --> . + 0xdeadfeed + s/\{PC\} \+/ \. +/; + + # Single hex constant on the line ! + # + # >>> NOTE <<< + # Double-precision floats in gcc are always mixed-endian, which means + # bytes in two words are little-endian, but words are big-endian. 
+ # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address + # and 0xfeed0000 at high address. + # + s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/; + # Only decimal constants on the line, no hex ! + s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/; + + # Single hex constant on the line ! +# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/; + # Only decimal constants on the line, no hex ! +# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/; + s/\bDCFS[ \t]+0x/.word 0x/; + s/\bDCFS\b/.float/; + + s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/; + s/\bDCD\b/.word/; + s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/; + s/\bDCW\b/.short/; + s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/; + s/\bDCB\b/.byte/; + s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/; + s/^[A-Za-z_\.]\w+/$&:/; + s/^(\d+)/$1:/; + s/\%(\d+)/$1b_or_f/; + s/\%[Bb](\d+)/$1b/; + s/\%[Ff](\d+)/$1f/; + s/\%[Ff][Tt](\d+)/$1f/; + s/&([\dA-Fa-f]+)/0x$1/; + if ( /\b2_[01]+\b/ ) { + s/\b2_([01]+)\b/conv$1&&&&/g; + while ( /[01][01][01][01]&&&&/ ) { + s/0000&&&&/&&&&0/g; + s/0001&&&&/&&&&1/g; + s/0010&&&&/&&&&2/g; + s/0011&&&&/&&&&3/g; + s/0100&&&&/&&&&4/g; + s/0101&&&&/&&&&5/g; + s/0110&&&&/&&&&6/g; + s/0111&&&&/&&&&7/g; + s/1000&&&&/&&&&8/g; + s/1001&&&&/&&&&9/g; + s/1010&&&&/&&&&A/g; + s/1011&&&&/&&&&B/g; + s/1100&&&&/&&&&C/g; + s/1101&&&&/&&&&D/g; + s/1110&&&&/&&&&E/g; + s/1111&&&&/&&&&F/g; + } + s/000&&&&/&&&&0/g; + s/001&&&&/&&&&1/g; + s/010&&&&/&&&&2/g; + s/011&&&&/&&&&3/g; + s/100&&&&/&&&&4/g; + s/101&&&&/&&&&5/g; + s/110&&&&/&&&&6/g; + s/111&&&&/&&&&7/g; + s/00&&&&/&&&&0/g; + s/01&&&&/&&&&1/g; + s/10&&&&/&&&&2/g; + s/11&&&&/&&&&3/g; + s/0&&&&/&&&&0/g; + s/1&&&&/&&&&1/g; + s/conv&&&&/0x/g; + } + + if ( /commandline/) + { + if( /-bigend/) + { + $bigend=1; + } + } + + if ( /\bDCDU\b/ ) + { + my $cmd=$_; + my $value; + my $prefix; + my $w1; + my $w2; + my $w3; + my $w4; + + s/\s+DCDU\b/@ $&/; + + # The DCDU operand is hexadecimal, so capture hex digits (not just 0-9); + # otherwise values containing a-f are truncated or not matched at all. + $cmd =~ /\bDCDU\b\s+0x([0-9A-Fa-f]+)/; + $value = $1; + $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/; + $w1 = $1; + $w2 = $2; + $w3 = $3; 
+ $w4 = $4; + + if( $bigend ne "") + { + # big endian + $prefix = "\t.byte\t0x".$w1.";". + "\t.byte\t0x".$w2.";". + "\t.byte\t0x".$w3.";". + "\t.byte\t0x".$w4."; "; + } + else + { + # little endian + $prefix = "\t.byte\t0x".$w4.";". + "\t.byte\t0x".$w3.";". + "\t.byte\t0x".$w2.";". + "\t.byte\t0x".$w1."; "; + } + $_=$prefix.$_; + } + + if ( /\badrl\b/i ) + { + s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i; + $addPadding = 1; + } + s/\bEND\b/@ END/; +} continue { + printf ("%s", $_) if $printit; + if ($addPadding != 0) + { + printf (" mov r0,r0\n"); + $addPadding = 0; + } +} +#If we had a code section, mark that this object doesn't need an executable +# stack. +if ($nxstack) { + printf (" .section\t.note.GNU-stack,\"\",\%\%progbits\n"); +} diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c new file mode 100644 index 00000000..547a84d1 --- /dev/null +++ b/celt/arm/arm_celt_map.c @@ -0,0 +1,49 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pitch.h" + +#if defined(OPUS_HAVE_RTCD) + +# if defined(FIXED_POINT) +opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, + const opus_val16 *, opus_val32 *, int , int) = { + celt_pitch_xcorr_c, /* ARMv4 */ + MAY_HAVE_EDSP(celt_pitch_xcorr), /* EDSP */ + MAY_HAVE_MEDIA(celt_pitch_xcorr), /* Media */ + MAY_HAVE_NEON(celt_pitch_xcorr) /* NEON */ +}; +# else +# error "Floating-point implementation is not supported by ARM asm yet." \ + "Reconfigure with --disable-rtcd or send patches." +# endif + +#endif diff --git a/celt/arm/armcpu.c b/celt/arm/armcpu.c index 5fe16025..17685258 100644 --- a/celt/arm/armcpu.c +++ b/celt/arm/armcpu.c @@ -55,7 +55,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ /* MSVC has no OPUS_INLINE __asm support for ARM, but it does let you __emit * instructions via their assembled hex code. * All of these instructions should be essentially nops. 
*/ -# if defined(ARMv5E_ASM) +# if defined(OPUS_ARM_MAY_HAVE_EDSP) __try{ /*PLD [r13]*/ __emit(0xF5DDF000); @@ -64,7 +64,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ /*Ignore exception.*/ } -# if defined(ARMv6E_ASM) +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) __try{ /*SHADD8 r3,r3,r3*/ __emit(0xE6333F93); @@ -73,7 +73,7 @@ static OPUS_INLINE opus_uint32 opus_cpu_capabilities(void){ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){ /*Ignore exception.*/ } -# if defined(ARM_HAVE_NEON) +# if defined(OPUS_ARM_MAY_HAVE_NEON) __try{ /*VORR q0,q0,q0*/ __emit(0xF2200150); @@ -107,19 +107,26 @@ opus_uint32 opus_cpu_capabilities(void) while(fgets(buf, 512, cpuinfo) != NULL) { +# if defined(OPUS_ARM_MAY_HAVE_EDSP) || defined(OPUS_ARM_MAY_HAVE_NEON) /* Search for edsp and neon flag */ if(memcmp(buf, "Features", 8) == 0) { char *p; +# if defined(OPUS_ARM_MAY_HAVE_EDSP) p = strstr(buf, " edsp"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_EDSP; +# endif +# if defined(OPUS_ARM_MAY_HAVE_NEON) p = strstr(buf, " neon"); if(p != NULL && (p[5] == ' ' || p[5] == '\n')) flags |= OPUS_CPU_ARM_NEON; +# endif } +# endif +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) /* Search for media capabilities (>= ARMv6) */ if(memcmp(buf, "CPU architecture:", 17) == 0) { @@ -129,6 +136,7 @@ opus_uint32 opus_cpu_capabilities(void) if(version >= 6) flags |= OPUS_CPU_ARM_MEDIA; } +# endif } fclose(cpuinfo); diff --git a/celt/arm/armcpu.h b/celt/arm/armcpu.h index 68d80fe2..ac574460 100644 --- a/celt/arm/armcpu.h +++ b/celt/arm/armcpu.h @@ -25,11 +25,47 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -/* Original code from libtheora modified to suit to Opus */ +#if !defined(ARMCPU_H) +# define ARMCPU_H -#ifndef ARMCPU_H -#define ARMCPU_H +# if defined(OPUS_ARM_MAY_HAVE_EDSP) +# define MAY_HAVE_EDSP(name) name ## _edsp +# else +# define MAY_HAVE_EDSP(name) name ## _c +# endif +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) +# define MAY_HAVE_MEDIA(name) name ## _media +# else +# define MAY_HAVE_MEDIA(name) MAY_HAVE_EDSP(name) +# endif + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +# define MAY_HAVE_NEON(name) name ## _neon +# else +# define MAY_HAVE_NEON(name) MAY_HAVE_MEDIA(name) +# endif + +# if defined(OPUS_ARM_PRESUME_EDSP) +# define PRESUME_EDSP(name) name ## _edsp +# else +# define PRESUME_EDSP(name) name ## _c +# endif + +# if defined(OPUS_ARM_PRESUME_MEDIA) +# define PRESUME_MEDIA(name) name ## _media +# else +# define PRESUME_MEDIA(name) PRESUME_EDSP(name) +# endif + +# if defined(OPUS_ARM_PRESUME_NEON) +# define PRESUME_NEON(name) name ## _neon +# else +# define PRESUME_NEON(name) PRESUME_MEDIA(name) +# endif + +# if defined(OPUS_HAVE_RTCD) int opus_select_arch(void); +# endif #endif diff --git a/celt/arm/armopts.s.in b/celt/arm/armopts.s.in new file mode 100644 index 00000000..3d8aaf27 --- /dev/null +++ b/celt/arm/armopts.s.in @@ -0,0 +1,37 @@ +/* Copyright (C) 2013 Mozilla Corporation */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +; Set the following to 1 if we have EDSP instructions +; (LDRD/STRD, etc., ARMv5E and later). +OPUS_ARM_MAY_HAVE_EDSP * @OPUS_ARM_MAY_HAVE_EDSP@ + +; Set the following to 1 if we have ARMv6 media instructions. +OPUS_ARM_MAY_HAVE_MEDIA * @OPUS_ARM_MAY_HAVE_MEDIA@ + +; Set the following to 1 if we have NEON (some ARMv7) +OPUS_ARM_MAY_HAVE_NEON * @OPUS_ARM_MAY_HAVE_NEON@ + +END diff --git a/celt/arm/celt_pitch_xcorr_arm.s b/celt/arm/celt_pitch_xcorr_arm.s new file mode 100644 index 00000000..2db681d2 --- /dev/null +++ b/celt/arm/celt_pitch_xcorr_arm.s @@ -0,0 +1,598 @@ +; Copyright (c) 2007-2008 CSIRO +; Copyright (c) 2007-2009 Xiph.Org Foundation +; Copyright (c) 2013 Parrot +; Written by Aurélien Zanelli +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. 
+; +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + AREA |.text|, CODE, READONLY + + GET celt/arm/armopts.s + +IF OPUS_ARM_MAY_HAVE_EDSP + EXPORT celt_pitch_xcorr_edsp +ENDIF + +IF OPUS_ARM_MAY_HAVE_NEON + EXPORT celt_pitch_xcorr_neon +ENDIF + +IF OPUS_ARM_MAY_HAVE_NEON + +;; Compute sum[k]=sum(x[j]*y[j+k],j=0...len-1), k=0...3 +;xcorr_kernel_neon PROC +; ; input: +; ; r3 = int len +; ; r4 = opus_val16 *x +; ; r5 = opus_val16 *y +; ; q0 = opus_val32 sum[4] +; ; output: +; ; q0 = opus_val32 sum[4] +; ; preserved: r0-r3, r6-r11, d2, q4-q7, q9-q15 +; ; internal usage: +; ; r12 = int j +; ; d3 = y_3|y_2|y_1|y_0 +; ; q2 = y_B|y_A|y_9|y_8|y_7|y_6|y_5|y_4 +; ; q3 = x_7|x_6|x_5|x_4|x_3|x_2|x_1|x_0 +; ; q8 = scratch +; ; +; ; Load y[0...3] +; ; This requires len>0 to always be valid (which we assert in the C code). +; VLD1.16 {d5}, [r5]! +; SUBS r12, r3, #8 +; BLE xcorr_kernel_neon_process4 +;; Process 8 samples at a time. +;; This loop loads one y value more than we actually need. 
Therefore we have to +;; stop as soon as there are 8 or fewer samples left (instead of 7), to avoid +;; reading past the end of the array. +;xcorr_kernel_neon_process8 +; ; This loop has 19 total instructions (10 cycles to issue, minimum), with +; ; - 2 cycles of ARM instructions, +; ; - 10 cycles of load/store/byte permute instructions, and +; ; - 9 cycles of data processing instructions. +; ; On a Cortex A8, we dual-issue the maximum amount (9 cycles) between the +; ; latter two categories, meaning the whole loop should run in 10 cycles per +; ; iteration, barring cache misses. +; ; +; ; Load x[0...7] +; VLD1.16 {d6, d7}, [r4]! +; ; Unlike VMOV, VAND is a data processing instruction (and doesn't get +; ; assembled to VMOV, like VORR would), so it dual-issues with the prior VLD1. +; VAND d3, d5, d5 +; SUBS r12, r12, #8 +; ; Load y[4...11] +; VLD1.16 {d4, d5}, [r5]! +; VMLAL.S16 q0, d3, d6[0] +; VEXT.16 d16, d3, d4, #1 +; VMLAL.S16 q0, d4, d7[0] +; VEXT.16 d17, d4, d5, #1 +; VMLAL.S16 q0, d16, d6[1] +; VEXT.16 d16, d3, d4, #2 +; VMLAL.S16 q0, d17, d7[1] +; VEXT.16 d17, d4, d5, #2 +; VMLAL.S16 q0, d16, d6[2] +; VEXT.16 d16, d3, d4, #3 +; VMLAL.S16 q0, d17, d7[2] +; VEXT.16 d17, d4, d5, #3 +; VMLAL.S16 q0, d16, d6[3] +; VMLAL.S16 q0, d17, d7[3] +; BGT xcorr_kernel_neon_process8 +;; Process 4 samples here if we have > 4 left (still reading one extra y value). +;xcorr_kernel_neon_process4 +; ADDS r12, r12, #4 +; BLE xcorr_kernel_neon_process2 +; ; Load x[0...3] +; VLD1.16 d6, [r4]! +; ; Use VAND since it's a data processing instruction again. +; VAND d4, d5, d5 +; SUB r12, r12, #4 +; ; Load y[4...7] +; VLD1.16 d5, [r5]! +; VMLAL.S16 q0, d4, d6[0] +; VEXT.16 d16, d4, d5, #1 +; VMLAL.S16 q0, d16, d6[1] +; VEXT.16 d16, d4, d5, #2 +; VMLAL.S16 q0, d16, d6[2] +; VEXT.16 d16, d4, d5, #3 +; VMLAL.S16 q0, d16, d6[3] +;; Process 2 samples here if we have > 2 left (still reading one extra y value). 
+;xcorr_kernel_neon_process2 +; ADDS r12, r12, #2 +; BLE xcorr_kernel_neon_process1 +; ; Load x[0...1] +; VLD2.16 {d6[],d7[]}, [r4]! +; ; Use VAND since it's a data processing instruction again. +; VAND d4, d5, d5 +; SUB r12, r12, #2 +; ; Load y[4...5] +; VLD1.32 {d5[]}, [r5]! +; VMLAL.S16 q0, d4, d6 +; VEXT.16 d16, d4, d5, #1 +; ; Replace bottom copy of {y5,y4} in d5 with {y3,y2} from d4, using VSRI +; ; instead of VEXT, since it's a data-processing instruction. +; VSRI.64 d5, d4, #32 +; VMLAL.S16 q0, d16, d7 +;; Process 1 sample using the extra y value we loaded above. +;xcorr_kernel_neon_process1 +; ; Load next *x +; VLD1.16 {d6[]}, [r4]! +; ADDS r12, r12, #1 +; ; y[0...3] are left in d5 from prior iteration(s) (if any) +; VMLAL.S16 q0, d5, d6 +; MOVLE pc, lr +;; Now process 1 last sample, not reading ahead. +; ; Load last *y +; VLD1.16 {d4[]}, [r5]! +; VSRI.64 d4, d5, #16 +; ; Load last *x +; VLD1.16 {d6[]}, [r4]! +; VMLAL.S16 q0, d4, d6 +; MOV pc, lr +; ENDP + +;; opus_val32 celt_pitch_xcorr_neon(opus_val16 *_x, opus_val16 *_y, +;; opus_val32 *xcorr, int len, int max_pitch) +;celt_pitch_xcorr_neon PROC +; ; input: +; ; r0 = opus_val16 *_x +; ; r1 = opus_val16 *_y +; ; r2 = opus_val32 *xcorr +; ; r3 = int len +; ; output: +; ; r0 = int maxcorr +; ; internal usage: +; ; r4 = opus_val16 *x (for xcorr_kernel_neon()) +; ; r5 = opus_val16 *y (for xcorr_kernel_neon()) +; ; r6 = int max_pitch +; ; r12 = int j +; ; q15 = int maxcorr[4] (q15 is not used by xcorr_kernel_neon()) +; STMFD sp!, {r4-r6, lr} +; LDR r6, [sp, #16] +; VMOV.S32 q15, #1 +; ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done +; SUBS r6, r6, #4 +; BLT celt_pitch_xcorr_neon_process4_done +;celt_pitch_xcorr_neon_process4 +; ; xcorr_kernel_neon parameters: +; ; r3 = len, r4 = _x, r5 = _y, q0 = {0, 0, 0, 0} +; MOV r4, r0 +; MOV r5, r1 +; VEOR q0, q0, q0 +; ; xcorr_kernel_neon only modifies r4, r5, r12, and q0...q3. +; ; So we don't save/restore any other registers. 
+; BL xcorr_kernel_neon +; SUBS r6, r6, #4 +; VST1.32 {q0}, [r2]! +; ; _y += 4 +; ADD r1, r1, #8 +; VMAX.S32 q15, q15, q0 +; ; if (max_pitch < 4) goto celt_pitch_xcorr_neon_process4_done +; BGE celt_pitch_xcorr_neon_process4 +;; We have less than 4 sums left to compute. +;celt_pitch_xcorr_neon_process4_done +; ADDS r6, r6, #4 +; ; Reduce maxcorr to a single value +; VMAX.S32 d30, d30, d31 +; VPMAX.S32 d30, d30, d30 +; ; if (max_pitch <= 0) goto celt_pitch_xcorr_neon_done +; BLE celt_pitch_xcorr_neon_done +;; Now compute each remaining sum one at a time. +;celt_pitch_xcorr_neon_process_remaining +; MOV r4, r0 +; MOV r5, r1 +; VMOV.I32 q0, #0 +; SUBS r12, r3, #8 +; BLT celt_pitch_xcorr_neon_process_remaining4 +;; Sum terms 8 at a time. +;celt_pitch_xcorr_neon_process_remaining_loop8 +; ; Load x[0...7] +; VLD1.16 {q1}, [r4]! +; ; Load y[0...7] +; VLD1.16 {q2}, [r5]! +; SUBS r12, r12, #8 +; VMLAL.S16 q0, d4, d2 +; VMLAL.S16 q0, d5, d3 +; BGE celt_pitch_xcorr_neon_process_remaining_loop8 +;; Sum terms 4 at a time. +;celt_pitch_xcorr_neon_process_remaining4 +; ADDS r12, r12, #4 +; BLT celt_pitch_xcorr_neon_process_remaining4_done +; ; Load x[0...3] +; VLD1.16 {d2}, [r4]! +; ; Load y[0...3] +; VLD1.16 {d3}, [r5]! +; SUB r12, r12, #4 +; VMLAL.S16 q0, d3, d2 +; ; Reduce the sum to a single value. +; VADD.S32 d0, d0, d1 +; VPADDL.S32 d0, d0 +;celt_pitch_xcorr_neon_process_remaining4_done +; ADDS r12, r12, #4 +; BLE celt_pitch_xcorr_neon_process_remaining_loop_done +;; Sum terms 1 at a time. +;celt_pitch_xcorr_neon_process_remaining_loop1 +; VLD1.16 {d2[]}, [r4]! +; VLD1.16 {d3[]}, [r5]! +; SUBS r12, r12, #1 +; VMLAL.S16 q0, d2, d3 +; BGT celt_pitch_xcorr_neon_process_remaining_loop1 +;celt_pitch_xcorr_neon_process_remaining_loop_done +; VST1.32 {d0[0]}, [r2]! 
+; VMAX.S32 d30, d30, d0 +; SUBS r6, r6, #1 +; ; _y++ +; ADD r1, r1, #2 +; ; if (--max_pitch > 0) goto celt_pitch_xcorr_neon_process_remaining +; BGT celt_pitch_xcorr_neon_process_remaining +;celt_pitch_xcorr_neon_done +; VMOV.32 r0, d30[0] +; LDMFD sp!, {r4-r6, pc} +; ENDP + +xcorr_kernel_neon PROC + ; input: + ; r0 = opus_val16 *x + ; r1 = opus_val16 *y + ; r2 = int len + ; q0 = opus_val32 sum (sum[3] | sum[2] | sum[1] | sum[0]) + + ; output: + ; q0 = sum + + ; internal usage: + ; r3 = j + ; d2 = x_3|x_2|x_1|x_0 d3 = y_3|y_2|y_1|y_0 + ; d4 = y_7|y_6|y_5|y_4 d5 = y_4|y_3|y_2|y_1 + ; d6 = y_5|y_4|y_3|y_2 d7 = y_6|y_5|y_4|y_3 + ; We will build d5, d6 and d7 vector from d3 and d4 + + + VLD1.16 {d3}, [r1]! ; Load y[3] downto y[0] to d3 lane (yy0) + SUB r3, r2, #1 + MOVS r3, r3, lsr #2 ; j=(len-1)>>2 + BEQ xcorr_kernel_neon_process4_done + + ; Process 4 x samples at a time + ; For this, we will need 4 y vectors +xcorr_kernel_neon_process4 + SUBS r3, r3, #1 ; j-- + VLD1.16 d4, [r1]! ; Load y[7] downto y[4] to d4 lane + VLD1.16 d2, [r0]! ; Load x[3] downto x[0] to d2 lane + VEXT.16 d5, d3, d4, #1 ; Build y[4] downto y[1] vector (yy1) + VEXT.16 d6, d3, d4, #2 ; Build y[5] downto y[2] vector (yy2) + VEXT.16 d7, d3, d4, #3 ; Build y[6] downto y[3] vector (yy3) + + VMLAL.S16 q0, d3, d2[0] ; MAC16_16(sum, x[0], yy0) + VMLAL.S16 q0, d5, d2[1] ; MAC16_16(sum, x[1], yy1) + VMLAL.S16 q0, d6, d2[2] ; MAC16_16(sum, x[2], yy2) + VMLAL.S16 q0, d7, d2[3] ; MAC16_16(sum, x[3], yy3) + + VMOV.S16 d3, d4 ; Next y vector should be in d3 (yy0) + + BNE xcorr_kernel_neon_process4 + +xcorr_kernel_neon_process4_done + ;Process len-1 to len + VLD1.16 {d2[]}, [r0]! ; Load *x and duplicate to d2 lane + + SUB r3, r2, #1 + ANDS r3, r3, #3 ; j=(len-1)&3 + VMLAL.S16 q0, d3, d2 ; MAC16_16(sum, *x, yy0) + BEQ xcorr_kernel_neon_done + +xcorr_kernel_neon_process_remaining + SUBS r3, r3, #1 ; j-- + VLD1.16 {d4[]}, [r1]! ; Load y value and duplicate to d4 lane + VLD1.16 {d2[]}, [r0]! 
; Load *x and duplicate to d2 lane + VEXT.16 d3, d3, d4, #1 ; Build y vector from previous and d4 + VMLAL.S16 q0, d3, d2 ; MAC16_16(sum, *x, yy0) + BNE xcorr_kernel_neon_process_remaining + +xcorr_kernel_neon_done + MOV pc, lr + ENDP + +celt_pitch_xcorr_neon PROC + ; input: + ; r0 = opus_val16 *_x + ; r1 = opus_val16 *_y + ; r2 = opus_val32 *xcorr + ; r3 = int len + + ; output: + ; r0 = maxcorr + + STMFD sp!, {r4-r9, lr} + + LDR r4, [sp, #28] ; r4 = int max_pitch + MOV r5, r0 ; r5 = _x + MOV r6, r1 ; r6 = _y + MOV r7, r2 ; r7 = xcorr + MOV r2, r3 ; r2 = len + + VMOV.S32 d16, #1 ; d16 = {1, 1} (not used by xcorr_kernel_neon) + MOV r8, #0 ; r8 = i = 0 + CMP r4, #3 ; max_pitch-3 <= 0 ---> pitch_xcorr_neon_process4_done + BLE celt_pitch_xcorr_neon_process4_done + + SUB r9, r4, #3 ; r9 = max_pitch-3 + +celt_pitch_xcorr_neon_process4 + MOV r0, r5 ; r0 = _x + ADD r1, r6 ,r8, LSL #1 ; r1 = _y + i + VMOV.I32 q0, #0 ; q0 = opus_val32 sum[4] = {0, 0, 0, 0} + + ; xcorr_kernel_neon only reads r2 (len) and never writes it, + ; so we don't need to save/restore it around the call + BL xcorr_kernel_neon ; xcorr_kernel_neon(_x, _y+i, sum, len) + + VST1.32 {q0}, [r7]! ; Store sum to xcorr + VPMAX.S32 d0, d0, d1 ; d0 = max(sum[3], sum[2]) | max(sum[1], sum[0]) + ADD r8, r8, #4 ; i+=4 + VPMAX.S32 d0, d0, d0 ; d0 = max(sum[3], sum[2], sum[1], sum[0]) + CMP r8, r9 ; i < max_pitch-3 ----> pitch_xcorr_neon_process4 + VMAX.S32 d16, d16, d0 ; d16 = maxcorr = max(maxcorr, sum) + + BLT celt_pitch_xcorr_neon_process4 + +celt_pitch_xcorr_neon_process4_done + CMP r8, r4; + BGE celt_pitch_xcorr_neon_done + +celt_pitch_xcorr_neon_process_remaining + MOV r0, r5 ; r0 = _x + ADD r1, r6, r8, LSL #1 ; r1 = _y + i + VMOV.I32 q0, #0 + MOVS r3, r2, LSR #2 ; r3 = j = len>>2 (number of 4-sample groups) + BEQ inner_loop_neon_process4_done + +inner_loop_neon_process4 + VLD1.16 {d2}, [r0]! ; Load x + VLD1.16 {d3}, [r1]! 
; Load y + SUBS r3, r3, #1 + VMLAL.S16 q0, d2, d3 + BNE inner_loop_neon_process4 + + VPADD.S32 d0, d0, d1 ; Reduce sum + VPADD.S32 d0, d0, d0 + +inner_loop_neon_process4_done + ANDS r3, r2, #3 + BEQ inner_loop_neon_done + +inner_loop_neon_process_remaining + VLD1.16 {d2[]}, [r0]! + VLD1.16 {d3[]}, [r1]! + SUBS r3, r3, #1 + VMLAL.S16 q0, d2, d3 + BNE inner_loop_neon_process_remaining + +inner_loop_neon_done + VST1.32 {d0[0]}, [r7]! + VMAX.S32 d16, d16, d0 + + ADD r8, r8, #1 + CMP r8, r4 + BCC celt_pitch_xcorr_neon_process_remaining + +celt_pitch_xcorr_neon_done + VMOV d0, d16 + VMOV.32 r0, d0[0] + LDMFD sp!, {r4-r9, pc} + ENDP + + +ENDIF + +IF OPUS_ARM_MAY_HAVE_EDSP + +; This will get used on ARMv7 devices without NEON, so it has been optimized +; to take advantage of dual-issuing where possible. +xcorr_kernel_edsp PROC + ; input: + ; r3 = int len + ; r4 = opus_val16 *_x + ; r5 = opus_val16 *_y + ; r6...r9 = opus_val32 sum[4] + ; output: + ; r6...r9 = opus_val32 sum[4] + ; preserved: r0-r5 + ; internal usage + ; r2 = int j + ; r12,r14 = opus_val16 x[4] + ; r10,r11 = opus_val16 y[4] + STMFD sp!, {r2,r4,r5,lr} + SUBS r2, r3, #4 ; j = len-4 + LDRD r10, r11, [r5], #8 ; Load y[0...3] + BLE xcorr_kernel_edsp_process4_done + LDR r12, [r4], #4 ; Load x[0...1] + ; Stall +xcorr_kernel_edsp_process4 + ; The multiplies must issue from pipeline 0, and can't dual-issue with each + ; other. Every other instruction here dual-issues with a multiply, and is + ; thus "free". There should be no stalls in the body of the loop. 
+ SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_0,y_0) + LDR r14, [r4], #4 ; Load x[2...3] + SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x_0,y_1) + SUBS r2, r2, #4 ; j-=4 + SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_0,y_2) + SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x_0,y_3) + SMLATT r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x_1,y_1) + LDR r10, [r5], #4 ; Load y[4...5] + SMLATB r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],x_1,y_2) + SMLATT r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x_1,y_3) + SMLATB r9, r12, r10, r9 ; sum[3] = MAC16_16(sum[3],x_1,y_4) + LDRGT r12, [r4], #4 ; Load x[0...1] + SMLABB r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_2,y_2) + SMLABT r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x_2,y_3) + SMLABB r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_2,y_4) + SMLABT r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x_2,y_5) + SMLATT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],x_3,y_3) + LDR r11, [r5], #4 ; Load y[6...7] + SMLATB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],x_3,y_4) + SMLATT r8, r14, r10, r8 ; sum[2] = MAC16_16(sum[2],x_3,y_5) + SMLATB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],x_3,y_6) + BGT xcorr_kernel_edsp_process4 +xcorr_kernel_edsp_process4_done + ADDS r2, r2, #4 + BLE xcorr_kernel_edsp_done + LDRH r12, [r4], #2 ; r12 = *x++ + SUBS r2, r2, #1 ; j-- + ; Stall + SMLABB r6, r12, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_0) + LDRGTH r14, [r4], #2 ; r14 = *x++ + SMLABT r7, r12, r10, r7 ; sum[1] = MAC16_16(sum[1],x,y_1) + SMLABB r8, r12, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_2) + SMLABT r9, r12, r11, r9 ; sum[3] = MAC16_16(sum[3],x,y_3) + BLE xcorr_kernel_edsp_done + SMLABT r6, r14, r10, r6 ; sum[0] = MAC16_16(sum[0],x,y_1) + SUBS r2, r2, #1 ; j-- + SMLABB r7, r14, r11, r7 ; sum[1] = MAC16_16(sum[1],x,y_2) + LDRH r10, [r5], #2 ; r10 = y_4 = *y++ + SMLABT r8, r14, r11, r8 ; sum[2] = MAC16_16(sum[2],x,y_3) + LDRGTH r12, [r4], #2 ; r12 = *x++ + SMLABB r9, r14, r10, r9 ; sum[3] = MAC16_16(sum[3],x,y_4) + BLE 
xcorr_kernel_edsp_done + SMLABB r6, r12, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_2) + CMP r2, #1 ; j-- + SMLABT r7, r12, r11, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_3) + LDRH r2, [r5], #2 ; r2 = y_5 = *y++ + SMLABB r8, r12, r10, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_4) + LDRGTH r14, [r4] ; r14 = *x + SMLABB r9, r12, r2, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_5) + BLE xcorr_kernel_edsp_done + SMLABT r6, r14, r11, r6 ; sum[0] = MAC16_16(sum[0],tmp,y_3) + LDRH r11, [r5] ; r11 = y_6 = *y + SMLABB r7, r14, r10, r7 ; sum[1] = MAC16_16(sum[1],tmp,y_4) + SMLABB r8, r14, r2, r8 ; sum[2] = MAC16_16(sum[2],tmp,y_5) + SMLABB r9, r14, r11, r9 ; sum[3] = MAC16_16(sum[3],tmp,y_6) +xcorr_kernel_edsp_done + LDMFD sp!, {r2,r4,r5,pc} + ENDP + +celt_pitch_xcorr_edsp PROC + ; Computes xcorr[i] = sum over j < len of x[j]*y[i+j] for each lag i < max_pitch, + ; four lags at a time via xcorr_kernel_edsp, then any leftover lags one at a time. + ; input: + ; r0 = opus_val16 *_x + ; r1 = opus_val16 *_y + ; r2 = opus_val32 *xcorr + ; r3 = int len + ; [sp] = int max_pitch (fifth argument, read from the stack below) + ; output: + ; r0 = maxcorr + ; internal usage + ; r4 = opus_val16 *x + ; r5 = opus_val16 *y + ; r6 = opus_val32 sum0 + ; r7 = opus_val32 sum1 + ; r8 = opus_val32 sum2 + ; r9 = opus_val32 sum3 + ; r1 = int max_pitch + ; r12 = int j + STMFD sp!, {r4-r11, lr} + MOV r5, r1 ; y = _y + LDR r1, [sp, #36] ; r1 = max_pitch: the stack argument now sits above the 9 registers (36 bytes) just pushed + MOV r4, r0 ; x = _x + ; maxcorr = 1 + MOV r0, #1 + ; if (max_pitch < 4) goto celt_pitch_xcorr_edsp_process4_done + SUBS r1, r1, #4 + BLT celt_pitch_xcorr_edsp_process4_done +celt_pitch_xcorr_edsp_process4 + ; xcorr_kernel_edsp parameters: + ; r3 = len, r4 = _x, r5 = _y, r6...r9 = sum[4] = {0, 0, 0, 0} + MOV r6, #0 + MOV r7, #0 + MOV r8, #0 + MOV r9, #0 + BL xcorr_kernel_edsp ; xcorr_kernel_edsp(_x, _y+i, xcorr+i, len) + ; maxcorr = max(maxcorr, sum0, sum1, sum2, sum3) + CMP r0, r6 + ; _y+=4 + ADD r5, r5, #8 + MOVLT r0, r6 + CMP r0, r7 + STRD r6, r7, [r2], #8 ; xcorr[i...i+1] = sum0, sum1 + MOVLT r0, r7 + CMP r0, r8 + STRD r8, r9, [r2], #8 ; xcorr[i+2...i+3] = sum2, sum3 + MOVLT r0, r8 + CMP r0, r9 + MOVLT r0, r9 + SUBS r1, r1, #4 ; max_pitch-=4 + BGE celt_pitch_xcorr_edsp_process4 +celt_pitch_xcorr_edsp_process4_done + ADDS r1, r1, #4 ; undo the bias of 4: r1 = number of lags still to compute + BLE celt_pitch_xcorr_edsp_done +; Now compute each remaining sum one at a time.
+celt_pitch_xcorr_edsp_process_remaining + SUBS r12, r3, #4 ; j = len-4 + ; r14 = sum = 0 + MOV r14, #0 + BLT celt_pitch_xcorr_edsp_process_remaining_loop_done + LDRD r6, r7, [r4], #8 + LDRD r8, r9, [r5], #8 ; NOTE(review): _y moves by 2 bytes per pass (ADD r5, r5, #2 below), so r5 is not 32-bit aligned on every pass for this LDRD and the LDRs that follow - confirm the target tolerates that + ; Stall +celt_pitch_xcorr_edsp_process_remaining_loop4 + SMLABB r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) + SUBS r12, r12, #4 ; j-=4 + SMLATT r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) + LDRGE r6, [r4], #4 + SMLABB r14, r7, r9, r14 ; sum = MAC16_16(sum, x_2, y_2) + LDRGE r8, [r5], #4 + SMLATT r14, r7, r9, r14 ; sum = MAC16_16(sum, x_3, y_3) + LDRGE r7, [r4], #4 + LDRGE r9, [r5], #4 + BGE celt_pitch_xcorr_edsp_process_remaining_loop4 +celt_pitch_xcorr_edsp_process_remaining_loop_done + ADDS r12, r12, #2 ; j+=2: GE iff at least 2 samples are left + LDRGE r6, [r4], #4 + LDRGE r8, [r5], #4 + ; Stall + SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_0, y_0) + SUBGE r12, r12, #2 + SMLATTGE r14, r6, r8, r14 ; sum = MAC16_16(sum, x_1, y_1) + ADDS r12, r12, #1 ; j+=1: GE iff one last sample is left + LDRGEH r6, [r4], #2 + LDRGEH r8, [r5], #2 + ; Restore _x + SUB r4, r4, r3, LSL #1 + ; Stall + SMLABBGE r14, r6, r8, r14 ; sum = MAC16_16(sum, *x, *y) + ; Restore and advance _y + SUB r5, r5, r3, LSL #1 + ; maxcorr = max(maxcorr, sum) + ; Stall + CMP r0, r14 + ADD r5, r5, #2 ; _y++ (the advance) + MOVLT r0, r14 + SUBS r1, r1, #1 ; max_pitch-- + ; xcorr[i] = sum + STR r14, [r2], #4 + BGT celt_pitch_xcorr_edsp_process_remaining +celt_pitch_xcorr_edsp_done + LDMFD sp!, {r4-r11, pc} + ENDP + +ENDIF + +END diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h new file mode 100644 index 00000000..a07f8ac2 --- /dev/null +++ b/celt/arm/pitch_arm.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2010 Xiph.Org Foundation + * Copyright (c) 2013 Parrot */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#if !defined(PITCH_ARM_H) +# define PITCH_ARM_H + +# include "armcpu.h" + +# if defined(FIXED_POINT) + +# if defined(OPUS_ARM_MAY_HAVE_NEON) +opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); +# endif + +# if defined(OPUS_ARM_MAY_HAVE_MEDIA) +# define celt_pitch_xcorr_media MAY_HAVE_EDSP(celt_pitch_xcorr) +# endif + +# if defined(OPUS_ARM_MAY_HAVE_EDSP) +opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y, + opus_val32 *xcorr, int len, int max_pitch); +# endif + +# if !defined(OPUS_HAVE_RTCD) +# define OVERRIDE_PITCH_XCORR (1) +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \ + ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch)) +# endif + +# endif + +#endif |