 AUTHORS                                           |  1 +
 ChangeLog                                         |  4 ++++
 NEWS                                              |  1 +
 src/lib/evas/common/evas_map_image_core.c         |  3 +++
 src/lib/evas/common/evas_map_image_loop.c         | 90 +++++++++++++++++++---
 src/lib/evas/common/evas_scale_smooth.c           | 44 ++++++++++++++-
 src/lib/evas/common/evas_scale_smooth_scaler_up.c | 26 ++++++++
 src/lib/evas/include/evas_blend_ops.h             | 58 ++++++++++++++++
 8 files changed, 220 insertions(+), 7 deletions(-)
diff --git a/AUTHORS b/AUTHORS
--- a/AUTHORS
+++ b/AUTHORS
@@ -125,6 +125,7 @@
 Patryk Kaczmarek <patryk.k@samsung.com>
 Zbigniew Kosinski <z.kosinski@samsung.com>
 Paulo Cavalcanti <paulo.cavalcanti@linux.intel.com>
 Jean-Philippe Andre <jp.andre@samsung.com>
+Yury Usischev <y.usishchev@samsung.com>
 
 Ecore
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2013-08-02  Yury Usischev
+
+	* Add neon optimizations for several scaling/map routines in evas
+
 2013-08-02  Cedric Bail
 
 	* Evas: change mapping policy for image loader (RANDOM during header,
diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -201,6 +201,7 @@ Improvements:
     - Use eo array of callbacks to reduce callbacks memory footprint of Evas_Object_Box and Evas_Object_Table.
     - Optimized path for when map use the same color for all corner.
     - Asynchronous preload of GL texture.
+    - Add neon assembly for upscaling and map routines
 * Ecore_Con:
     - Rebase dns.c against upstream
 * Edje:
diff --git a/src/lib/evas/common/evas_map_image_core.c b/src/lib/evas/common/evas_map_image_core.c
index 7e44c4b161..6e2be0e30a 100644
--- a/src/lib/evas/common/evas_map_image_core.c
+++ b/src/lib/evas/common/evas_map_image_core.c
@@ -19,6 +19,9 @@
 #ifdef SCALE_USING_MMX
    pxor_r2r(mm0, mm0);
    MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+   FPU_NEON;
+   VMOV_I2R_NEON(q2, #255);
 #endif
    line = &(spans[y - ystart]);
diff --git a/src/lib/evas/common/evas_map_image_loop.c b/src/lib/evas/common/evas_map_image_loop.c
index fc322860aa..a8a49eb7f4 100644
--- a/src/lib/evas/common/evas_map_image_loop.c
+++ b/src/lib/evas/common/evas_map_image_loop.c
@@ -1,13 +1,27 @@
 #ifdef SMOOTH
   {
 # ifdef SCALE_USING_MMX
-# ifdef COLMUL
-# ifdef COLSAME
+#  ifdef COLMUL
+#   ifdef COLSAME
    MOV_P2R(c1, mm7, mm0); // col
-# endif
+#   endif
 #  endif
 # endif
-   while (ww > 0)
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // this part can be done here as c1 and c2 are constants in the cycle
+   FPU_NEON;
+   VMOV_M2R_NEON(d18, c1);
+   VEOR_NEON(q8);
+   VMOV_M2R_NEON(d19, c2);
+   VZIP_NEON(q9, q8);
+   VMOV_R2R_NEON(d19, d16);
+   // here we have c1 and c2 spread through q9 register
+#   endif
+#  endif
+# endif
+   while (ww > 0)
     {
 # ifdef COLBLACK
       *d = 0xff000000; // col
@@ -77,6 +91,41 @@
 #   endif
 #  endif
    MOV_R2P(mm1, *d, mm0);
+# elif defined SCALE_USING_NEON
+   // not sure if we need this condition, but it doesn't affect the result
+   if (val1 | val2 | val3 | val4)
+     {
+        FPU_NEON;
+#  ifdef COLMUL
+        // initialize alpha for interpolation of c1 and c2
+        VDUP_NEON(d15, cv >> 16);
+        // copy c1 and c2 as algorithm will overwrite it
+        VMOV_R2R_NEON(q6, q9);
+        cv += cd; // col
+#  endif
+        VMOV_M2R_NEON(d8, val1);
+        VEOR_NEON(q0);
+        VMOV_M2R_NEON(d9, val3);
+        VMOV_M2R_NEON(d10, val2);
+        VEOR_NEON(q1);
+        VMOV_M2R_NEON(d11, val4);
+        VDUP_NEON(q3, ru);
+        VDUP_NEON(d14, rv);
+        VZIP_NEON(q4, q0);
+        VZIP_NEON(q5, q1);
+        VMOV_R2R_NEON(d9, d0);
+        VMOV_R2R_NEON(d11, d2);
+        // by this point we have all required data in right registers
+        INTERP_256_NEON(q3, q5, q4, q2); // interpolate val1,val2 and val3,val4
+        VSWP_NEON(d9, d12); // move result of val3,val4 interpolation (and c1 if COLMUL is defined) for next step
+        INTERP_256_NEON(q7, q6, q4, q2); // second stage of interpolation, also here c1 and c2 are interpolated
+#  ifdef COLMUL
+        MUL4_SYM_NEON(d8, d9, d4); // do required multiplication
+#  endif
+        VMOV_R2M_NEON(q4, d8, d); // save result to d
+     }
+   else
+     *d = val1;
 # else
    val1 = INTERP_256(ru, val2, val1);
    val3 = INTERP_256(ru, val4, val3);
@@ -102,10 +151,23 @@
 }
 #else
 {
+# ifdef SCALE_USING_NEON
+#  ifdef COLMUL
+#   ifndef COLBLACK
+   // c1 and c2 are constants inside the cycle
+   FPU_NEON;
+   VMOV_M2R_NEON(d10, c1);
+   VEOR_NEON(q0);
+   VMOV_M2R_NEON(d11, c2);
+   VZIP_NEON(q5, q0);
+   VMOV_R2R_NEON(d11, d0);
+#   endif
+#  endif
+# endif
    while (ww > 0)
      {
 # ifdef COLMUL
-# ifndef COLBLACK
+#  ifndef COLBLACK
        DATA32 val1;
 # ifdef COLSAME
 # else
@@ -121,11 +183,27 @@
 # ifdef COLMUL
        val1 = *s; // col
 #  ifdef COLSAME
+#   ifdef SCALE_USING_NEON
        *d = MUL4_SYM(c1, val1);
-#  else
+#   else
+       *d = MUL4_SYM(c1, val1); // XXX: do this in neon
+#   endif
+#  else
+#   ifdef SCALE_USING_NEON
+       FPU_NEON;
+       VMOV_M2R_NEON(d12, val1);
+       VMOV_R2R_NEON(q4, q5);
+       VEOR_NEON(q1);
+       VDUP_NEON(d15, cv >> 16);
+       VZIP_NEON(q6, q1);
+       INTERP_256_NEON(d15, d9, d8, d4); // interpolate c1 and c2
+       MUL4_SYM_NEON(d8, d12, d4); // multiply
+       VMOV_R2M_NEON(q4, d8, d); // save result
+#   else
        cval = INTERP_256((cv >> 16), c2, c1); // col
        *d = MUL4_SYM(cval, val1);
       cv += cd; // col
+#   endif
 #  endif
 # else
        *d = *s;
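The NEON block above computes the same two-stage bilinear blend as the scalar INTERP_256 fallback beneath it: two horizontal blends of the texel pairs with ru, one vertical blend with rv, plus interpolation of the vertex colors c1/c2 in the spare lanes when COLMUL is defined. A scalar sketch of that arithmetic, where interp_256 and bilinear are illustrative stand-ins (not evas's packed INTERP_256 macro) and the rounding of the vsri-based NEON shift is ignored:

#include <stdint.h>

/* Blend two ARGB pixels channel by channel; a runs 0..256, with 256
 * selecting c0 outright (same convention as evas's INTERP_256). */
static uint32_t interp_256(unsigned a, uint32_t c0, uint32_t c1)
{
   uint32_t r = 0;
   for (int sh = 0; sh < 32; sh += 8)
     {
        unsigned x = (c0 >> sh) & 0xff;
        unsigned y = (c1 >> sh) & 0xff;
        r |= (((x * a + y * (256 - a)) >> 8) & 0xff) << sh;
     }
   return r;
}

/* The sampling step of the loop: horizontal blends of val1/val2 and
 * val3/val4 with ru, then one vertical blend with rv - the same work
 * done by the INTERP_256_NEON / VSWP_NEON / INTERP_256_NEON sequence. */
static uint32_t bilinear(uint32_t val1, uint32_t val2,
                         uint32_t val3, uint32_t val4,
                         unsigned ru, unsigned rv)
{
   uint32_t top = interp_256(ru, val2, val1);
   uint32_t bot = interp_256(ru, val4, val3);
   return interp_256(rv, bot, top);
}

Packing val1/val3 and val2/val4 into q registers is what lets a single INTERP_256_NEON perform both horizontal blends at once, which is where the win over the scalar path comes from.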
diff --git a/src/lib/evas/common/evas_scale_smooth.c b/src/lib/evas/common/evas_scale_smooth.c
index 02dbe7d44d..61bda22b0a 100644
--- a/src/lib/evas/common/evas_scale_smooth.c
+++ b/src/lib/evas/common/evas_scale_smooth.c
@@ -97,6 +97,15 @@ scale_calc_a_points(int *p, int s, int d, int c, int cc)
 # include "evas_scale_smooth_scaler.c"
 #endif
 
+#ifdef BUILD_NEON
+# undef SCALE_FUNC
+# undef SCALE_USING_NEON
+# define SCALE_USING_NEON
+# define SCALE_FUNC evas_common_scale_rgba_in_to_out_clip_smooth_neon
+# include "evas_scale_smooth_scaler.c"
+# undef SCALE_USING_NEON
+#endif
+
 #undef SCALE_FUNC
 #define SCALE_FUNC _evas_common_scale_rgba_in_to_out_clip_smooth_c
 #undef SCALE_USING_MMX
@@ -197,6 +206,11 @@ evas_common_scale_rgba_in_to_out_clip_smooth(RGBA_Image *src, RGBA_Image *dst,
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_mmx;
    else
 #endif
+#ifdef BUILD_NEON
+   if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+     cb = evas_common_scale_rgba_in_to_out_clip_smooth_neon;
+   else
+#endif
      cb = evas_common_scale_rgba_in_to_out_clip_smooth_c;
 
    return evas_common_scale_rgba_in_to_out_clip_cb(src, dst, dc,
@@ -223,6 +237,16 @@ evas_common_scale_rgba_smooth_draw(RGBA_Image *src, RGBA_Image *dst, int dst_cli
         dst_region_x, dst_region_y, dst_region_w, dst_region_h);
    else
 #endif
+#ifdef BUILD_NEON
+   if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+     _evas_common_scale_rgba_in_to_out_clip_smooth_neon
+       (src, dst,
+        dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
+        mul_col, render_op,
+        src_region_x, src_region_y, src_region_w, src_region_h,
+        dst_region_x, dst_region_y, dst_region_w, dst_region_h);
+   else
+#endif
      _evas_common_scale_rgba_in_to_out_clip_smooth_c
        (src, dst,
         dst_clip_x, dst_clip_y, dst_clip_w, dst_clip_h,
@@ -263,6 +287,15 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                                        dst_region_w, dst_region_h);
         else
 # endif
+#ifdef BUILD_NEON
+        if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+          evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                            src_region_x, src_region_y,
+                                                            src_region_w, src_region_h,
+                                                            dst_region_x, dst_region_y,
+                                                            dst_region_w, dst_region_h);
+        else
+#endif
           evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
@@ -287,7 +320,16 @@ evas_common_scale_rgba_in_to_out_clip_smooth_do(const Cutout_Rects *reuse,
                                                        dst_region_w, dst_region_h);
         else
 # endif
-          evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
+#ifdef BUILD_NEON
+        if (evas_common_cpu_has_feature(CPU_FEATURE_NEON))
+          evas_common_scale_rgba_in_to_out_clip_smooth_neon(src, dst, dc,
+                                                            src_region_x, src_region_y,
+                                                            src_region_w, src_region_h,
+                                                            dst_region_x, dst_region_y,
+                                                            dst_region_w, dst_region_h);
+        else
+#endif
+          evas_common_scale_rgba_in_to_out_clip_smooth_c(src, dst, dc,
                                                          src_region_x, src_region_y,
                                                          src_region_w, src_region_h,
                                                          dst_region_x, dst_region_y,
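The hunks above follow the template-instantiation pattern already used for the MMX scaler: evas_scale_smooth_scaler.c is included once more under SCALE_USING_NEON with a NEON-specific SCALE_FUNC, and a runtime feature probe selects the variant. A minimal self-contained sketch of that dispatch shape, where cpu_has_neon, scale_c, scale_neon and scale_cb are hypothetical stand-ins for evas_common_cpu_has_feature(CPU_FEATURE_NEON) and the generated scalers:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the runtime probe; evas detects this once at startup. */
static bool cpu_has_neon(void) { return false; }

/* Stand-ins for the variants the template include would generate. */
static void scale_c(int w)    { printf("C scaler, width %d\n", w); }
static void scale_neon(int w) { printf("NEON scaler, width %d\n", w); }

typedef void (*scale_cb)(int width);

int main(void)
{
   scale_cb cb;

   /* Same cascade as evas_common_scale_rgba_in_to_out_clip_smooth():
    * most specialized variant first, plain C as the fallback. */
   if (cpu_has_neon())
     cb = scale_neon;
   else
     cb = scale_c;

   cb(320);
   return 0;
}

Compiling one generic body several times keeps the C and NEON paths line-for-line comparable while the choice between them costs a single branch per draw call.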
diff --git a/src/lib/evas/common/evas_scale_smooth_scaler_up.c b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
index e43e0c7a6c..4b21d598dd 100644
--- a/src/lib/evas/common/evas_scale_smooth_scaler_up.c
+++ b/src/lib/evas/common/evas_scale_smooth_scaler_up.c
@@ -172,6 +172,10 @@
    MOV_A2R(ay, mm4)
    pxor_r2r(mm0, mm0);
    MOV_A2R(ALPHA_255, mm5)
+#elif defined SCALE_USING_NEON
+   FPU_NEON;
+   VDUP_NEON(d12, ay);
+   VMOV_I2R_NEON(q2, #255);
 #endif
    pbuf = buf;  pbuf_end = buf + dst_clip_w;
    sxx = sxx0;
@@ -210,6 +214,28 @@
              INTERP_256_R2R(mm4, mm2, mm1, mm5)
              MOV_R2P(mm1, *pbuf, mm0)
              pbuf++;
+#elif defined SCALE_USING_NEON
+             if (p0 | p1 | p2 | p3)
+               {
+                  FPU_NEON;
+                  VMOV_M2R_NEON(d8, p0);
+                  VEOR_NEON(q0);
+                  VMOV_M2R_NEON(d9, p2);
+                  VMOV_M2R_NEON(d10, p1);
+                  VEOR_NEON(q1);
+                  VMOV_M2R_NEON(d11, p3);
+                  VDUP_NEON(q3, ax);
+                  VZIP_NEON(q4, q0);
+                  VZIP_NEON(q5, q1);
+                  VMOV_R2R_NEON(d9, d0);
+                  VMOV_R2R_NEON(d11, d2);
+                  INTERP_256_NEON(q3, q5, q4, q2);
+                  INTERP_256_NEON(d12, d9, d8, d5);
+                  VMOV_R2M_NEON(q4, d8, pbuf);
+                  pbuf++;
+               }
+             else
+               *pbuf++ = p0;
 #else
              if (p0 | p1) p0 = INTERP_256(ax, p1, p0);
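As in the map loop, the pixels p0..p3 are loaded with VMOV_M2R_NEON and then VZIP_NEON-ed against registers zeroed by VEOR_NEON: interleaving pixel bytes with zero bytes widens each 8-bit channel into its own 16-bit lane, so the 16-bit multiplies and adds in the interpolation macros cannot overflow, and VMOV_R2M_NEON narrows the lanes back on store. A scalar illustration of that unpack/repack, with hypothetical helpers widen_argb/narrow_argb (and ignoring the saturation that vqmovn.u16 would apply):

#include <stdint.h>

/* What VEOR + VZIP achieve above, written out in scalar form: zipping
 * the four bytes of an ARGB pixel with four zero bytes leaves each
 * channel zero-extended in its own 16-bit lane. */
static void widen_argb(uint32_t pixel, uint16_t lane[4])
{
   for (int i = 0; i < 4; i++)
     lane[i] = (pixel >> (8 * i)) & 0xff; /* byte + 0x00 -> one u16 lane */
}

/* The inverse step done by VMOV_R2M_NEON (vqmovn.u16 + vst1.32): take
 * the low byte of each lane and repack one 32-bit pixel. */
static uint32_t narrow_argb(const uint16_t lane[4])
{
   uint32_t p = 0;
   for (int i = 0; i < 4; i++)
     p |= (uint32_t)(lane[i] & 0xff) << (8 * i);
   return p;
}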
diff --git a/src/lib/evas/include/evas_blend_ops.h b/src/lib/evas/include/evas_blend_ops.h
index 0a78843579..3ae94379ec 100644
--- a/src/lib/evas/include/evas_blend_ops.h
+++ b/src/lib/evas/include/evas_blend_ops.h
@@ -186,6 +186,64 @@ extern const DATA32 ALPHA_256;
 
 #endif
 
+/* some useful NEON macros */
+
+#ifdef BUILD_NEON
+#define FPU_NEON \
+ __asm__ __volatile__(".fpu neon \n\t");
+
+/* copy reg2 to reg1 */
+#define VMOV_R2R_NEON(reg1, reg2) \
+ __asm__ __volatile__("vmov " #reg1 ", " #reg2 " \n\t" ::: #reg1);
+
+/* copy 32bit value to lower bits of register reg */
+#define VMOV_M2R_NEON(reg, value) \
+ __asm__ __volatile__("vmov.32 " #reg "[0], %[val] \n\t" :: [val] "r" (value) : #reg);
+
+/* save 32bit value from lower 64 bits of register regq to memory location */
+/* pointed to by pointer, using 64bit register regd as temporary location */
+#define VMOV_R2M_NEON(regq, regd, pointer) \
+ __asm__ __volatile__("vqmovn.u16 " #regd ", " #regq " \n\t" \
+                      "vst1.32 {" #regd "[0]}, [%[p]] \n\t" :: [p] "r" (pointer) : #regd, "memory");
+
+/* spread constant imm in register reg */
+#define VMOV_I2R_NEON(reg, imm) \
+ __asm__ __volatile__("vmov.i16 " #reg ", " #imm " \n\t" ::: #reg);
+
+/* spread value in register reg */
+#define VDUP_NEON(reg, value) \
+ __asm__ __volatile__("vdup.16 " #reg ", %[val] \n\t" :: [val] "r" (value) : #reg);
+
+/* interleave contents of reg1 and reg2 */
+#define VZIP_NEON(reg1, reg2) \
+ __asm__ __volatile__("vzip.8 " #reg1 ", " #reg2 " \n\t" ::: #reg1, #reg2);
+
+/* swap contents of two registers */
+#define VSWP_NEON(reg1, reg2) \
+ __asm__ __volatile__("vswp " #reg1 ", " #reg2 " \n\t" ::: #reg1, #reg2);
+
+/* set register to zero */
+#define VEOR_NEON(reg) \
+ __asm__ __volatile__("veor " #reg ", " #reg ", " #reg " \n\t" ::: #reg);
+
+/* do interpolation of every channel RGBA, result is contained in regy */
+#define INTERP_256_NEON(rega, regx, regy, reg255) \
+ __asm__ __volatile__("vsub.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                      "vmul.u16 " #regx ", " #regx ", " #rega " \n\t" \
+                      "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                      "vadd.i16 " #regx ", " #regx ", " #regy " \n\t" \
+                      "vand " #regy ", " #regx ", " #reg255 " \n\t" \
+                      ::: #regx, #regy);
+
+/* multiply every channel of regx and regy */
+#define MUL4_SYM_NEON(regx, regy, reg255) \
+ __asm__ __volatile__("vmul.u16 " #regx ", " #regx ", " #regy " \n\t" \
+                      "vadd.i16 " #regx ", " #regx ", " #reg255 " \n\t" \
+                      "vsri.16 " #regx ", " #regx ", #8 \n\t" \
+                      "vand " #regx ", " #regx ", " #reg255 " \n\t" \
+                      ::: #regx);
+
+#endif
 
 /* some useful SSE3 inline functions */
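Read one 16-bit lane at a time, the arithmetic macros are small: INTERP_256_NEON is the a-weighted blend already sketched after the map loop diff, and MUL4_SYM_NEON is a symmetric byte multiply. A scalar reading of the latter, with mul_sym_lane as an illustrative stand-in (evas's packed MUL4_SYM macro is defined earlier in this header):

#include <stdint.h>

/* MUL4_SYM_NEON per lane: vmul.u16, vadd.i16 #255, vsri.16 #8, vand.
 * The +255 bias makes the multiply symmetric, so 255 acts as the
 * identity: 255 * x -> x, and in particular 255 * 255 -> 255. */
static uint8_t mul_sym_lane(uint8_t x, uint8_t y)
{
   return (uint8_t)(((x * y + 255) >> 8) & 0xff);
}

The trailing vand against the all-0x00ff register is what keeps each result confined to its byte before the next VZIP_NEON or vqmovn.u16 round trip.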