author     Carsten Haitzler (Rasterman) <raster@rasterman.com>   2019-03-09 15:19:28 +0000
committer  Carsten Haitzler (Rasterman) <raster@rasterman.com>   2019-03-09 15:21:46 +0000
commit     4758f06e637239f981eedbaaf8c0d613b78e4417 (patch)
tree       01e220a40497a9288cc2390fdc37e4e7e9cd5bb9
parent     f234a2b6c77af85c3a272cb417257cf1bc531b4f (diff)
download   efl-4758f06e637239f981eedbaaf8c0d613b78e4417.tar.gz
solve neon rotation issue by moving to the tiled rotator
the tiled rotator is faster no matter what. this will fix D8099 by moving to tiled rotation and nuking the neon code, and we end up being faster anyway in all cases. @fix
-rw-r--r--   configure.ac                                |  20
-rw-r--r--   src/lib/evas/common/evas_convert_rgb_32.c   | 547
2 files changed, 215 insertions(+), 352 deletions(-)
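The gist of the change: instead of hand-tuned NEON, the rotate walks the destination in cache-line-sized strips, so the strided source reads needed for each strip stay inside a small working set. Below is a minimal standalone sketch of that idea, assuming a 64-byte cache line; rot90_strip and rot90_tiled are illustrative names, not from this commit, and the cache-line alignment of the leading/trailing strips done by the real blt_rotated_90_* is omitted for brevity.

   /* sketch: tiled 90-degree rotate, dst(x, y) = src(h - 1 - y, x) */
   #include <stdint.h>

   #define CACHE_LINE 64 /* assumed cache line size (TILE_CACHE_LINE_SIZE in evas) */

   static void
   rot90_strip(uint32_t *dst, int dst_stride,
               const uint32_t *src, int src_stride, int w, int h)
   {
      int x, y;

      for (y = 0; y < h; y++)
        {
           /* bottom source row maps to destination column 0 */
           const uint32_t *s = src + (h - y - 1);
           uint32_t *d = dst + (dst_stride * y);

           for (x = 0; x < w; x++)
             {
                *d++ = *s;       /* sequential dst writes ... */
                s += src_stride; /* ... strided src reads, but only w rows per strip */
             }
        }
   }

   static void
   rot90_tiled(uint32_t *dst, int dst_stride,
               const uint32_t *src, int src_stride, int w, int h)
   {
      const int tile = CACHE_LINE / sizeof(uint32_t); /* pixels per strip */
      int x;

      /* full strips: each covers one cache line of every dst row */
      for (x = 0; x + tile <= w; x += tile)
        rot90_strip(dst + x, dst_stride, src + (src_stride * x),
                    src_stride, tile, h);
      /* remainder strip, if w is not a multiple of the tile width */
      if (x < w)
        rot90_strip(dst + x, dst_stride, src + (src_stride * x),
                    src_stride, w - x, h);
   }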
diff --git a/configure.ac b/configure.ac
index 34d6abb4a0..9157bfd338 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2538,19 +2538,6 @@ AC_ARG_ENABLE([pixman-image-scale-sample],
],
[have_pixman_image_scale_sample="no"])
-# Tile rotate
-AC_ARG_ENABLE([tile-rotate],
- [AS_HELP_STRING([--enable-tile-rotate],[Enable tiled rotate algorithm. @<:@default=disabled@:>@])],
- [
- if test "x${enableval}" = "xyes" ; then
- have_tile_rotate="yes"
- CFOPT_WARNING="xyes"
- else
- have_tile_rotate="no"
- fi
- ],
- [have_tile_rotate="no"])
-
# Ecore Buffer
AC_ARG_ENABLE([ecore-buffer],
[AS_HELP_STRING([--enable-ecore-buffer],[enable ecore-buffer. @<:@default=disabled@:>@])],
@@ -2984,13 +2971,6 @@ AC_CHECK_LIB([m], [lround],
### Configuration
-## Tile rotation
-
-if test "x${have_tile_rotate}" = "xyes" ; then
- AC_DEFINE(TILE_ROTATE, 1, [Enable tiled rotate algorithm])
-fi
-
-
## dither options
AC_ARG_WITH([evas-dither-mask],
diff --git a/src/lib/evas/common/evas_convert_rgb_32.c b/src/lib/evas/common/evas_convert_rgb_32.c
index 89789b2ac5..11671466b2 100644
--- a/src/lib/evas/common/evas_convert_rgb_32.c
+++ b/src/lib/evas/common/evas_convert_rgb_32.c
@@ -1,9 +1,13 @@
#include "evas_common_private.h"
#include "evas_convert_rgb_32.h"
#ifdef BUILD_NEON
-#include <arm_neon.h>
+# include <arm_neon.h>
#endif
+// tiled rotate is faster in every case i've tested, so just use this
+// by default.
+#define TILE_ROTATE 1
+
void
evas_common_convert_rgba_to_32bpp_rgb_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
@@ -19,9 +23,9 @@ evas_common_convert_rgba_to_32bpp_rgb_8888 (DATA32 *src, DATA8 *dst, int src_jum
for (y = 0; y < h; y++)
{
- func(src_ptr, dst_ptr, w);
- src_ptr += w + src_jump;
- dst_ptr += w + dst_jump;
+ func(src_ptr, dst_ptr, w);
+ src_ptr += w + src_jump;
+ dst_ptr += w + dst_jump;
}
return;
}
@@ -44,234 +48,205 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int
}
#ifdef TILE_ROTATE
-#ifdef BUILD_NEON
-#define ROT90_QUAD_COPY_LOOP(pix_type) \
- if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) \
- { \
- if((w%4) == 0) \
- { \
- int klght = 4 * src_stride; \
- for(y = 0; y < h; y++) \
- { \
- const pix_type *s = &(src[(h - y - 1)]); \
- pix_type *d = &(dst[(dst_stride * y)]); \
- pix_type *ptr1 = s; \
- pix_type *ptr2 = ptr1 + src_stride; \
- pix_type *ptr3 = ptr2 + src_stride; \
- pix_type *ptr4 = ptr3 + src_stride; \
- for(x = 0; x < w; x+=4) \
- { \
- pix_type s_array[4] = {*ptr1, *ptr2, *ptr3, *ptr4}; \
- vst1q_s32(d, vld1q_s32(s_array)); \
- d += 4; \
- ptr1 += klght; \
- ptr2 += klght; \
- ptr3 += klght; \
- ptr4 += klght; \
- } \
- } \
+# ifdef BUILD_NEON
+# define ROT90_QUAD_COPY_LOOP(pix_type) \
+ if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) { \
+ if ((w % 4) == 0) { \
+ int klght = 4 * src_stride; \
+ for (y = 0; y < h; y++) { \
+ const pix_type *s = &(src[h - y - 1]); \
+ pix_type *d = &(dst[dst_stride * y]); \
+ pix_type *ptr1 = s; \
+ pix_type *ptr2 = ptr1 + src_stride; \
+ pix_type *ptr3 = ptr2 + src_stride; \
+ pix_type *ptr4 = ptr3 + src_stride; \
+ for(x = 0; x < w; x += 4) { \
+ pix_type s_array[4] = { *ptr1, *ptr2, *ptr3, *ptr4 }; \
+ vst1q_s32(d, vld1q_s32(s_array)); \
+ d += 4; \
+ ptr1 += klght; \
+ ptr2 += klght; \
+ ptr3 += klght; \
+ ptr4 += klght; \
+ } \
+ } \
} \
- else \
- { \
- for (y = 0; y < h; y++) \
- { \
- const pix_type *s = &(src[(h - y - 1)]); \
- pix_type *d = &(dst[(dst_stride * y)]); \
- for (x = 0; x < w; x++) \
- { \
- *d++ = *s; \
- s += src_stride; \
- } \
- } \
+ else { \
+ for (y = 0; y < h; y++) { \
+ const pix_type *s = &(src[h - y - 1]); \
+ pix_type *d = &(dst[dst_stride * y]); \
+ for (x = 0; x < w; x++) { \
+ *d++ = *s; \
+ s += src_stride; \
+ } \
+ } \
} \
} \
else
-#define ROT270_QUAD_COPY_LOOP(pix_type) \
- if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) \
- { \
- if((w%4) == 0) \
- { \
- int klght = 4 * src_stride; \
- for(y = 0; y < h; y++) \
- { \
- const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
- pix_type *d = &(dst[(dst_stride * y)]); \
- pix_type *ptr1 = s; \
- pix_type *ptr2 = ptr1 + src_stride; \
- pix_type *ptr3 = ptr2 + src_stride; \
- pix_type *ptr4 = ptr3 + src_stride; \
- for(x = 0; x < w; x+=4) \
- { \
- pix_type s_array[4] = {*ptr1, *ptr2, *ptr3, *ptr4}; \
- vst1q_s32(d, vld1q_s32(s_array)); \
- d += 4; \
- ptr1 += klght; \
- ptr2 += klght; \
- ptr3 += klght; \
- ptr4 += klght; \
- } \
- } \
+# define ROT270_QUAD_COPY_LOOP(pix_type) \
+ if (evas_common_cpu_has_feature(CPU_FEATURE_NEON)) { \
+ if ((w % 4) == 0) { \
+ int klght = 4 * src_stride; \
+ for (y = 0; y < h; y++) { \
+ const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
+ pix_type *d = &(dst[dst_stride * y]); \
+ pix_type *ptr1 = s; \
+ pix_type *ptr2 = ptr1 + src_stride; \
+ pix_type *ptr3 = ptr2 + src_stride; \
+ pix_type *ptr4 = ptr3 + src_stride; \
+ for(x = 0; x < w; x+=4) { \
+ pix_type s_array[4] = { *ptr1, *ptr2, *ptr3, *ptr4 }; \
+ vst1q_s32(d, vld1q_s32(s_array)); \
+ d += 4; \
+ ptr1 += klght; \
+ ptr2 += klght; \
+ ptr3 += klght; \
+ ptr4 += klght; \
+ } \
+ } \
} \
- else \
- { \
- for (y = 0; y < h; y++) \
- { \
- const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
- pix_type *d = &(dst[(dst_stride * y)]); \
- for (x = 0; x < w; x++) \
- { \
- *d++ = *s; \
- s += src_stride; \
- } \
- } \
+ else { \
+ for (y = 0; y < h; y++) { \
+ const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
+ pix_type *d = &(dst[dst_stride * y]); \
+ for (x = 0; x < w; x++) { \
+ *d++ = *s; \
+ s += src_stride; \
+ } \
+ } \
} \
} \
else
-#else
-#define ROT90_QUAD_COPY_LOOP(pix_type)
-#define ROT270_QUAD_COPY_LOOP(pix_type)
-#endif
-#define FAST_SIMPLE_ROTATE(suffix, pix_type) \
+# else
+# define ROT90_QUAD_COPY_LOOP(pix_type)
+# define ROT270_QUAD_COPY_LOOP(pix_type)
+# endif
+
+# define FAST_SIMPLE_ROTATE(suffix, pix_type) \
static void \
- blt_rotated_90_trivial_##suffix(pix_type * restrict dst, \
- int dst_stride, \
+ blt_rotated_90_trivial_##suffix(pix_type * restrict dst, \
+ int dst_stride, \
const pix_type * restrict src, \
- int src_stride, \
- int w, \
- int h) \
+ int src_stride, \
+ int w, \
+ int h) \
{ \
int x, y; \
- ROT90_QUAD_COPY_LOOP(pix_type) \
- { \
- for (y = 0; y < h; y++) \
- { \
- const pix_type *s = &(src[(h - y - 1)]); \
- pix_type *d = &(dst[(dst_stride * y)]); \
- for (x = 0; x < w; x++) \
- { \
- *d++ = *s; \
- s += src_stride; \
- } \
- } \
+ ROT90_QUAD_COPY_LOOP(pix_type) { \
+ for (y = 0; y < h; y++) { \
+ const pix_type *s = &(src[h - y - 1]); \
+ pix_type *d = &(dst[dst_stride * y]); \
+ for (x = 0; x < w; x++) { \
+ *d++ = *s; \
+ s += src_stride; \
+ } \
+ } \
} \
} \
static void \
- blt_rotated_270_trivial_##suffix(pix_type * restrict dst, \
- int dst_stride, \
+ blt_rotated_270_trivial_##suffix(pix_type * restrict dst, \
+ int dst_stride, \
const pix_type * restrict src, \
- int src_stride, \
- int w, \
- int h) \
+ int src_stride, \
+ int w, \
+ int h) \
{ \
int x, y; \
- ROT270_QUAD_COPY_LOOP(pix_type) \
- { \
- for(y = 0; y < h; y++) \
- { \
- const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
- pix_type *d = &(dst[(dst_stride * y)]); \
- for (x = 0; x < w; x++) \
- { \
- *d++ = *s; \
- s -= src_stride; \
- } \
- } \
+ ROT270_QUAD_COPY_LOOP(pix_type) { \
+ for (y = 0; y < h; y++) { \
+ const pix_type *s = &(src[(src_stride * (w - 1)) + y]); \
+ pix_type *d = &(dst[dst_stride * y]); \
+ for (x = 0; x < w; x++) { \
+ *d++ = *s; \
+ s -= src_stride; \
+ } \
+ } \
} \
} \
static void \
- blt_rotated_90_##suffix(pix_type * restrict dst, \
- int dst_stride, \
+ blt_rotated_90_##suffix(pix_type * restrict dst, \
+ int dst_stride, \
const pix_type * restrict src, \
- int src_stride, \
- int w, \
- int h) \
+ int src_stride, \
+ int w, \
+ int h) \
{ \
int x, leading_pixels = 0, trailing_pixels = 0; \
const int TILE_SIZE = TILE_CACHE_LINE_SIZE / sizeof(pix_type); \
- if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) \
- { \
- leading_pixels = TILE_SIZE - \
- (((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (leading_pixels > w) \
- leading_pixels = w; \
- blt_rotated_90_trivial_##suffix(dst, \
- dst_stride, \
- src, \
- src_stride, \
- leading_pixels, \
- h); \
- dst += leading_pixels; \
- src += leading_pixels * src_stride; \
- w -= leading_pixels; \
- } \
- if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) \
- { \
- trailing_pixels = (((uintptr_t)(dst + w) & \
- (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (trailing_pixels > w) \
- trailing_pixels = w; \
- w -= trailing_pixels; \
- } \
- for (x = 0; x < w; x += TILE_SIZE) \
- { \
- blt_rotated_90_trivial_##suffix(dst + x, \
- dst_stride, \
- &(src[(src_stride * x)]), \
- src_stride, \
- TILE_SIZE, \
- h); \
- } \
+ if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) { \
+ leading_pixels = TILE_SIZE - \
+ (((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
+ if (leading_pixels > w) leading_pixels = w; \
+ blt_rotated_90_trivial_##suffix(dst, \
+ dst_stride, \
+ src, \
+ src_stride, \
+ leading_pixels, \
+ h); \
+ dst += leading_pixels; \
+ src += leading_pixels * src_stride; \
+ w -= leading_pixels; \
+ } \
+ if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) { \
+ trailing_pixels = (((uintptr_t)(dst + w) & \
+ (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
+ if (trailing_pixels > w) trailing_pixels = w; \
+ w -= trailing_pixels; \
+ } \
+ for (x = 0; x < w; x += TILE_SIZE) { \
+ blt_rotated_90_trivial_##suffix(dst + x, \
+ dst_stride, \
+ &(src[src_stride * x]), \
+ src_stride, \
+ TILE_SIZE, \
+ h); \
+ } \
if (trailing_pixels) \
blt_rotated_90_trivial_##suffix(dst + w, \
dst_stride, \
- &(src[(w * src_stride)]), \
+ &(src[src_stride * w]), \
src_stride, \
trailing_pixels, \
h); \
} \
static void \
- blt_rotated_270_##suffix(pix_type * restrict dst, \
- int dst_stride, \
+ blt_rotated_270_##suffix(pix_type * restrict dst, \
+ int dst_stride, \
const pix_type * restrict src, \
- int src_stride, \
- int w, \
- int h) \
+ int src_stride, \
+ int w, \
+ int h) \
{ \
int x, leading_pixels = 0, trailing_pixels = 0; \
const int TILE_SIZE = TILE_CACHE_LINE_SIZE / sizeof(pix_type); \
- if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) \
- { \
- leading_pixels = TILE_SIZE - \
- (((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (leading_pixels > w) \
- leading_pixels = w; \
- blt_rotated_270_trivial_##suffix(dst, \
- dst_stride, \
- &(src[(src_stride * (w - leading_pixels))]), \
- src_stride, \
- leading_pixels, \
- h); \
- dst += leading_pixels; \
- w -= leading_pixels; \
- } \
- if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) \
- { \
- trailing_pixels = (((uintptr_t)(dst + w) & \
- (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
- if (trailing_pixels > w) \
- trailing_pixels = w; \
- w -= trailing_pixels; \
- src += trailing_pixels * src_stride; \
- } \
- for (x = 0; x < w; x += TILE_SIZE) \
- { \
- blt_rotated_270_trivial_##suffix(dst + x, \
- dst_stride, \
- &(src[(src_stride * (w - x - TILE_SIZE))]), \
- src_stride, \
- TILE_SIZE, \
- h); \
- } \
+ if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) { \
+ leading_pixels = TILE_SIZE - \
+ (((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
+ if (leading_pixels > w) leading_pixels = w; \
+ blt_rotated_270_trivial_##suffix(dst, \
+ dst_stride, \
+ &(src[src_stride * (w - leading_pixels)]), \
+ src_stride, \
+ leading_pixels, \
+ h); \
+ dst += leading_pixels; \
+ w -= leading_pixels; \
+ } \
+ if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) { \
+ trailing_pixels = (((uintptr_t)(dst + w) & \
+ (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \
+ if (trailing_pixels > w) trailing_pixels = w; \
+ w -= trailing_pixels; \
+ src += trailing_pixels * src_stride; \
+ } \
+ for (x = 0; x < w; x += TILE_SIZE) { \
+ blt_rotated_270_trivial_##suffix(dst + x, \
+ dst_stride, \
+ &(src[src_stride * (w - x - TILE_SIZE)]), \
+ src_stride, \
+ TILE_SIZE, \
+ h); \
+ } \
if (trailing_pixels) \
blt_rotated_270_trivial_##suffix(dst + w, \
dst_stride, \
@@ -288,12 +263,13 @@ void
evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
#ifdef TILE_ROTATE
- blt_rotated_270_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ;
+ blt_rotated_270_8888((DATA32 *)dst, dst_jump + w,
+ src, src_jump + h,
+ w, h);
#else
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+   DATA32 *src_ptr, *dst_ptr;
int x, y;
-
+
dst_ptr = (DATA32 *)dst;
CONVERT_LOOP_START_ROT_270();
@@ -305,15 +281,32 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int
return;
}
+/* speed measuring code - enable when optimizing to compare
+#include <time.h>
+static double
+get_time(void)
+{
+ struct timespec t;
+
+ clock_gettime(CLOCK_MONOTONIC, &t);
+ return (double)t.tv_sec + (((double)t.tv_nsec) / 1000000000.0);
+}
+*/
+
void
evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
+/*
+ static double tt = 0.0;
+ static unsigned long long pt = 0;
+ double t0 = get_time();
+ */
#ifdef TILE_ROTATE
- blt_rotated_90_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ;
+ blt_rotated_90_8888((DATA32 *)dst, dst_jump + w,
+ src, src_jump + h,
+ w, h);
#else
-# ifndef BUILD_NEON
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -322,117 +315,19 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int
*dst_ptr = *src_ptr;
CONVERT_LOOP_END_ROT_90();
-# elif defined BUILD_NEON_INTRINSICS
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
- int x, y;
-
- dst_ptr = (DATA32 *)dst;
- CONVERT_LOOP_START_ROT_90();
-
- *dst_ptr = *src_ptr;
-
- CONVERT_LOOP_END_ROT_90();
-# else
- if ((w & 1) || (h & 1))
- {
- /* Rarely (if ever) if ever: so slow path is fine */
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
- int x, y;
-
- dst_ptr = (DATA32 *)dst;
- CONVERT_LOOP_START_ROT_90();
-
- *dst_ptr = *src_ptr;
-
- CONVERT_LOOP_END_ROT_90();
- }
- else
- {
-# define AP "convert_rgba32_rot_90_"
- asm volatile (
- ".fpu neon \n\t"
- " mov %[s1], %[src] \n\t"
- " add %[s1], %[s1], %[h],lsl #2 \n\t"
- " sub %[s1], #8 \n\t"
-
- " mov %[s2], %[src] \n\t"
- " add %[s2], %[s2], %[h], lsl #3 \n\t"
- " add %[s2], %[s2], %[sjmp], lsr #1 \n\t"
- " sub %[s2], #8 \n\t"
-
- " mov %[d1], %[dst] \n\t"
-
- " add %[d2], %[d1], %[djmp] \n\t"
- " add %[d2], %[d2], %[w], lsl #2 \n\t"
-
- " mov %[sadv], %[h], lsl #3 \n\t"
- " add %[sadv], %[sadv], %[sjmp], lsl #1\n\t"
-
- " mov %[y], #0 \n\t"
- " mov %[x], #0 \n\t"
- AP"loop: \n\t"
- " vld1.u32 d0, [%[s1]] \n\t"
- " vld1.u32 d1, [%[s2]] \n\t"
- " add %[x], #2 \n\t"
- " add %[s1], %[sadv] \n\t"
- " add %[s2], %[sadv] \n\t"
- " vtrn.u32 d0, d1 \n\t"
- " cmp %[x], %[w] \n\t"
- " vst1.u32 d1, [%[d1]]! \n\t"
- " vst1.u32 d0, [%[d2]]! \n\t"
- " blt "AP"loop \n\t"
-
- " mov %[x], #0 \n\t"
- " add %[d1], %[djmp] \n\t"
- " add %[d1], %[d1], %[w], lsl #2 \n\t"
- " add %[d2], %[djmp] \n\t"
- " add %[d2], %[d2], %[w], lsl #2 \n\t"
-
- " mov %[s1], %[src] \n\t"
- " add %[s1], %[s1], %[h], lsl #2 \n\t"
- " sub %[s1], %[s1], %[y], lsl #2 \n\t"
- " sub %[s1], #16 \n\t"
-
- " add %[s2], %[s1], %[h], lsl #2 \n\t"
- " add %[s2], %[s2], %[sjmp], lsl #2 \n\t"
-
- " add %[y], #2 \n\t"
-
- " cmp %[y], %[h] \n\t"
- " blt "AP"loop \n\t"
-
- : // Out
- : [s1] "r" (1),
- [s2] "r" (11),
- [d1] "r" (2),
- [d2] "r" (12),
- [src] "r" (src),
- [dst] "r" (dst),
- [x] "r" (3),
- [y] "r" (4),
- [w] "r" (w),
- [h] "r" (h),
- [sadv] "r" (5),
- [sjmp] "r" (src_jump * 4),
- [djmp] "r" (dst_jump * 4 * 2)
- : "d0", "d1", "memory", "cc"// Clober
-
-
- );
- }
-# undef AP
-# endif
#endif
- return;
+/*
+ double t1 = get_time();
+ tt += t1 - t0;
+ pt += (w * h);
+ printf("%1.2f mpix/sec (%1.9f @ %1.9f)\n", (double)pt / (tt * 1000000), tt, t1);
+*/
}
void
evas_common_convert_rgba_to_32bpp_rgbx_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -449,8 +344,7 @@ evas_common_convert_rgba_to_32bpp_rgbx_8888 (DATA32 *src, DATA8 *dst, int src_ju
void
evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_180 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -467,8 +361,7 @@ evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_180 (DATA32 *src, DATA8 *dst, in
void
evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -485,8 +378,7 @@ evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_270 (DATA32 *src, DATA8 *dst, in
void
evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -503,8 +395,7 @@ evas_common_convert_rgba_to_32bpp_rgbx_8888_rot_90 (DATA32 *src, DATA8 *dst, int
void
evas_common_convert_rgba_to_32bpp_bgr_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -520,8 +411,7 @@ evas_common_convert_rgba_to_32bpp_bgr_8888 (DATA32 *src, DATA8 *dst, int src_jum
void
evas_common_convert_rgba_to_32bpp_bgr_8888_rot_180 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -537,8 +427,7 @@ evas_common_convert_rgba_to_32bpp_bgr_8888_rot_180 (DATA32 *src, DATA8 *dst, int
void
evas_common_convert_rgba_to_32bpp_bgr_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -554,8 +443,7 @@ evas_common_convert_rgba_to_32bpp_bgr_8888_rot_270 (DATA32 *src, DATA8 *dst, int
void
evas_common_convert_rgba_to_32bpp_bgr_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -571,8 +459,7 @@ evas_common_convert_rgba_to_32bpp_bgr_8888_rot_90 (DATA32 *src, DATA8 *dst, int
void
evas_common_convert_rgba_to_32bpp_bgrx_8888 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -588,8 +475,7 @@ evas_common_convert_rgba_to_32bpp_bgrx_8888 (DATA32 *src, DATA8 *dst, int src_ju
void
evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_180 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -605,8 +491,7 @@ evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_180 (DATA32 *src, DATA8 *dst, in
void
evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -622,8 +507,7 @@ evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_270 (DATA32 *src, DATA8 *dst, in
void
evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;
@@ -639,8 +523,7 @@ evas_common_convert_rgba_to_32bpp_bgrx_8888_rot_90 (DATA32 *src, DATA8 *dst, int
void
evas_common_convert_rgba_to_32bpp_rgb_666(DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED)
{
- DATA32 *src_ptr;
- DATA32 *dst_ptr;
+ DATA32 *src_ptr, *dst_ptr;
int x, y;
dst_ptr = (DATA32 *)dst;