[Vm-dev] [commit][3139] update some ARM specific bitblt code for the new pixel/color compare prim

commits at squeakvm.org commits at squeakvm.org
Fri Nov 21 03:38:53 UTC 2014


Revision: 3139
Author:   rowledge
Date:     2014-11-20 19:38:51 -0800 (Thu, 20 Nov 2014)
Log Message:
-----------
update some ARM specific bitblt code for the new pixel/color compare prim

Modified Paths:
--------------
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h

Added Paths:
-----------
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c	2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c	2014-11-21 03:38:51 UTC (rev 3139)
@@ -257,8 +257,57 @@
 		{ fastPathBitAnd32_32,           CR_bitAnd,     STD_FLAGS(32,32,NO,NO) },
 };
 
+#define TALLY_FAST_PATH(op, srcA_bpp, srcB_bpp) /* declares 3 external tally routines and defines their static C wrapper */                     \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide  (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny  (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+static uint32_t tallyFastPath##op##srcA_bpp##_##srcB_bpp(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB)                         \
+{                                                                                                                                               \
+    IGNORE(log2bppA); /* the depths used below are the macro arguments srcA_bpp/srcB_bpp */                                                     \
+    IGNORE(log2bppB);                                                                                                                           \
+    COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t); /* "op" is a macro argument: it and the parameter name expand identically */             \
+    /* Get pointers to the first 32-bit word touched in each source */                                                                          \
+    uint32_t *srcA = srcABits + srcAPitch * srcAY + srcAX * srcA_bpp / 32;                                                                      \
+    uint32_t *srcB = srcBBits + srcBPitch * srcBY + srcBX * srcB_bpp / 32;                                                                      \
+    /* Get initial pixel offset within words, mangle into pitch if possible */                                                                  \
+    uint32_t bitPtrs = 0; /* carries the offsets for depths below 8bpp: A in the low bits, B from bit 27 */                                     \
+    uint32_t srcAXpix = srcAX & (31 / srcA_bpp);                                                                                                \
+    if (srcA_bpp < 8)                                                                                                                           \
+        bitPtrs = srcAXpix;                                                                                                                     \
+    else if (srcA_bpp == 8 || srcA_bpp == 16)                                                                                                   \
+        srcAPitch |= srcAXpix << 30; /* at 8/16bpp the offset is at most 3, so it goes in the top 2 bits of the pitch */                       \
+    uint32_t srcBXpix = srcBX & (31 / srcB_bpp);                                                                                                \
+    if (srcB_bpp < 8)                                                                                                                           \
+        bitPtrs |= srcBXpix << 27;                                                                                                              \
+    else if (srcB_bpp == 8 || srcB_bpp == 16)                                                                                                   \
+        srcBPitch |= srcBXpix << 30;                                                                                                            \
+    /* Adjust strides to remove number of words partially or wholly read/written */                                                             \
+    srcAPitch -= (srcA_bpp * (srcAXpix + width) + 31) / 32;                                                                                     \
+    srcBPitch -= (srcB_bpp * (srcBXpix + width) + 31) / 32;                                                                                     \
+    /* Work out which width class this operation is.                                                                                            \
+     * Rather than re-evaluate this for each line, we want one choice                                                                           \
+     * for the whole operation; this means we can't assume anything about                                                                       \
+     * alignment to sizes larger than 4 bytes, because that's the only                                                                          \
+     * guarantee we have about line stride. */                                                                                                  \
+    if (width > (128-32)/srcA_bpp && (((srcAXpix-1) ^ (srcAXpix+width-(128-32)/srcA_bpp)) &~ (31/srcA_bpp)))                                    \
+        return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs);   \
+    else if (srcA_bpp > 8 || (((srcAXpix-1) ^ (srcAXpix+width)) &~ (31/srcA_bpp)))                                                              \
+        return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs); \
+    else                                                                                                                                        \
+        return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs);   \
+}
+
+#define ADD_TALLY_FN(op, srcA_bpp, srcB_bpp) /* register tallyFastPath<op><A>_<B> in compareColorsFns */ \
+    do { compareColorsFns[(((MR_##op * 2) + 1) * 3 + /* "+ 1" is the tally slot of the match rule */ \
+            (srcA_bpp == 8 ? 0 : srcA_bpp == 16 ? 1 : 2)) * 3 + /* depth index: 8 -> 0, 16 -> 1, anything else -> 2 */ \
+            (srcB_bpp == 8 ? 0 : srcB_bpp == 16 ? 1 : 2)] =      \
+            tallyFastPath##op##srcA_bpp##_##srcB_bpp; } while(0)
+
+TALLY_FAST_PATH(pixelMatch, 32, 32)
+
 void addArmSimdFastPaths(void)
 {
 	addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths);
+
+	ADD_TALLY_FN(pixelMatch, 32, 32);
 }
-

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr	2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr	2014-11-21 03:38:51 UTC (rev 3139)
@@ -41,18 +41,20 @@
 
 FLAG_DST_WRITEONLY              *       0 :SHL: 3
 FLAG_DST_READWRITE              *       1 :SHL: 3
-FLAG_SPILL_NO_LINE_VARS         *       0 :SHL: 4
-FLAG_SPILL_LINE_VARS_WIDE       *       1 :SHL: 4
-FLAG_SPILL_LINE_VARS_NON_WIDE   *       2 :SHL: 4
-FLAG_SPILL_LINE_VARS            *       3 :SHL: 4
-FLAG_EXPAND_SKEW                *       0 :SHL: 6
-FLAG_NO_EXPAND_SKEW             *       1 :SHL: 6
-FLAG_PROCESS_SERIAL             *       0 :SHL: 7  ; sub-word data is presented MS-aligned, and results are expected LS-aligned
-FLAG_PROCESS_PARALLEL           *       1 :SHL: 7  ; sub-word data retains its original alignment throughout (only useful if src & dest depths same)
-FLAG_MAX_128BIT_MACRO           *       0 :SHL: 8
-FLAG_MAX_256BIT_MACRO           *       1 :SHL: 8  ; particularly tight loops can sometimes benefit from being unrolled to allow 2x 128-bit blocks to be staggered
-FLAG_PRELOAD_DST                *       0 :SHL: 9
-FLAG_NO_PRELOAD_DST             *       1 :SHL: 9
+FLAG_DST_READONLY               *       2 :SHL: 3
+FLAG_DST_ACCESS                 *       3 :SHL: 3
+FLAG_SPILL_NO_LINE_VARS         *       0 :SHL: 5
+FLAG_SPILL_LINE_VARS_WIDE       *       1 :SHL: 5
+FLAG_SPILL_LINE_VARS_NON_WIDE   *       2 :SHL: 5
+FLAG_SPILL_LINE_VARS            *       3 :SHL: 5
+FLAG_EXPAND_SKEW                *       0 :SHL: 7
+FLAG_NO_EXPAND_SKEW             *       1 :SHL: 7
+FLAG_PROCESS_SERIAL             *       0 :SHL: 8  ; sub-word data is presented MS-aligned, and results are expected LS-aligned
+FLAG_PROCESS_PARALLEL           *       1 :SHL: 8  ; sub-word data retains its original alignment throughout (only useful if src & dest depths same)
+FLAG_MAX_128BIT_MACRO           *       0 :SHL: 9
+FLAG_MAX_256BIT_MACRO           *       1 :SHL: 9  ; particularly tight loops can sometimes benefit from being unrolled to allow 2x 128-bit blocks to be staggered
+FLAG_PRELOAD_DST                *       0 :SHL: 10
+FLAG_NO_PRELOAD_DST             *       1 :SHL: 10
 
 ; Offsets into stack
         GBLA    args_stack_offset
@@ -676,6 +678,7 @@
 
         MACRO
 $lab    WriteFirstSubWord $base, $data, $pixels, $tmp1, $tmp2
+        ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
         ; It is assumed that there is at least 1 pixel to write
         LCLS    reg0
 reg0    LookupWk $data
@@ -719,6 +722,7 @@
 
         MACRO
 $lab    WriteLastSubWord $base, $data, $pixels, $aligned, $tmp1, $tmp2
+        ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
         ; It is assumed that there is at least 1 pixel to write
         LCLS    reg0
 reg0    LookupWk $data
@@ -810,7 +814,8 @@
 reg0    LookupWk $first
         Print   Data, "Write1Word: %08X @%p\n", $reg0, $base
 $lab
-     IF (flags :AND: FLAG_DST_READWRITE) > 0
+        ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
+     IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
         STR     $reg0, [$base, #-4] ; base is assumed previously updated during read
    ELSE
         STR     $reg0, [$base], #4
@@ -829,7 +834,8 @@
       ]
         Print   Data, "Write2Words: %08X %08X @%p\n", $reg0, $reg1, $base
 $lab
-     IF (flags :AND: FLAG_DST_READWRITE) > 0
+        ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
+     IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
         STMDB   $base, {$reg0, $reg1} ; base is assumed previously updated during read
    ELSE
         STMIA   $base!, {$reg0, $reg1}
@@ -861,7 +867,8 @@
         Print   Data, "Write4Words: %08X %08X", $reg0, $reg1
         Print   Data, " %08X %08X @%p\n", $reg2, $reg3, $base
 $lab
-     IF (flags :AND: FLAG_DST_READWRITE) > 0
+        ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
+     IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
         STMDB   $base, {$reg0, $reg1, $reg2, $reg3} ; base is assumed previously updated during read
    ELSE
         STMIA   $base!, {$reg0, $reg1, $reg2, $reg3}
@@ -876,11 +883,13 @@
   [ dst_w_bpp < 32
         ANDS    scratch, $pixels, #32/dst_w_bpp - 1
         BEQ     %FT02
-      [ flags :AND: FLAG_DST_READWRITE > 0
+     IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
         LDR     $wk1, [dst]
-      ]
+   ELIF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READONLY
+        LDR     $wk1, [dst], #4
+  ENDIF
     [ flags :AND: FLAG_PROCESS_PARALLEL = 0
-      [ flags :AND: FLAG_DST_READWRITE > 0
+      [ (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_WRITEONLY
         MOV     scratch, scratch, LSL #dst_bpp_shift
         MOV     $wk1, $wk1, ROR scratch
       ]
@@ -895,8 +904,11 @@
 01
 pow2    SETA    pow2 * 2
         WEND
+      [ (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
         WriteFirstSubWord dst, 1, $pixels, scratch, $wk2
+      ]
     |
+        ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY ; Can't really do read-only 32-bit processing without an indication of which bits are valid
         $prefix._32bits $wk0, $wk1, $fixed_skew ; and return result in $wk0
         WriteFirstSubWord dst, 0, $pixels, scratch, $wk2
     ]
@@ -945,9 +957,11 @@
   [ dst_w_bpp < 32
         TST     $pixels, #32/dst_w_bpp - 1
         BEQ     %FT02
-      [ flags :AND: FLAG_DST_READWRITE > 0
+     IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
         LDR     $wk1, [dst]
-      ]
+   ELIF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READONLY
+        LDR     $wk1, [dst], #4
+  ENDIF
     [ flags :AND: FLAG_PROCESS_PARALLEL = 0
         LCLA    pow2
         LCLS    pow2str
@@ -968,8 +982,11 @@
 01
 pow2    SETA    pow2 / 2
         WEND
+      [ (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
         WriteLastSubWord dst, 1, $pixels, ls_aligned, scratch, $wk0
+      ]
     |
+        ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY ; Can't really do read-only 32-bit processing without an indication of which bits are valid
         ReadLastSubWord src, 0, carry, $pixels, $fixed_skew, skew, scratch
         $prefix._32bits $wk0, $wk1, $fixed_skew ; and return result in $wk0
         WriteLastSubWord dst, 0, $pixels, ms_aligned, scratch, $wk1
@@ -1386,7 +1403,7 @@
 
 src_bpp_shift   Log2    src_bpp
 dst_bpp_shift   Log2    dst_w_bpp
-      [ flags :AND: FLAG_DST_READWRITE = 0
+      [ (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_WRITEONLY
 dst_r_bpp       SETA    0
       |
 dst_r_bpp       SETA    dst_w_bpp
@@ -1520,7 +1537,7 @@
 dst_prefetch_offset SETA 0
       ]
 52
-        WHILE   dst_prefetch_offset <= 0
+        WHILE   dst_prefetch_offset <> 16
 subblock SETA   0
         WHILE   subblock < pix_per_block*dst_w_bpp/128
       [ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0
@@ -1573,7 +1590,7 @@
         WEND
         SUBS    x, x, #pix_per_block
         BHS     %BT52
-      [ dst_prefetch_offset < 0
+      [ dst_prefetch_offset = -16
         B       %FT55
 54
       ]

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s	2014-11-21 03:38:51 UTC (rev 3139)
@@ -0,0 +1,102 @@
+;
+; Copyright © 2014 Raspberry Pi Foundation
+; Copyright © 2014 RISC OS Open Ltd
+;
+; Permission to use, copy, modify, distribute, and sell this software and its
+; documentation for any purpose is hereby granted without fee, provided that
+; the above copyright notice appear in all copies and that both that
+; copyright notice and this permission notice appear in supporting
+; documentation, and that the name of the copyright holders not be used in
+; advertising or publicity pertaining to distribution of the software without
+; specific, written prior permission.  The copyright holders make no
+; representations about the suitability of this software for any purpose.  It
+; is provided "as is" without express or implied warranty.
+;
+; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+; SOFTWARE.
+;
+
+; Debug options
+                GBLL    DebugData
+;DebugData       SETL    {TRUE}
+                GBLL    DebugPld
+;DebugPld        SETL    {TRUE}
+                GBLL    VerboseBuild
+;VerboseBuild    SETL    {TRUE}
+
+        GET     BitBltArmSimdAsm.hdr
+
+        AREA    |BitBltArmSimdCompare$$Code|, CODE, READONLY
+        ARM
+
+; We use the two halftone arguments/registers to hold the two comparison colours
+; and the map register to hold the hit count.
+; Source A is referenced by dst/stride_d.
+; Source B is referenced by src/stride_s.
+
+; ********************************************************************
+
+        MACRO
+        pixelMatchTally32_32_init
+        MOV     map, #0 ; start with a hit count of zero (map holds the hit count, see the register notes above)
+        MEND
+
+        MACRO
+        pixelMatchTally32_32_cleanup
+        MOV     a1, map ; copy the final hit count into a1 - presumably the function's return register; confirm against the calling convention
+        MEND
+
+        MACRO
+        pixelMatchTally32_32_1pixel $srcA, $srcB
+        EOR     $srcA, $srcA, ht ; zero iff the source A pixel equals the colour held in ht ($srcA is overwritten)
+        CLZ     $srcA, $srcA ; bit 5 set => all bits were 0 (CLZ gives 32 only for a zero input)
+        TEQ     $srcB, ht_info ; Z set iff the source B pixel equals the colour held in ht_info
+        ADDEQ   map, map, $srcA, LSR #5 ; only when B matched: add 1 if A matched as well (32 >> 5), otherwise add 0
+        MEND
+
+        MACRO
+        pixelMatchTally32_32_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0 ; source A is addressed through dst; presumably loads one word into work register 0 - confirm against Read1Word
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch ; source B is addressed through src; presumably loads into work register 1
+        pixelMatchTally32_32_1pixel $wk0, $wk1 ; tally the A/B pair
+        MEND
+
+        MACRO
+        pixelMatchTally32_32_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0 ; two source A words; presumably into work registers 0-1 - confirm against Read2Words
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch ; two source B words; presumably into work registers 2-3
+        pixelMatchTally32_32_1pixel $wk0, $wk2 ; each A word is paired with the B word two registers further on
+        pixelMatchTally32_32_1pixel $wk1, $wk3
+        MEND
+
+        MACRO
+        pixelMatchTally32_32_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words dst, 0,, 0 ; four source A words; presumably into work registers 0-3 - confirm against Read4Words
+        Read4Words src, 4, carry, $fixed_skew, skew, scratch ; four source B words; presumably into work registers 4-7
+        MEND
+
+        MACRO
+        pixelMatchTally32_32_128bits_tail $src
+        pixelMatchTally32_32_1pixel $wk0, $wk4 ; tally four A/B pairs: work registers 0-3 against 4-7
+        pixelMatchTally32_32_1pixel $wk1, $wk5
+        pixelMatchTally32_32_1pixel $wk2, $wk6
+        pixelMatchTally32_32_1pixel $wk3, $wk7
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+pixelMatchTally GenerateFunctions 32, 32,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+  "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+        END

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c	2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c	2014-11-21 03:38:51 UTC (rev 3139)
@@ -90,7 +90,10 @@
 static const unsigned int  maskTable85[4] = { 0xF80000, 0x00F800, 0x0000F8, 0x000000 };
 static const          int shiftTable85[4] = {       -9,       -6,       -3,        0 };
 
+/** The dispatch table for >= 8bpp big-endian compareColors functions */
+compare_colors_fn_t compareColorsFns[3*2*3*3];
 
+
 #ifdef PROFILING
 static uint64_t gettime(void)
 {
@@ -436,3 +439,33 @@
 #endif
 }
 
+/* Entry point for the pixel/colour compare primitive: validates the two source
+ * depths, then runs either a routine registered in compareColorsFns or
+ * genericCompareColors, and returns whatever that routine returns. */
+sqInt compareColorsDispatch(compare_operation_t *op)
+{
+    /* Convert each source depth to log2(bits per pixel); any depth other than
+     * 1, 2, 4, 8, 16 or 32 aborts, with source A being checked before source B. */
+    uint32_t log2bpp[2];
+    uint32_t slot;
+    unsigned int which;
+    for (which = 0; which < 2; which++)
+    {
+        usqInt depth = (which == 0) ? op->srcA.depth : op->srcB.depth;
+        uint32_t shift = 0;
+        while (shift <= 5 && depth != ((usqInt) 1 << shift))
+            shift++;
+        if (shift > 5)
+            abort();
+        log2bpp[which] = shift;
+    }
+    /* The table only has slots for sources with msb set and 8bpp or more, laid
+     * out as [matchRule][tally][log2bppA - 3][log2bppB - 3]. */
+    if (log2bpp[0] >= 3 && log2bpp[1] >= 3 && op->srcA.msb && op->srcB.msb)
+    {
+        slot = (((op->matchRule * 2) + op->tally) * 3 + (log2bpp[0] - 3)) * 3 + (log2bpp[1] - 3);
+        return compareColorsFns[slot](op, log2bpp[0], log2bpp[1]);
+    }
+    /* These cases aren't catered for by the function table */
+    return genericCompareColors(op, log2bpp[0], log2bpp[1]);
+}

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h	2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h	2014-11-21 03:38:51 UTC (rev 3139)
@@ -82,6 +82,13 @@
 }
 combination_rule_t;
 
+typedef enum {
+    MR_pixelMatch, /* 0: pixel A == colorA and pixel B == colorB */
+    MR_notAnotB,   /* 1: pixel A != colorA and pixel B != colorB */
+    MR_notAmatchB, /* 2: pixel A != colorA and pixel B == colorB */
+}
+match_rule_t;
+
 typedef struct {
 	void  *bits;
 	usqInt depth;
@@ -119,7 +126,24 @@
 }
 operation_t;
 
+typedef struct {
+    match_rule_t  matchRule; /* which test is applied to each pair of pixels */
+    bool          tally;     /* true: count every matching pair; false: stop at the first match */
+    src_or_dest_t srcA;      /* first pixel source (tested against colorA) */
+    src_or_dest_t srcB;      /* second pixel source (tested against colorB) */
+    usqInt        width;     /* pixels compared per row */
+    usqInt        height;    /* number of rows compared */
+    usqInt        colorA;    /* pixel value that srcA pixels are tested against */
+    usqInt        colorB;    /* pixel value that srcB pixels are tested against */
+}
+compare_operation_t;
+
+typedef usqInt (*compare_colors_fn_t)(compare_operation_t *op, usqInt log2bppA, usqInt log2bppB);
+
+extern compare_colors_fn_t compareColorsFns[3*2*3*3];
+
 void initialiseCopyBits(void);
 void copyBitsDispatch(operation_t *op);
+sqInt compareColorsDispatch(compare_operation_t *op);
 
 #endif /* BITBLTDISPATCH_H_ */

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c	2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c	2014-11-21 03:38:51 UTC (rev 3139)
@@ -456,8 +456,96 @@
 		{ fastPathDepthConv,             CR_any,             FAST_PATH_SRC_0BPP | FAST_PATH_SRC_1BPP  | ONLY_DEST_1BPP },
 };
 
+/* Compare one row of `width` pixel pairs; returns how many satisfy matchRule, or 1 as
+ * soon as one does when tally is false. ptrA/ptrB address the words holding the first
+ * pixels and pixelIndexes packs each first pixel's index within its word (A: bits 0-4,
+ * B: bits 27-31). bppX is the depth in bits, ppwX the pixels per 32-bit word, and msbX
+ * says whether the lowest-indexed pixel occupies the most significant bits of the word. */
+static uint32_t genericCompareRow(uint32_t width, const uint32_t *ptrA, const uint32_t *ptrB,
+                                  uint32_t colorA, uint32_t colorB, uint32_t pixelIndexes,
+                                  match_rule_t matchRule, bool tally, uint32_t bppA, uint32_t bppB,
+                                  uint32_t ppwA, uint32_t ppwB, bool msbA, bool msbB)
+{
+    uint32_t count = 0;
+    uint32_t maskA = 0xFFFFFFFFu >> (32 - bppA); /* one-pixel mask; (1<<bpp)-1 is undefined at 32bpp */
+    uint32_t maskB = 0xFFFFFFFFu >> (32 - bppB);
+    uint32_t a32 = *ptrA++;
+    uint32_t b32 = *ptrB++;
+    if (msbA)
+        a32 <<= bppA * (pixelIndexes & 0x1F);
+    else
+        a32 >>= bppA * (pixelIndexes & 0x1F);
+    if (msbB)
+        b32 <<= bppB * (pixelIndexes >> 27);
+    else
+        b32 >>= bppB * (pixelIndexes >> 27);
+    while (width > 0)
+    {
+        uint32_t a = msbA ? a32 >> (32-bppA) : a32 & maskA;
+        uint32_t b = msbB ? b32 >> (32-bppB) : b32 & maskB;
+        uint32_t nextPixelIndexes;
+        if (matchRule == MR_pixelMatch)
+            count += a == colorA && b == colorB;
+        else if (matchRule == MR_notAnotB)
+            count += a != colorA && b != colorB;
+        else // MR_notAmatchB
+            count += a != colorA && b == colorB;
+        if (count && !tally)
+            return count;
+        if (--width == 0)
+            break;
+        nextPixelIndexes = pixelIndexes + 1 + (1<<27);
+        /* Source A: fetch a fresh word at a word boundary, otherwise step on to the next pixel */
+        if (nextPixelIndexes & ppwA)
+        {
+            a32 = *ptrA++;
+            nextPixelIndexes -= ppwA;
+        }
+        else if (msbA)
+            a32 <<= bppA;
+        else
+            a32 >>= bppA;
+        /* Source B likewise; at 32 pixels per word the 5-bit index wraps rather than setting a carry bit */
+        if (ppwB == 32 ? nextPixelIndexes < pixelIndexes : (nextPixelIndexes & (ppwB<<27)) != 0)
+        {
+            b32 = *ptrB++;
+            if (ppwB != 32)
+                nextPixelIndexes -= ppwB<<27;
+        }
+        else if (msbB)
+            b32 <<= bppB;
+        else
+            b32 >>= bppB;
+        pixelIndexes = nextPixelIndexes;
+    }
+    return count;
+}
+
+/* Generic compareColors for any depth or pixel order: a simple per-row loop, never going to be
+ * especially fast. Returns the number of matches, or stops at the first match when !tally. */
+uint32_t genericCompareColors(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB)
+{
+    uint32_t count = 0, pixelIndexes;
+    COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t);
+    srcABits += srcAY * srcAPitch + (srcAX >> (5 - log2bppA)); /* parenthesised: '+' binds tighter than '>>' */
+    srcBBits += srcBY * srcBPitch + (srcBX >> (5 - log2bppB));
+    pixelIndexes = (srcAX & ((32 >> log2bppA) - 1)) + ((srcBX & ((32 >> log2bppB) - 1)) << 27); /* x mod pixels-per-word */
+    while (height--)
+    {
+        count += genericCompareRow(width, srcABits, srcBBits, colorA, colorB, pixelIndexes,
+                    matchRule, tally, srcADepth, srcBDepth, 32 >> log2bppA, 32 >> log2bppB, srcAMSB, srcBMSB);
+        if (count && !tally)
+            return count;
+        srcABits += srcAPitch;
+        srcBBits += srcBPitch;
+    }
+    return count;
+}
+
 void addGenericFastPaths(void)
 {
+    int i;
 	addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths);
+	for (i = 0; i < sizeof compareColorsFns / sizeof *compareColorsFns; i++)
+	    compareColorsFns[i] = genericCompareColors;
 }
-

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h	2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h	2014-11-21 03:38:51 UTC (rev 3139)
@@ -27,5 +27,6 @@
 #define BITBLTGENERIC_H_
 
 void addGenericFastPaths(void);
+uint32_t genericCompareColors(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB);
 
 #endif /* BITBLTGENERIC_H_ */

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h	2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h	2014-11-21 03:38:51 UTC (rev 3139)
@@ -125,7 +125,7 @@
 /** This macro basically tells the compiler that the pointer to the
  * "op" structure doesn't alias with any other pointers. I'd use the
  * restrict keyword instead, but Squeak is built C89. */
-#define COPY_OP_TO_LOCALS(op, src_type, dest_type)                      \
+#define COPY_OP_TO_LOCALS(op, src_type, dest_type)                             \
 	combination_rule_t combinationRule  = op->combinationRule;                 \
 	bool               noSource         = op->noSource;                        \
 	src_type          *srcBits          = op->src.bits;                        \
@@ -175,7 +175,45 @@
 	IGNORE(halftoneHeight);  \
 	IGNORE(halftoneBase);    \
 
+#define COPY_COMPARE_OP_TO_LOCALS(op, srcA_type, srcB_type) /* copy *op's fields to locals */ \
+    match_rule_t       matchRule        = op->matchRule;                       \
+    bool               tally            = op->tally;                           \
+    srcA_type         *srcABits         = op->srcA.bits;                       \
+    uint32_t           srcADepth        = op->srcA.depth;                      \
+    uint32_t           srcAPitch        = op->srcA.pitch / sizeof (srcA_type); /* pitch in srcA_type units */ \
+    bool               srcAMSB          = op->srcA.msb;                        \
+    uint32_t           srcAX            = op->srcA.x;                          \
+    uint32_t           srcAY            = op->srcA.y;                          \
+    srcB_type         *srcBBits         = op->srcB.bits;                       \
+    uint32_t           srcBDepth        = op->srcB.depth;                      \
+    uint32_t           srcBPitch        = op->srcB.pitch / sizeof (srcB_type); /* pitch in srcB_type units */ \
+    bool               srcBMSB          = op->srcB.msb;                        \
+    uint32_t           srcBX            = op->srcB.x;                          \
+    uint32_t           srcBY            = op->srcB.y;                          \
+    uint32_t           width            = op->width;                           \
+    uint32_t           height           = op->height;                          \
+    uint32_t           colorA           = op->colorA;                          \
+    uint32_t           colorB           = op->colorB;                          \
+    IGNORE(matchRule); /* every local goes through IGNORE - presumably to silence unused-variable warnings; confirm IGNORE's definition */ \
+    IGNORE(tally);     \
+    IGNORE(srcABits);  \
+    IGNORE(srcADepth); \
+    IGNORE(srcAPitch); \
+    IGNORE(srcAMSB);   \
+    IGNORE(srcAX);     \
+    IGNORE(srcAY);     \
+    IGNORE(srcBBits);  \
+    IGNORE(srcBDepth); \
+    IGNORE(srcBPitch); \
+    IGNORE(srcBMSB);   \
+    IGNORE(srcBX);     \
+    IGNORE(srcBY);     \
+    IGNORE(width);     \
+    IGNORE(height);    \
+    IGNORE(colorA);    \
+    IGNORE(colorB);    \
 
+
 typedef struct {
 	void             (*func)(operation_t *, uint32_t);
 	combination_rule_t combinationRule;



More information about the Vm-dev mailing list