[Vm-dev] [commit][3139] update some ARM specific bitblt code for
the new pixel/color compare prim
commits at squeakvm.org
commits at squeakvm.org
Fri Nov 21 03:38:53 UTC 2014
Revision: 3139
Author: rowledge
Date: 2014-11-20 19:38:51 -0800 (Thu, 20 Nov 2014)
Log Message:
-----------
update some ARM specific bitblt code for the new pixel/color compare prim
Modified Paths:
--------------
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h
Added Paths:
-----------
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c 2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c 2014-11-21 03:38:51 UTC (rev 3139)
@@ -257,8 +257,57 @@
{ fastPathBitAnd32_32, CR_bitAnd, STD_FLAGS(32,32,NO,NO) },
};
+/* TALLY_FAST_PATH(op, srcA_bpp, srcB_bpp)
+ *
+ * Declares three external routines, armSimd<op>Tally<srcB_bpp>_<srcA_bpp>_wide,
+ * _narrow and _tiny (note that the B depth comes first in those names), and
+ * defines the static wrapper tallyFastPath<op><srcA_bpp>_<srcB_bpp> that
+ * forwards a compare_operation_t to one of them.
+ * NOTE(review): the external routines are presumably the ones emitted by
+ * GenerateFunctions in BitBltArmSimdCompare.s - confirm.
+ *
+ * The macro argument "op" is also substituted into the wrapper's parameter
+ * name and into COPY_COMPARE_OP_TO_LOCALS, so inside an expansion the
+ * operation pointer is actually named after the operation (e.g. pixelMatch).
+ *
+ * The wrapper:
+ *  - computes pointers to the first word of each source;
+ *  - records each source's starting pixel offset within its word, either in
+ *    bitPtrs (A in the low bits, B from bit 27 up) for depths below 8bpp, or
+ *    in the top two bits of the pitch for 8bpp and 16bpp;
+ *  - reduces each pitch by the number of words touched per row;
+ *  - selects the wide, narrow or tiny routine once for the whole operation
+ *    and returns its result.
+ */
+#define TALLY_FAST_PATH(op, srcA_bpp, srcB_bpp) \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+static uint32_t tallyFastPath##op##srcA_bpp##_##srcB_bpp(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB) \
+{ \
+ IGNORE(log2bppA); \
+ IGNORE(log2bppB); \
+ COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t); \
+ /* Get pointers to initial words */ \
+ uint32_t *srcA = srcABits + srcAPitch * srcAY + srcAX * srcA_bpp / 32; \
+ uint32_t *srcB = srcBBits + srcBPitch * srcBY + srcBX * srcB_bpp / 32; \
+ /* Get initial pixel offset within words, mangle into pitch if possible */ \
+ uint32_t bitPtrs = 0; \
+ uint32_t srcAXpix = srcAX & (31 / srcA_bpp); \
+ if (srcA_bpp < 8) \
+ bitPtrs = srcAXpix; \
+ else if (srcA_bpp == 8 || srcA_bpp == 16) \
+ srcAPitch |= srcAXpix << 30; \
+ uint32_t srcBXpix = srcBX & (31 / srcB_bpp); \
+ if (srcB_bpp < 8) \
+ bitPtrs |= srcBXpix << 27; \
+ else if (srcB_bpp == 8 || srcB_bpp == 16) \
+ srcBPitch |= srcBXpix << 30; \
+ /* Adjust strides to remove number of words partially or wholly read/written */ \
+ srcAPitch -= (srcA_bpp * (srcAXpix + width) + 31) / 32; \
+ srcBPitch -= (srcB_bpp * (srcBXpix + width) + 31) / 32; \
+ /* Work out which width class this operation is. \
+ * Rather than re-evaluate this for each line, we want one choice \
+ * for the whole operation; this means we can't assume anything about \
+ * alignment to sizes larger than 4 bytes, because that's the only \
+ * guarantee we have about line stride. */ \
+ if (width > (128-32)/srcA_bpp && (((srcAXpix-1) ^ (srcAXpix+width-(128-32)/srcA_bpp)) &~ (31/srcA_bpp))) \
+ return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs); \
+ else if (srcA_bpp > 8 || (((srcAXpix-1) ^ (srcAXpix+width)) &~ (31/srcA_bpp))) \
+ return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs); \
+ else \
+ return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs); \
+}
+
+/* ADD_TALLY_FN(op, srcA_bpp, srcB_bpp)
+ *
+ * Stores tallyFastPath<op><srcA_bpp>_<srcB_bpp> into compareColorsFns.
+ * The slot is computed the same way compareColorsDispatch indexes the table,
+ * ((matchRule * 2 + tally) * 3 + depthA) * 3 + depthB, with tally fixed at 1
+ * and each depth mapped 8 -> 0, 16 -> 1, anything else -> 2.
+ */
+#define ADD_TALLY_FN(op, srcA_bpp, srcB_bpp) \
+ do { compareColorsFns[(((MR_##op * 2) + 1) * 3 + \
+ (srcA_bpp == 8 ? 0 : srcA_bpp == 16 ? 1 : 2)) * 3 + \
+ (srcB_bpp == 8 ? 0 : srcB_bpp == 16 ? 1 : 2)] = \
+ tallyFastPath##op##srcA_bpp##_##srcB_bpp; } while(0)
+
+/* Define tallyFastPathpixelMatch32_32: the pixelMatch tally wrapper for two 32bpp sources */
+TALLY_FAST_PATH(pixelMatch, 32, 32)
+
+/* Register the fast paths listed in fastPaths[] and install the 32bpp/32bpp
+ * pixelMatch tally routine in compareColorsFns.
+ * NOTE(review): addGenericFastPaths() overwrites every compareColorsFns slot,
+ * so this presumably has to run after it - confirm the call order. */
void addArmSimdFastPaths(void)
{
addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths);
+
+ /* Replace the generic handler for tallying pixelMatch on two 32bpp sources */
+ ADD_TALLY_FN(pixelMatch, 32, 32);
}
-
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr 2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr 2014-11-21 03:38:51 UTC (rev 3139)
@@ -41,18 +41,20 @@
FLAG_DST_WRITEONLY * 0 :SHL: 3
FLAG_DST_READWRITE * 1 :SHL: 3
-FLAG_SPILL_NO_LINE_VARS * 0 :SHL: 4
-FLAG_SPILL_LINE_VARS_WIDE * 1 :SHL: 4
-FLAG_SPILL_LINE_VARS_NON_WIDE * 2 :SHL: 4
-FLAG_SPILL_LINE_VARS * 3 :SHL: 4
-FLAG_EXPAND_SKEW * 0 :SHL: 6
-FLAG_NO_EXPAND_SKEW * 1 :SHL: 6
-FLAG_PROCESS_SERIAL * 0 :SHL: 7 ; sub-word data is presented MS-aligned, and results are expected LS-aligned
-FLAG_PROCESS_PARALLEL * 1 :SHL: 7 ; sub-word data retains its original alignment throughout (only useful if src & dest depths same)
-FLAG_MAX_128BIT_MACRO * 0 :SHL: 8
-FLAG_MAX_256BIT_MACRO * 1 :SHL: 8 ; particularly tight loops can sometimes benefit from being unrolled to allow 2x 128-bit blocks to be staggered
-FLAG_PRELOAD_DST * 0 :SHL: 9
-FLAG_NO_PRELOAD_DST * 1 :SHL: 9
+FLAG_DST_READONLY * 2 :SHL: 3
+FLAG_DST_ACCESS * 3 :SHL: 3
+FLAG_SPILL_NO_LINE_VARS * 0 :SHL: 5
+FLAG_SPILL_LINE_VARS_WIDE * 1 :SHL: 5
+FLAG_SPILL_LINE_VARS_NON_WIDE * 2 :SHL: 5
+FLAG_SPILL_LINE_VARS * 3 :SHL: 5
+FLAG_EXPAND_SKEW * 0 :SHL: 7
+FLAG_NO_EXPAND_SKEW * 1 :SHL: 7
+FLAG_PROCESS_SERIAL * 0 :SHL: 8 ; sub-word data is presented MS-aligned, and results are expected LS-aligned
+FLAG_PROCESS_PARALLEL * 1 :SHL: 8 ; sub-word data retains its original alignment throughout (only useful if src & dest depths same)
+FLAG_MAX_128BIT_MACRO * 0 :SHL: 9
+FLAG_MAX_256BIT_MACRO * 1 :SHL: 9 ; particularly tight loops can sometimes benefit from being unrolled to allow 2x 128-bit blocks to be staggered
+FLAG_PRELOAD_DST * 0 :SHL: 10
+FLAG_NO_PRELOAD_DST * 1 :SHL: 10
; Offsets into stack
GBLA args_stack_offset
@@ -676,6 +678,7 @@
MACRO
$lab WriteFirstSubWord $base, $data, $pixels, $tmp1, $tmp2
+ ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
; It is assumed that there is at least 1 pixel to write
LCLS reg0
reg0 LookupWk $data
@@ -719,6 +722,7 @@
MACRO
$lab WriteLastSubWord $base, $data, $pixels, $aligned, $tmp1, $tmp2
+ ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
; It is assumed that there is at least 1 pixel to write
LCLS reg0
reg0 LookupWk $data
@@ -810,7 +814,8 @@
reg0 LookupWk $first
Print Data, "Write1Word: %08X @%p\n", $reg0, $base
$lab
- IF (flags :AND: FLAG_DST_READWRITE) > 0
+ ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
+ IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
STR $reg0, [$base, #-4] ; base is assumed previously updated during read
ELSE
STR $reg0, [$base], #4
@@ -829,7 +834,8 @@
]
Print Data, "Write2Words: %08X %08X @%p\n", $reg0, $reg1, $base
$lab
- IF (flags :AND: FLAG_DST_READWRITE) > 0
+ ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
+ IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
STMDB $base, {$reg0, $reg1} ; base is assumed previously updated during read
ELSE
STMIA $base!, {$reg0, $reg1}
@@ -861,7 +867,8 @@
Print Data, "Write4Words: %08X %08X", $reg0, $reg1
Print Data, " %08X %08X @%p\n", $reg2, $reg3, $base
$lab
- IF (flags :AND: FLAG_DST_READWRITE) > 0
+ ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
+ IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
STMDB $base, {$reg0, $reg1, $reg2, $reg3} ; base is assumed previously updated during read
ELSE
STMIA $base!, {$reg0, $reg1, $reg2, $reg3}
@@ -876,11 +883,13 @@
[ dst_w_bpp < 32
ANDS scratch, $pixels, #32/dst_w_bpp - 1
BEQ %FT02
- [ flags :AND: FLAG_DST_READWRITE > 0
+ IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
LDR $wk1, [dst]
- ]
+ ELIF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READONLY
+ LDR $wk1, [dst], #4
+ ENDIF
[ flags :AND: FLAG_PROCESS_PARALLEL = 0
- [ flags :AND: FLAG_DST_READWRITE > 0
+ [ (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_WRITEONLY
MOV scratch, scratch, LSL #dst_bpp_shift
MOV $wk1, $wk1, ROR scratch
]
@@ -895,8 +904,11 @@
01
pow2 SETA pow2 * 2
WEND
+ [ (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
WriteFirstSubWord dst, 1, $pixels, scratch, $wk2
+ ]
|
+ ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY ; Can't really do read-only 32-bit processing without an indication of which bits are valid
$prefix._32bits $wk0, $wk1, $fixed_skew ; and return result in $wk0
WriteFirstSubWord dst, 0, $pixels, scratch, $wk2
]
@@ -945,9 +957,11 @@
[ dst_w_bpp < 32
TST $pixels, #32/dst_w_bpp - 1
BEQ %FT02
- [ flags :AND: FLAG_DST_READWRITE > 0
+ IF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READWRITE
LDR $wk1, [dst]
- ]
+ ELIF (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_READONLY
+ LDR $wk1, [dst], #4
+ ENDIF
[ flags :AND: FLAG_PROCESS_PARALLEL = 0
LCLA pow2
LCLS pow2str
@@ -968,8 +982,11 @@
01
pow2 SETA pow2 / 2
WEND
+ [ (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY
WriteLastSubWord dst, 1, $pixels, ls_aligned, scratch, $wk0
+ ]
|
+ ASSERT (flags :AND: FLAG_DST_ACCESS) <> FLAG_DST_READONLY ; Can't really do read-only 32-bit processing without an indication of which bits are valid
ReadLastSubWord src, 0, carry, $pixels, $fixed_skew, skew, scratch
$prefix._32bits $wk0, $wk1, $fixed_skew ; and return result in $wk0
WriteLastSubWord dst, 0, $pixels, ms_aligned, scratch, $wk1
@@ -1386,7 +1403,7 @@
src_bpp_shift Log2 src_bpp
dst_bpp_shift Log2 dst_w_bpp
- [ flags :AND: FLAG_DST_READWRITE = 0
+ [ (flags :AND: FLAG_DST_ACCESS) = FLAG_DST_WRITEONLY
dst_r_bpp SETA 0
|
dst_r_bpp SETA dst_w_bpp
@@ -1520,7 +1537,7 @@
dst_prefetch_offset SETA 0
]
52
- WHILE dst_prefetch_offset <= 0
+ WHILE dst_prefetch_offset <> 16
subblock SETA 0
WHILE subblock < pix_per_block*dst_w_bpp/128
[ dst_w_bpp > 4 * src_bpp :LAND: src_bpp > 0
@@ -1573,7 +1590,7 @@
WEND
SUBS x, x, #pix_per_block
BHS %BT52
- [ dst_prefetch_offset < 0
+ [ dst_prefetch_offset = -16
B %FT55
54
]
Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s 2014-11-21 03:38:51 UTC (rev 3139)
@@ -0,0 +1,102 @@
+;
+; Copyright © 2014 Raspberry Pi Foundation
+; Copyright © 2014 RISC OS Open Ltd
+;
+; Permission to use, copy, modify, distribute, and sell this software and its
+; documentation for any purpose is hereby granted without fee, provided that
+; the above copyright notice appear in all copies and that both that
+; copyright notice and this permission notice appear in supporting
+; documentation, and that the name of the copyright holders not be used in
+; advertising or publicity pertaining to distribution of the software without
+; specific, written prior permission. The copyright holders make no
+; representations about the suitability of this software for any purpose. It
+; is provided "as is" without express or implied warranty.
+;
+; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+; SOFTWARE.
+;
+
+; Debug options
+ GBLL DebugData
+;DebugData SETL {TRUE}
+ GBLL DebugPld
+;DebugPld SETL {TRUE}
+ GBLL VerboseBuild
+;VerboseBuild SETL {TRUE}
+
+ GET BitBltArmSimdAsm.hdr
+
+ AREA |BitBltArmSimdCompare$$Code|, CODE, READONLY
+ ARM
+
+; We use the two halftone arguments/registers to hold the two comparison colours
+; and the map register to hold the hit count.
+; Source A is referenced by dst/stride_d.
+; Source B is referenced by src/stride_s.
+
+; ********************************************************************
+
+; Start-of-operation hook: clear the hit count, which this file keeps in the
+; map register.
+ MACRO
+ pixelMatchTally32_32_init
+ MOV map, #0
+ MEND
+
+; End-of-operation hook: copy the accumulated hit count from map into a1.
+; NOTE(review): a1 is presumably the function result register here, matching
+; the uint32_t return of the C prototypes - confirm.
+ MACRO
+ pixelMatchTally32_32_cleanup
+ MOV a1, map
+ MEND
+
+; Tally one pair of 32bpp pixels.
+;   $srcA  register holding the pixel from source A (overwritten)
+;   $srcB  register holding the pixel from source B (preserved)
+; map is incremented only when $srcA equals ht and $srcB equals ht_info,
+; i.e. when both pixels match their comparison colours.
+ MACRO
+ pixelMatchTally32_32_1pixel $srcA, $srcB
+ EOR $srcA, $srcA, ht ; zero iff pixel A equals the colour in ht
+ CLZ $srcA, $srcA ; bit 5 set => all bits were 0
+ TEQ $srcB, ht_info ; Z set iff pixel B equals the colour in ht_info
+ ADDEQ map, map, $srcA, LSR #5 ; if B matched, add 1 when A matched too, else 0
+ MEND
+
+; Process a single 32-bit word (one pixel) from each source.
+; Reads one word via dst (source A) and one via src (source B), then tallies
+; the pair held in $wk0 / $wk1.
+; NOTE(review): $wk0 and $wk1 are not parameters of this macro; presumably
+; they are work-register symbols set up by the framework in
+; BitBltArmSimdAsm.hdr, with Read1Word's numeric argument selecting which
+; one receives the data - confirm.
+ MACRO
+ pixelMatchTally32_32_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ pixelMatchTally32_32_1pixel $wk0, $wk1
+ MEND
+
+; Process two 32-bit words (two pixels) from each source.
+; Reads two words via dst (source A) and two via src (source B), then tallies
+; the pairs ($wk0, $wk2) and ($wk1, $wk3).
+; NOTE(review): the $wk registers are presumably framework-defined work
+; registers filled by Read2Words starting at the given index - confirm.
+ MACRO
+ pixelMatchTally32_32_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ pixelMatchTally32_32_1pixel $wk0, $wk2
+ pixelMatchTally32_32_1pixel $wk1, $wk3
+ MEND
+
+; First half of the 4-pixel (128-bit) step: only performs the loads, four
+; words via dst (source A) and four via src (source B). The comparisons are
+; done by the matching _128bits_tail macro.
+ MACRO
+ pixelMatchTally32_32_128bits_head $src, $fixed_skew, $intra_preloads
+ Read4Words dst, 0,, 0
+ Read4Words src, 4, carry, $fixed_skew, skew, scratch
+ MEND
+
+; Second half of the 4-pixel (128-bit) step: tallies the four pixel pairs
+; ($wk0, $wk4) .. ($wk3, $wk7).
+; NOTE(review): presumably $wk0-$wk3 hold source A and $wk4-$wk7 source B as
+; loaded by _128bits_head - confirm against Read4Words.
+ MACRO
+ pixelMatchTally32_32_128bits_tail $src
+ pixelMatchTally32_32_1pixel $wk0, $wk4
+ pixelMatchTally32_32_1pixel $wk1, $wk5
+ pixelMatchTally32_32_1pixel $wk2, $wk6
+ pixelMatchTally32_32_1pixel $wk3, $wk7
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+; Instantiate the pixelMatchTally routines for two 32bpp sources.
+; FLAG_DST_READONLY is used because "dst" carries source A and is only read;
+; the init and cleanup hooks clear and return the hit count respectively.
+pixelMatchTally GenerateFunctions 32, 32,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+ END
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c 2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c 2014-11-21 03:38:51 UTC (rev 3139)
@@ -90,7 +90,10 @@
static const unsigned int maskTable85[4] = { 0xF80000, 0x00F800, 0x0000F8, 0x000000 };
static const int shiftTable85[4] = { -9, -6, -3, 0 };
+/** The dispatch table for >= 8bpp big-endian compareColors functions.
+ * Indexed as ((matchRule * 2 + tally) * 3 + (log2bppA - 3)) * 3 + (log2bppB - 3),
+ * i.e. 3 match rules x 2 tally settings x 3 depths for A x 3 depths for B. */
+compare_colors_fn_t compareColorsFns[3*2*3*3];
+
#ifdef PROFILING
static uint64_t gettime(void)
{
@@ -436,3 +439,33 @@
#endif
}
+/* Translate a pixel depth in bits into its base-2 logarithm.
+ * Any depth other than 1, 2, 4, 8, 16 or 32 aborts the process. */
+static uint32_t log2OfDepth(usqInt depth)
+{
+    uint32_t log2bpp;
+    for (log2bpp = 0; log2bpp <= 5; log2bpp++)
+        if (depth == ((usqInt) 1 << log2bpp))
+            return log2bpp;
+    abort();
+    return 0; /* not reached */
+}
+
+/* Route a colour-compare operation to its handler.
+ * When both sources are at least 8bpp and MSB-first, the handler is looked
+ * up in compareColorsFns; every other combination goes straight to
+ * genericCompareColors. Returns whatever the selected handler returns. */
+sqInt compareColorsDispatch(compare_operation_t *op)
+{
+    uint32_t log2bppA = log2OfDepth(op->srcA.depth);
+    uint32_t log2bppB = log2OfDepth(op->srcB.depth);
+
+    if (log2bppA >= 3 && log2bppB >= 3 && op->srcA.msb && op->srcB.msb)
+    {
+        /* Table layout: [matchRule][tally][log2bppA - 3][log2bppB - 3] */
+        uint32_t slot = op->matchRule * 2 + op->tally;
+        slot = slot * 3 + (log2bppA - 3);
+        slot = slot * 3 + (log2bppB - 3);
+        return compareColorsFns[slot](op, log2bppA, log2bppB);
+    }
+
+    /* These cases aren't catered for by the function table */
+    return genericCompareColors(op, log2bppA, log2bppB);
+}
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h 2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h 2014-11-21 03:38:51 UTC (rev 3139)
@@ -82,6 +82,13 @@
}
combination_rule_t;
+/** Which pixel pairs (a from source A, b from source B) a colour compare
+ * counts, as implemented by genericCompareRow. The numeric values are used
+ * to index compareColorsFns, so they must stay 0, 1, 2. */
+typedef enum {
+ MR_pixelMatch, /* 0: a == colorA && b == colorB */
+ MR_notAnotB, /* 1: a != colorA && b != colorB */
+ MR_notAmatchB, /* 2: a != colorA && b == colorB */
+}
+match_rule_t;
+
typedef struct {
void *bits;
usqInt depth;
@@ -119,7 +126,24 @@
}
operation_t;
+/** Description of one colour-compare request passed to compareColorsDispatch. */
+typedef struct {
+ match_rule_t matchRule; /* which pixel pairs count as a hit */
+ bool tally; /* true: count every hit; false: stop at the first hit */
+ src_or_dest_t srcA; /* first bitmap to examine */
+ src_or_dest_t srcB; /* second bitmap to examine */
+ usqInt width; /* pixels compared per row */
+ usqInt height; /* number of rows compared */
+ usqInt colorA; /* value pixels of srcA are compared against */
+ usqInt colorB; /* value pixels of srcB are compared against */
+}
+compare_operation_t;
+
+/** Signature of a colour-compare handler: takes the operation plus the log2
+ * of each source's depth in bits and returns the number of hits. */
+typedef usqInt (*compare_colors_fn_t)(compare_operation_t *op, usqInt log2bppA, usqInt log2bppB);
+
+/** Handler table consulted by compareColorsDispatch; defined in BitBltDispatch.c. */
+extern compare_colors_fn_t compareColorsFns[3*2*3*3];
+
void initialiseCopyBits(void);
void copyBitsDispatch(operation_t *op);
+sqInt compareColorsDispatch(compare_operation_t *op);
#endif /* BITBLTDISPATCH_H_ */
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c 2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c 2014-11-21 03:38:51 UTC (rev 3139)
@@ -456,8 +456,96 @@
{ fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_1BPP | ONLY_DEST_1BPP },
};
+/**
+ * Compare one row of pixels from two bitmaps and count the hits.
+ *
+ * @param width        number of pixels to compare
+ * @param ptrA         word containing the first pixel of source A
+ * @param ptrB         word containing the first pixel of source B
+ * @param colorA       value pixels of A are compared against
+ * @param colorB       value pixels of B are compared against
+ * @param pixelIndexes index of the first pixel within its word: A's index
+ *                     in bits 0-4, B's index in bits 27-31
+ * @param matchRule    which pixel pairs count as a hit
+ * @param tally        if false, return as soon as one hit is found
+ * @param bppA, bppB   bits per pixel of each source
+ * @param ppwA, ppwB   pixels per 32-bit word of each source (a power of 2)
+ * @param msbA, msbB   true if the first pixel of a word is in its top bits
+ * @return number of hits (at most 1 when tally is false)
+ */
+static uint32_t genericCompareRow(uint32_t width,
+                                  const uint32_t *ptrA,
+                                  const uint32_t *ptrB,
+                                  uint32_t colorA,
+                                  uint32_t colorB,
+                                  uint32_t pixelIndexes,
+                                  match_rule_t matchRule,
+                                  bool tally,
+                                  uint32_t bppA,
+                                  uint32_t bppB,
+                                  uint32_t ppwA,
+                                  uint32_t ppwB,
+                                  bool msbA,
+                                  bool msbB)
+{
+    /* Build the pixel masks without ever shifting by 32, which is undefined */
+    const uint32_t maskA = bppA < 32 ? (1u << bppA) - 1 : 0xFFFFFFFFu;
+    const uint32_t maskB = bppB < 32 ? (1u << bppB) - 1 : 0xFFFFFFFFu;
+    uint32_t count = 0;
+    uint32_t a32;
+    uint32_t b32;
+
+    /* Nothing to compare: don't touch memory at all */
+    if (width == 0)
+        return 0;
+
+    /* Discard index bits beyond one word's worth of pixels so that the
+     * shifts below always stay within the word */
+    pixelIndexes = (pixelIndexes & (ppwA - 1)) | (pixelIndexes & ((ppwB - 1) << 27));
+
+    /* Load the first words and bring the first pixel to the extraction position */
+    a32 = *ptrA++;
+    b32 = *ptrB++;
+    if (msbA)
+        a32 <<= bppA * (pixelIndexes & 0x1F);
+    else
+        a32 >>= bppA * (pixelIndexes & 0x1F);
+    if (msbB)
+        b32 <<= bppB * (pixelIndexes >> 27);
+    else
+        b32 >>= bppB * (pixelIndexes >> 27);
+
+    while (width > 0)
+    {
+        uint32_t a = msbA ? a32 >> (32 - bppA) : a32 & maskA;
+        uint32_t b = msbB ? b32 >> (32 - bppB) : b32 & maskB;
+        uint32_t nextPixelIndexes;
+        bool wrapB;
+
+        if (matchRule == MR_pixelMatch)
+            count += a == colorA && b == colorB;
+        else if (matchRule == MR_notAnotB)
+            count += a != colorA && b != colorB;
+        else /* MR_notAmatchB */
+            count += a != colorA && b == colorB;
+        if (count && !tally)
+            return count;
+
+        /* Stop before fetching so we never read beyond the last word of the row */
+        if (--width == 0)
+            break;
+
+        nextPixelIndexes = pixelIndexes + 1 + (1u << 27);
+
+        /* Source A: fetch a new word when the index wraps, otherwise move
+         * the next pixel of the current word into the extraction position */
+        if (nextPixelIndexes & ppwA)
+        {
+            a32 = *ptrA++;
+            nextPixelIndexes -= ppwA;
+        }
+        else if (msbA)
+            a32 <<= bppA;
+        else
+            a32 >>= bppA;
+
+        /* Source B: same, except that with 32 pixels per word the index
+         * occupies the top 5 bits, so wrapping shows up as 32-bit overflow */
+        if (ppwB == 32)
+            wrapB = nextPixelIndexes < pixelIndexes;
+        else
+            wrapB = (nextPixelIndexes & (ppwB << 27)) != 0;
+        if (wrapB)
+        {
+            b32 = *ptrB++;
+            if (ppwB != 32)
+                nextPixelIndexes -= ppwB << 27;
+        }
+        else if (msbB)
+            b32 <<= bppB;
+        else
+            b32 >>= bppB;
+
+        pixelIndexes = nextPixelIndexes;
+    }
+    return count;
+}
+
+/**
+ * Portable colour-compare routine handling every depth and pixel order.
+ *
+ * @param op        the operation to perform
+ * @param log2bppA  log2 of source A's depth in bits (0..5)
+ * @param log2bppB  log2 of source B's depth in bits (0..5)
+ * @return number of pixel pairs satisfying op->matchRule; when op->tally is
+ *         false the scan stops at the first hit, so the result is 0 or 1
+ */
+uint32_t genericCompareColors(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB)
+{
+    uint32_t count = 0;
+    uint32_t pixelIndexes;
+    /* Pixels per 32-bit word for each source */
+    uint32_t ppwA = 32 >> log2bppA;
+    uint32_t ppwB = 32 >> log2bppB;
+    COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t);
+    /* Advance to the word holding the first pixel. The shift needs its own
+     * parentheses: '+' binds tighter than '>>', so without them the row
+     * offset would be shifted as well. */
+    srcABits += srcAY * srcAPitch + (srcAX >> (5 - log2bppA));
+    srcBBits += srcBY * srcBPitch + (srcBX >> (5 - log2bppB));
+    /* Index of the first pixel within its word: A in bits 0-4, B in bits 27-31.
+     * The mask is pixels-per-word minus 1, not depth minus 1. */
+    pixelIndexes = (srcAX & (ppwA - 1)) + ((srcBX & (ppwB - 1)) << 27);
+    /* This routine is never going to be especially fast, so just use a simple loop */
+    while (height--)
+    {
+        count += genericCompareRow(width, srcABits, srcBBits, colorA, colorB, pixelIndexes,
+                                   matchRule, tally, srcADepth, srcBDepth, ppwA, ppwB, srcAMSB, srcBMSB);
+        if (count && !tally)
+            return count;
+        srcABits += srcAPitch;
+        srcBBits += srcBPitch;
+    }
+    return count;
+}
+
void addGenericFastPaths(void)
{
+ int i;
addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths);
+ for (i = 0; i < sizeof compareColorsFns / sizeof *compareColorsFns; i++)
+ compareColorsFns[i] = genericCompareColors;
}
-
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h 2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h 2014-11-21 03:38:51 UTC (rev 3139)
@@ -27,5 +27,6 @@
#define BITBLTGENERIC_H_
void addGenericFastPaths(void);
+/* Portable colour-compare routine; takes the operation and the log2 of each
+ * source's depth in bits, and returns the number of matching pixel pairs. */
+uint32_t genericCompareColors(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB);
#endif /* BITBLTGENERIC_H_ */
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h 2014-11-21 02:57:22 UTC (rev 3138)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h 2014-11-21 03:38:51 UTC (rev 3139)
@@ -125,7 +125,7 @@
/** This macro basically tells the compiler that the pointer to the
* "op" structure doesn't alias with any other pointers. I'd use the
* restrict keyword instead, but Squeak is built C89. */
-#define COPY_OP_TO_LOCALS(op, src_type, dest_type) \
+#define COPY_OP_TO_LOCALS(op, src_type, dest_type) \
combination_rule_t combinationRule = op->combinationRule; \
bool noSource = op->noSource; \
src_type *srcBits = op->src.bits; \
@@ -175,7 +175,45 @@
IGNORE(halftoneHeight); \
IGNORE(halftoneBase); \
+/** Compare-operation counterpart of COPY_OP_TO_LOCALS: copies every field of
+ * the compare_operation_t pointed to by "op" into local variables.
+ * srcA_type / srcB_type are the element types used for the two bits
+ * pointers; each pitch is divided by the size of that type.
+ * Each local is then passed to IGNORE().
+ * NOTE(review): IGNORE presumably silences unused-variable warnings - confirm.
+ * The macro declares variables, so it must appear where declarations are
+ * allowed. */
+#define COPY_COMPARE_OP_TO_LOCALS(op, srcA_type, srcB_type) \
+ match_rule_t matchRule = op->matchRule; \
+ bool tally = op->tally; \
+ srcA_type *srcABits = op->srcA.bits; \
+ uint32_t srcADepth = op->srcA.depth; \
+ uint32_t srcAPitch = op->srcA.pitch / sizeof (srcA_type); \
+ bool srcAMSB = op->srcA.msb; \
+ uint32_t srcAX = op->srcA.x; \
+ uint32_t srcAY = op->srcA.y; \
+ srcB_type *srcBBits = op->srcB.bits; \
+ uint32_t srcBDepth = op->srcB.depth; \
+ uint32_t srcBPitch = op->srcB.pitch / sizeof (srcB_type); \
+ bool srcBMSB = op->srcB.msb; \
+ uint32_t srcBX = op->srcB.x; \
+ uint32_t srcBY = op->srcB.y; \
+ uint32_t width = op->width; \
+ uint32_t height = op->height; \
+ uint32_t colorA = op->colorA; \
+ uint32_t colorB = op->colorB; \
+ IGNORE(matchRule); \
+ IGNORE(tally); \
+ IGNORE(srcABits); \
+ IGNORE(srcADepth); \
+ IGNORE(srcAPitch); \
+ IGNORE(srcAMSB); \
+ IGNORE(srcAX); \
+ IGNORE(srcAY); \
+ IGNORE(srcBBits); \
+ IGNORE(srcBDepth); \
+ IGNORE(srcBPitch); \
+ IGNORE(srcBMSB); \
+ IGNORE(srcBX); \
+ IGNORE(srcBY); \
+ IGNORE(width); \
+ IGNORE(height); \
+ IGNORE(colorA); \
+ IGNORE(colorB); \
+
typedef struct {
void (*func)(operation_t *, uint32_t);
combination_rule_t combinationRule;
More information about the Vm-dev
mailing list