[Vm-dev] [commit][3188] updates for ARM related fastblt
commits at squeakvm.org
commits at squeakvm.org
Wed Dec 17 01:43:52 UTC 2014
Revision: 3188
Author: rowledge
Date: 2014-12-16 17:43:47 -0800 (Tue, 16 Dec 2014)
Log Message:
-----------
updates for ARM related fastblt
Modified Paths:
--------------
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c
trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c 2014-12-16 23:23:27 UTC (rev 3187)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c 2014-12-17 01:43:47 UTC (rev 3188)
@@ -258,17 +258,17 @@
};
#define TALLY_FAST_PATH(op, srcA_bpp, srcB_bpp) \
-extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
-extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
-extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
-static uint32_t tallyFastPath##op##srcA_bpp##_##srcB_bpp(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB) \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+static uint32_t tallyFastPath##op##srcA_bpp##_##srcB_bpp(const compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB) \
{ \
IGNORE(log2bppA); \
IGNORE(log2bppB); \
COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t); \
/* Get pointers to initial words */ \
- uint32_t *srcA = srcABits + srcAPitch * srcAY + srcAX * srcA_bpp / 32; \
- uint32_t *srcB = srcBBits + srcBPitch * srcBY + srcBX * srcB_bpp / 32; \
+ const uint32_t *srcA = srcABits + srcAPitch * srcAY + srcAX * srcA_bpp / 32; \
+ const uint32_t *srcB = srcBBits + srcBPitch * srcBY + srcBX * srcB_bpp / 32; \
/* Get initial pixel offset within words, mangle into pitch if possible */ \
uint32_t bitPtrs = 0; \
uint32_t srcAXpix = srcAX & (31 / srcA_bpp); \
@@ -297,17 +297,141 @@
return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs); \
}
+#define TEST_FAST_PATH(op, srcA_bpp, srcB_bpp) \
+extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_wide (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_tiny (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+static uint32_t testFastPath##op##srcA_bpp##_##srcB_bpp(const compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB) \
+{ \
+ IGNORE(log2bppA); \
+ IGNORE(log2bppB); \
+ COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t); \
+ /* Early termination is most likely in the centre, so start from the */ \
+ /* middle and work outwards */ \
+ const uint32_t *srcAUp = srcABits + srcAPitch * (srcAY + (height >> 1)) + srcAX * srcA_bpp / 32; \
+ const uint32_t *srcBUp = srcBBits + srcBPitch * (srcBY + (height >> 1)) + srcBX * srcB_bpp / 32; \
+ const uint32_t *srcADown = srcAUp; \
+ const uint32_t *srcBDown = srcBUp; \
+ /* Get initial pixel offset within words, mangle into pitch if possible */ \
+ uint32_t bitPtrs = 0; \
+ uint32_t srcAXpix = srcAX & (31 / srcA_bpp); \
+ if (srcA_bpp < 8) \
+ bitPtrs = srcAXpix; \
+ else if (srcA_bpp == 8 || srcA_bpp == 16) \
+ srcAPitch |= srcAXpix << 30; \
+ uint32_t srcBXpix = srcBX & (31 / srcB_bpp); \
+ if (srcB_bpp < 8) \
+ bitPtrs |= srcBXpix << 27; \
+ else if (srcB_bpp == 8 || srcB_bpp == 16) \
+ srcBPitch |= srcBXpix << 30; \
+ /* Work out which width class this operation is. */ \
+ /* Rather than re-evaluate this for each line, we want one choice for the */ \
+ /* whole operation; this means we can't assume anything about alignment */ \
+ /* to sizes larger than 4 bytes, because that's the only guarantee we */ \
+ /* have about line stride. */ \
+ uint32_t (*testRow)(uint32_t, uint32_t, const uint32_t *, uint32_t, const uint32_t *, uint32_t, uint32_t, uint32_t, void *, uint32_t); \
+ if (width > (128-32)/srcA_bpp && (((srcAXpix-1) ^ (srcAXpix+width-(128-32)/srcA_bpp)) &~ (31/srcA_bpp))) \
+ testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_wide; \
+ else if (srcA_bpp > 8 || (((srcAXpix-1) ^ (srcAXpix+width)) &~ (31/srcA_bpp))) \
+ testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_narrow; \
+ else \
+ testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_tiny; \
+ if (height & 1) \
+ { \
+ height++; \
+ goto odd_number_of_rows_remain; \
+ } \
+ while (height != 0) \
+ { \
+ srcADown -= srcAPitch; \
+ srcBDown -= srcBPitch; \
+ if (testRow(width, 1, srcADown, srcAPitch, srcBDown, srcBPitch, colorA, colorB, 0, bitPtrs)) \
+ return 1; \
+ odd_number_of_rows_remain: \
+ if (testRow(width, 1, srcAUp, srcAPitch, srcBUp, srcBPitch, colorA, colorB, 0, bitPtrs)) \
+ return 1; \
+ srcAUp += srcAPitch; \
+ srcBUp += srcBPitch; \
+ height -= 2; \
+ } \
+ return 0; \
+}
+
#define ADD_TALLY_FN(op, srcA_bpp, srcB_bpp) \
do { compareColorsFns[(((MR_##op * 2) + 1) * 3 + \
(srcA_bpp == 8 ? 0 : srcA_bpp == 16 ? 1 : 2)) * 3 + \
(srcB_bpp == 8 ? 0 : srcB_bpp == 16 ? 1 : 2)] = \
tallyFastPath##op##srcA_bpp##_##srcB_bpp; } while(0)
+#define ADD_TEST_FN(op, srcA_bpp, srcB_bpp) \
+ do { compareColorsFns[(((MR_##op * 2) + 0) * 3 + \
+ (srcA_bpp == 8 ? 0 : srcA_bpp == 16 ? 1 : 2)) * 3 + \
+ (srcB_bpp == 8 ? 0 : srcB_bpp == 16 ? 1 : 2)] = \
+ testFastPath##op##srcA_bpp##_##srcB_bpp; } while(0)
+
TALLY_FAST_PATH(pixelMatch, 32, 32)
+TALLY_FAST_PATH(notAnotB, 32, 32)
+TALLY_FAST_PATH(notAmatchB, 32, 32)
+TEST_FAST_PATH(pixelMatch, 32, 32)
+TEST_FAST_PATH(notAnotB, 32, 32)
+TEST_FAST_PATH(notAmatchB, 32, 32)
+TALLY_FAST_PATH(pixelMatch, 16, 16)
+TALLY_FAST_PATH(notAnotB, 16, 16)
+TALLY_FAST_PATH(notAmatchB, 16, 16)
+TEST_FAST_PATH(pixelMatch, 16, 16)
+TEST_FAST_PATH(notAnotB, 16, 16)
+TEST_FAST_PATH(notAmatchB, 16, 16)
+
+TALLY_FAST_PATH(pixelMatch, 16, 32)
+TALLY_FAST_PATH(notAnotB, 16, 32)
+TALLY_FAST_PATH(notAmatchB, 16, 32)
+TEST_FAST_PATH(pixelMatch, 16, 32)
+TEST_FAST_PATH(notAnotB, 16, 32)
+TEST_FAST_PATH(notAmatchB, 16, 32)
+
+TALLY_FAST_PATH(notAmatchB, 32, 16)
+TEST_FAST_PATH(notAmatchB, 32, 16)
+
+TALLY_FAST_PATH(pixelMatch, 8, 8)
+TALLY_FAST_PATH(notAnotB, 8, 8)
+TALLY_FAST_PATH(notAmatchB, 8, 8)
+TEST_FAST_PATH(pixelMatch, 8, 8)
+TEST_FAST_PATH(notAnotB, 8, 8)
+TEST_FAST_PATH(notAmatchB, 8, 8)
+
void addArmSimdFastPaths(void)
{
addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths);
- ADD_TALLY_FN(pixelMatch, 32, 32);
+ ADD_TALLY_FN(pixelMatch, 32, 32);
+ ADD_TALLY_FN(notAnotB, 32, 32);
+ ADD_TALLY_FN(notAmatchB, 32, 32);
+ ADD_TEST_FN(pixelMatch, 32, 32);
+ ADD_TEST_FN(notAnotB, 32, 32);
+ ADD_TEST_FN(notAmatchB, 32, 32);
+
+ ADD_TALLY_FN(pixelMatch, 16, 16);
+ ADD_TALLY_FN(notAnotB, 16, 16);
+ ADD_TALLY_FN(notAmatchB, 16, 16);
+ ADD_TEST_FN(pixelMatch, 16, 16);
+ ADD_TEST_FN(notAnotB, 16, 16);
+ ADD_TEST_FN(notAmatchB, 16, 16);
+
+ ADD_TALLY_FN(pixelMatch, 16, 32);
+ ADD_TALLY_FN(notAnotB, 16, 32);
+ ADD_TALLY_FN(notAmatchB, 16, 32);
+ ADD_TEST_FN(pixelMatch, 16, 32);
+ ADD_TEST_FN(notAnotB, 16, 32);
+ ADD_TEST_FN(notAmatchB, 16, 32);
+
+ ADD_TALLY_FN(notAmatchB, 32, 16);
+ ADD_TEST_FN(notAmatchB, 32, 16);
+
+ ADD_TALLY_FN(pixelMatch, 8, 8);
+ ADD_TALLY_FN(notAnotB, 8, 8);
+ ADD_TALLY_FN(notAmatchB, 8, 8);
+ ADD_TEST_FN(pixelMatch, 8, 8);
+ ADD_TEST_FN(notAnotB, 8, 8);
+ ADD_TEST_FN(notAmatchB, 8, 8);
}
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr 2014-12-16 23:23:27 UTC (rev 3187)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr 2014-12-17 01:43:47 UTC (rev 3188)
@@ -1484,7 +1484,7 @@
]
EXPORT armSimd$prefix._wide
-armSimd$prefix._wide
+armSimd$prefix._wide ROUT
[ src_bpp > 0 :LOR: dst_r_bpp > 0
; Check whether this is actually a medium-width operation
; (decision made here rather than in C due to availability of
@@ -1498,14 +1498,14 @@
BLO armSimd$prefix._medium
FunctionPrologue WIDE, (prefetch_distance+2)*pix_per_block
51
- PreloadLeadingStep1 $src_bpp, $preload_src, src
+ PreloadLeadingStep1 src_bpp, $preload_src, src
[ flags :AND: FLAG_NO_PRELOAD_DST = 0
- PreloadLeadingStep1 $dst_r_bpp, $preload_dst, dst
+ PreloadLeadingStep1 dst_r_bpp, $preload_dst, dst
]
CalculateLeadingPixels
- PreloadLeadingStep2 $src_bpp, $src_bpp_shift, $preload_src, src, $tmp_leading_pixels, scratch
+ PreloadLeadingStep2 src_bpp, src_bpp_shift, $preload_src, src, $tmp_leading_pixels, scratch
[ flags :AND: FLAG_NO_PRELOAD_DST = 0
- PreloadLeadingStep2 $dst_r_bpp, $dst_bpp_shift, $preload_dst, dst, $tmp_leading_pixels, scratch
+ PreloadLeadingStep2 dst_r_bpp, dst_bpp_shift, $preload_dst, dst, $tmp_leading_pixels, scratch
]
CalculateSkew
[ "$newline" <> ""
@@ -1556,7 +1556,7 @@
[ src_bpp > 0
[ flags & FLAG_MAX_256BIT_MACRO > 0
; prefetch distance = 256/bpp, block distance = 256/dst_w_bpp
-do_preload IsEndOfGroup subblock, 256/256*dst_w_bpp/src_bpp
+do_preload IsEndOfGroup subblock/2, 256/256*dst_w_bpp/src_bpp
|
; prefetch distance = 256/bpp, block distance = 128/dst_w_bpp
do_preload IsEndOfGroup subblock, 256/128*dst_w_bpp/src_bpp
@@ -1607,9 +1607,9 @@
|
ADD x, x, #(prefetch_distance + 2) * pix_per_block - 1
]
- PreloadTrailing $src_bpp, $src_bpp_shift, src, x, &$fixed_skew
+ PreloadTrailing src_bpp, src_bpp_shift, src, x, &$fixed_skew
[ flags :AND: FLAG_NO_PRELOAD_DST = 0
- PreloadTrailing $dst_r_bpp, $dst_bpp_shift, dst, x
+ PreloadTrailing dst_r_bpp, dst_bpp_shift, dst, x
]
SUB x, x, #128/dst_w_bpp - 1
]
@@ -1643,7 +1643,7 @@
LTORG
EXPORT armSimd$prefix._medium
-armSimd$prefix._medium
+armSimd$prefix._medium ROUT
]
FunctionPrologue NON_WIDE, 0
51
@@ -1709,7 +1709,8 @@
LTORG
EXPORT armSimd$prefix._narrow
-armSimd$prefix._narrow FunctionPrologue NON_WIDE, 0
+armSimd$prefix._narrow ROUT
+ FunctionPrologue NON_WIDE, 0
[ src_bpp > 0 :LAND: src_bpp < 32
; Because we're only aiming for 1-word alignment at the destination,
; we can at least have a constant skew for every scanline
@@ -1781,7 +1782,8 @@
[ dst_w_bpp <= 8
EXPORT armSimd$prefix._tiny
-armSimd$prefix._tiny FunctionPrologue NON_WIDE, 0
+armSimd$prefix._tiny ROUT
+ FunctionPrologue NON_WIDE, 0
51
PreloadLine src, src_bpp, src_bpp_shift, scratch, carry
BIC scratch, dst, #31 ; loading dest is unconditional below
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s 2014-12-16 23:23:27 UTC (rev 3187)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s 2014-12-17 01:43:47 UTC (rev 3188)
@@ -83,6 +83,7 @@
MACRO
pixelMatchTally32_32_128bits_tail $src
+ LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
pixelMatchTally32_32_1pixel $wk0, $wk4
pixelMatchTally32_32_1pixel $wk1, $wk5
pixelMatchTally32_32_1pixel $wk2, $wk6
@@ -93,10 +94,2574 @@
; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
pixelMatchTally GenerateFunctions 32, 32,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 3, \
+ "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+ MACRO
+ notAnotBTally32_32_init
+ MOV map, #0
+ MEND
+
+ MACRO
+ notAnotBTally32_32_cleanup
+ MOV a1, map
+ MEND
+
+ MACRO
+ notAnotBTally32_32_1pixel $srcA, $srcB
+ EORS $srcA, $srcA, ht
+ MOVNE $srcA, #1
+ TEQ $srcB, ht_info
+ ADDNE map, map, $srcA
+ MEND
+
+ MACRO
+ notAnotBTally32_32_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ notAnotBTally32_32_1pixel $wk0, $wk1
+ MEND
+
+ MACRO
+ notAnotBTally32_32_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ notAnotBTally32_32_1pixel $wk0, $wk2
+ notAnotBTally32_32_1pixel $wk1, $wk3
+ MEND
+
+ MACRO
+ notAnotBTally32_32_128bits_head $src, $fixed_skew, $intra_preloads
+ Read4Words dst, 0,, 0
+ Read4Words src, 4, carry, $fixed_skew, skew, scratch
+ MEND
+
+ MACRO
+ notAnotBTally32_32_128bits_tail $src
+ LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+ notAnotBTally32_32_1pixel $wk0, $wk4
+ notAnotBTally32_32_1pixel $wk1, $wk5
+ notAnotBTally32_32_1pixel $wk2, $wk6
+ notAnotBTally32_32_1pixel $wk3, $wk7
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAnotBTally GenerateFunctions 32, 32,, \
FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
"y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
"x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
; ********************************************************************
+ MACRO
+ notAmatchBTally32_32_init
+ MOV map, #0
+ MEND
+
+ MACRO
+ notAmatchBTally32_32_cleanup
+ MOV a1, map
+ MEND
+
+ MACRO
+ notAmatchBTally32_32_1pixel $srcA, $srcB
+ EORS $srcA, $srcA, ht
+ MOVNE $srcA, #1
+ TEQ $srcB, ht_info
+ ADDEQ map, map, $srcA
+ MEND
+
+ MACRO
+ notAmatchBTally32_32_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ notAmatchBTally32_32_1pixel $wk0, $wk1
+ MEND
+
+ MACRO
+ notAmatchBTally32_32_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ notAmatchBTally32_32_1pixel $wk0, $wk2
+ notAmatchBTally32_32_1pixel $wk1, $wk3
+ MEND
+
+ MACRO
+ notAmatchBTally32_32_128bits_head $src, $fixed_skew, $intra_preloads
+ Read4Words dst, 0,, 0
+ Read4Words src, 4, carry, $fixed_skew, skew, scratch
+ MEND
+
+ MACRO
+ notAmatchBTally32_32_128bits_tail $src
+ LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+ notAmatchBTally32_32_1pixel $wk0, $wk4
+ notAmatchBTally32_32_1pixel $wk1, $wk5
+ notAmatchBTally32_32_1pixel $wk2, $wk6
+ notAmatchBTally32_32_1pixel $wk3, $wk7
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAmatchBTally GenerateFunctions 32, 32,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+ MACRO
+ pixelMatchTest32_32_cleanup
+ MOV a1, #0
+ B %FT95
+90 ADD sp, sp, #num_line_saved_regs * 4
+ MOV a1, #1
+95
+ MEND
+
+ MACRO
+ pixelMatchTest32_32_1pixel $srcA, $srcB
+ TEQ $srcA, ht
+ TEQEQ $srcB, ht_info
+ BEQ %FA90
+ MEND
+
+ MACRO
+ pixelMatchTest32_32_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ pixelMatchTest32_32_1pixel $wk0, $wk1
+ MEND
+
+ MACRO
+ pixelMatchTest32_32_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ pixelMatchTest32_32_1pixel $wk0, $wk2
+ pixelMatchTest32_32_1pixel $wk1, $wk3
+ MEND
+
+ MACRO
+ pixelMatchTest32_32_128bits_head $src, $fixed_skew, $intra_preloads
+ Read4Words dst, 0,, 0
+ Read4Words src, 4, carry, $fixed_skew, skew, scratch
+ MEND
+
+ MACRO
+ pixelMatchTest32_32_128bits_tail $src
+ LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+ pixelMatchTest32_32_1pixel $wk0, $wk4
+ pixelMatchTest32_32_1pixel $wk1, $wk5
+ pixelMatchTest32_32_1pixel $wk2, $wk6
+ pixelMatchTest32_32_1pixel $wk3, $wk7
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+pixelMatchTest GenerateFunctions 32, 32,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+ "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+ MACRO
+ notAnotBTest32_32_cleanup
+ MOV a1, #0
+ B %FT95
+90 ADD sp, sp, #num_line_saved_regs * 4
+ MOV a1, #1
+95
+ MEND
+
+ MACRO
+ notAnotBTest32_32_1pixel $srcA, $srcB
+ TEQ $srcA, ht
+ TEQNE $srcB, ht_info
+ BNE %FA90
+ MEND
+
+ MACRO
+ notAnotBTest32_32_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ notAnotBTest32_32_1pixel $wk0, $wk1
+ MEND
+
+ MACRO
+ notAnotBTest32_32_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ notAnotBTest32_32_1pixel $wk0, $wk2
+ notAnotBTest32_32_1pixel $wk1, $wk3
+ MEND
+
+ MACRO
+ notAnotBTest32_32_128bits_head $src, $fixed_skew, $intra_preloads
+ Read4Words dst, 0,, 0
+ Read4Words src, 4, carry, $fixed_skew, skew, scratch
+ MEND
+
+ MACRO
+ notAnotBTest32_32_128bits_tail $src
+ LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+ notAnotBTest32_32_1pixel $wk0, $wk4
+ notAnotBTest32_32_1pixel $wk1, $wk5
+ notAnotBTest32_32_1pixel $wk2, $wk6
+ notAnotBTest32_32_1pixel $wk3, $wk7
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAnotBTest GenerateFunctions 32, 32,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 3, \
+ "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+ "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+ MACRO
+ notAmatchBTest32_32_cleanup
+ MOV a1, #0
+ B %FT95
+90 ADD sp, sp, #num_line_saved_regs * 4
+ MOV a1, #1
+95
+ MEND
+
+ MACRO
+ notAmatchBTest32_32_1pixel $srcA, $srcB
+ TEQ $srcB, ht_info
+ MOVNE $srcA, ht
+ TEQ $srcA, ht
+ BNE %FA90
+ MEND
+
+ MACRO
+ notAmatchBTest32_32_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ notAmatchBTest32_32_1pixel $wk0, $wk1
+ MEND
+
+ MACRO
+ notAmatchBTest32_32_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ notAmatchBTest32_32_1pixel $wk0, $wk2
+ notAmatchBTest32_32_1pixel $wk1, $wk3
+ MEND
+
+ MACRO
+ notAmatchBTest32_32_128bits_head $src, $fixed_skew, $intra_preloads
+ Read4Words dst, 0,, 0
+ Read4Words src, 4, carry, $fixed_skew, skew, scratch
+ MEND
+
+ MACRO
+ notAmatchBTest32_32_128bits_tail $src
+ LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+ notAmatchBTest32_32_1pixel $wk0, $wk4
+ notAmatchBTest32_32_1pixel $wk1, $wk5
+ notAmatchBTest32_32_1pixel $wk2, $wk6
+ notAmatchBTest32_32_1pixel $wk3, $wk7
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAmatchBTest GenerateFunctions 32, 32,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+ "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+ MACRO
+ pixelMatchTally16_16_init
+ LDR bitptrs, =0x00010001
+ MOV map, #0
+ PKHBT ht, ht, LSL #16 ; replicate the constant colours across words
+ PKHBT ht_info, ht_info, LSL #16
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_cleanup
+ MOV a1, map
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_1pixel $srcA, $srcB, $zeros, $ones
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ USUB16 $srcA, $zeros, $srcA
+ SEL $srcA, $ones, $zeros
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $ones, $zeros
+ AND $srcA, $srcA, $srcB
+ UXTAH map, map, $srcA, ROR #16
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ USUB16 $srcA, $zeros, $srcA
+ SEL $srcA, $ones, $zeros
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $ones, $zeros
+ [ $first :LAND: $last ; avoid touching wk4
+ AND $srcA, $srcA, $srcB
+ UXTAH map, map, $srcA
+ UXTAH map, map, $srcA, ROR #16
+ |
+ [ $first
+ AND $wk4, $srcA, $srcB
+ |
+ AND $srcA, $srcA, $srcB
+ ADD $wk4, $wk4, $srcA
+ ]
+ [ $last
+ UXTAH map, map, $wk4
+ UXTAH map, map, $wk4, ROR #16
+ ]
+ ]
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+ pixelMatchTally16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones
+ pixelMatchTally16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_16bits $src, $dst, $fixed_skew
+ MOV scratch, #0
+ pixelMatchTally16_16_1pixel $dst, $src, scratch, bitptrs
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs
+ pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_128bits_head $src, $fixed_skew, $intra_preloads
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ pixelMatchTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MEND
+
+ MACRO
+ pixelMatchTally16_16_128bits_tail $src
+ MOV scratch, #0
+ pixelMatchTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+pixelMatchTally GenerateFunctions 16, 16,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,skew,orig_w", \
+ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+ MACRO
+ notAnotBTally16_16_init
+ LDR bitptrs, =0x00010001
+ MOV map, #0
+ PKHBT ht, ht, LSL #16 ; replicate the constant colours across words
+ PKHBT ht_info, ht_info, LSL #16
+ MEND
+
+ MACRO
+ notAnotBTally16_16_cleanup
+ MOV a1, map
+ MEND
+
+ MACRO
+ notAnotBTally16_16_1pixel $srcA, $srcB, $zeros, $ones
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ USUB16 $srcA, $zeros, $srcA
+ SEL $srcA, $zeros, $ones
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $zeros, $ones
+ AND $srcA, $srcA, $srcB
+ UXTAH map, map, $srcA, ROR #16
+ MEND
+
+ MACRO
+ notAnotBTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ USUB16 $srcA, $zeros, $srcA
+ SEL $srcA, $zeros, $ones
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $zeros, $ones
+ [ $first :LAND: $last ; avoid touching wk4
+ AND $srcA, $srcA, $srcB
+ UXTAH map, map, $srcA
+ UXTAH map, map, $srcA, ROR #16
+ |
+ [ $first
+ AND $wk4, $srcA, $srcB
+ |
+ AND $srcA, $srcA, $srcB
+ ADD $wk4, $wk4, $srcA
+ ]
+ [ $last
+ UXTAH map, map, $wk4
+ UXTAH map, map, $wk4, ROR #16
+ ]
+ ]
+ MEND
+
+ MACRO
+ notAnotBTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+ notAnotBTally16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones
+ notAnotBTally16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones
+ MEND
+
+ MACRO
+ notAnotBTally16_16_16bits $src, $dst, $fixed_skew
+ MOV scratch, #0
+ notAnotBTally16_16_1pixel $dst, $src, scratch, bitptrs
+ MEND
+
+ MACRO
+ notAnotBTally16_16_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs
+ MEND
+
+ MACRO
+ notAnotBTally16_16_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs
+ notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs
+ MEND
+
+ MACRO
+ notAnotBTally16_16_128bits_head $src, $fixed_skew, $intra_preloads
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ notAnotBTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MEND
+
+ MACRO
+ notAnotBTally16_16_128bits_tail $src
+ MOV scratch, #0
+ notAnotBTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAnotBTally GenerateFunctions 16, 16,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,skew,orig_w", \
+ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+ MACRO
+ notAmatchBTally16_16_init
+ LDR bitptrs, =0x00010001
+ MOV map, #0
+ PKHBT ht, ht, LSL #16 ; replicate the constant colours across words
+ PKHBT ht_info, ht_info, LSL #16
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_cleanup
+ MOV a1, map
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_1pixel $srcA, $srcB, $zeros, $ones
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ USUB16 $srcA, $zeros, $srcA
+ SEL $srcA, $zeros, $ones
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $ones, $zeros
+ AND $srcA, $srcA, $srcB
+ UXTAH map, map, $srcA, ROR #16
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ USUB16 $srcA, $zeros, $srcA
+ SEL $srcA, $zeros, $ones
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $ones, $zeros
+ [ $first :LAND: $last ; avoid touching wk4
+ AND $srcA, $srcA, $srcB
+ UXTAH map, map, $srcA
+ UXTAH map, map, $srcA, ROR #16
+ |
+ [ $first
+ AND $wk4, $srcA, $srcB
+ |
+ AND $srcA, $srcA, $srcB
+ ADD $wk4, $wk4, $srcA
+ ]
+ [ $last
+ UXTAH map, map, $wk4
+ UXTAH map, map, $wk4, ROR #16
+ ]
+ ]
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+ notAmatchBTally16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones
+ notAmatchBTally16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_16bits $src, $dst, $fixed_skew
+ MOV scratch, #0
+ notAmatchBTally16_16_1pixel $dst, $src, scratch, bitptrs
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs
+ notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_128bits_head $src, $fixed_skew, $intra_preloads
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MOV scratch, #0
+ notAmatchBTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MEND
+
+ MACRO
+ notAmatchBTally16_16_128bits_tail $src
+ MOV scratch, #0
+ notAmatchBTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAmatchBTally GenerateFunctions 16, 16,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,skew,orig_w", \
+ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+ MACRO
+ pixelMatchTest16_16_init
+ LDR bitptrs, =0x00010001
+ MOV map, #0
+ PKHBT ht, ht, LSL #16 ; replicate the constant colours across words
+ PKHBT ht_info, ht_info, LSL #16
+ MEND
+
+ MACRO
+ pixelMatchTest16_16_cleanup
+ MOV a1, #0
+ B %FT95
+90 ADD sp, sp, #num_line_saved_regs * 4
+ MOV a1, #1
+95
+ MEND
+
+ MACRO
+ pixelMatchTest16_16_1pixel $srcA, $srcB, $zeros, $ones
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ USUB16 $srcA, $zeros, $srcA
+ SEL $srcA, $ones, $zeros
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $ones, $zeros
+ AND $srcA, $srcA, $srcB
+ TST $srcA, #0x10000
+ BNE %FA90
+ MEND
+
+; Test two 16-bit pixel pairs held in $srcA and $srcB.
+; Each halfword of $srcA becomes 0x0001 if it equals the colour in ht, else 0;
+; each halfword of $srcB likewise against ht_info. Their AND is non-zero when
+; some pixel position satisfies both conditions.
+; $first / $last select how that result is consumed:
+;   both true   - tested at once with TST; wk4 is not written
+;   first only  - stored in wk4, starting a new accumulation
+;   otherwise   - ORed into wk4 (ORRS also sets the flags)
+; When $last is true, branch to label 90 if the tested value is non-zero.
+; $zeros / $ones are the callers' map / bitptrs (0 / 0x00010001 after _init).
+; Overwrites $srcA and $srcB.
+ MACRO
+ pixelMatchTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+ ; a halfword becomes 0 exactly where the pixel equals the colour
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ ; 0 - x per halfword: GE is set only for halfwords where x == 0
+ USUB16 $srcA, $zeros, $srcA
+ ; 0x0001 in halfwords that matched, 0 elsewhere
+ SEL $srcA, $ones, $zeros
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $ones, $zeros
+ [ $first :LAND: $last ; avoid touching wk4
+ TST $srcA, $srcB
+ |
+ [ $first
+ AND $wk4, $srcA, $srcB
+ |
+ AND $srcA, $srcA, $srcB
+ ORRS $wk4, $wk4, $srcA
+ ]
+ ]
+ [ $last
+ ; flags come from the TST or ORRS above
+ BNE %FA90
+ ]
+ MEND
+
+; Test four 16-bit pixel pairs as two 2-pixel steps.
+; The first step is never "last" and the second is never "first", so the
+; result is always accumulated in wk4 and the branch to label 90 is emitted
+; only when $last is true.
+ MACRO
+ pixelMatchTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+ pixelMatchTest16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones
+ pixelMatchTest16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones
+ MEND
+
+; 16-bit (single pixel) step: $dst is compared against the colour in ht and
+; $src against the colour in ht_info, via the 1-pixel macro.
+; $fixed_skew is not referenced in the body.
+ MACRO
+ pixelMatchTest16_16_16bits $src, $dst, $fixed_skew
+ ; map = zero constant, bitptrs = 0x00010001 (both set by _init)
+ pixelMatchTest16_16_1pixel $dst, $src, map, bitptrs
+ MEND
+
+; 32-bit (two pixel) step: read one word from dst and one from src, then test
+; both pixel pairs immediately (first and last both {TRUE}).
+; The $src and $dst parameters are not referenced in the body.
+; NOTE(review): Read1Word is defined outside this excerpt; its second argument
+; presumably selects the work register ($wk0 for dst, $wk1 for src) - confirm.
+ MACRO
+ pixelMatchTest16_16_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, map, bitptrs
+ MEND
+
+; 64-bit (four pixel) step: read two words from dst and two from src, then run
+; two independent 2-pixel tests, each able to branch to label 90 on its own.
+; NOTE(review): Read2Words is defined outside this excerpt; the pairing
+; ($wk0,$wk2) and ($wk1,$wk3) implies dst -> wk0/wk1 and src -> wk2/wk3 - confirm.
+ MACRO
+ pixelMatchTest16_16_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, map, bitptrs
+ pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, map, bitptrs
+ MEND
+
+; First half of a 128-bit step: read two words from dst and two from src, start
+; a wk4 accumulation over those four pixel pairs (first={TRUE}, last={FALSE},
+; so no branch is emitted here), then read the next two words of each stream.
+; The matching _tail macro does no reads of its own.
+; $intra_preloads is not referenced in the body.
+; NOTE(review): Read2Words is defined outside this excerpt; dst -> wk0/wk1 and
+; src -> wk2/wk3 is inferred from the argument order - confirm.
+ MACRO
+ pixelMatchTest16_16_128bits_head $src, $fixed_skew, $intra_preloads
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ pixelMatchTest16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs
+ ; fetch the second 64 bits of each stream for the _tail macro
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MEND
+
+; Second half of a 128-bit step: ORs four more pixel-pair results into wk4
+; (first={FALSE}) and, because last={TRUE}, ends with the branch to label 90
+; when the accumulated value is non-zero. Performs no reads.
+; NOTE(review): presumably operates on the words loaded at the end of the _head
+; macro - confirm against the framework's head/tail sequencing.
+ MACRO
+ pixelMatchTest16_16_128bits_tail $src
+ pixelMatchTest16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+; Instantiate the pixelMatchTest routines for 16bpp source / 16bpp destination.
+; Argument slots, in GenerateFunctions order: src_bpp 16, dst_w_bpp 16, empty
+; qualifier, the three flags ORed together, prefetch distance 2, work registers
+; y/stride_d/stride_s/skew/orig_w, line-saved registers x/y/stride_d/stride_s,
+; orig_w as leading-pixels register, empty preload-offset register, "init" in the
+; $init slot, empty $newline and $reinitwk slots, "cleanup" in the $cleanup slot.
+pixelMatchTest GenerateFunctions 16, 16,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,skew,orig_w", \
+ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+; Setup for notAnotBTest 16bpp/16bpp: load the two constants that the pixel
+; macros receive as $ones and $zeros, and duplicate the low halfword of each
+; colour register into its high halfword.
+ MACRO
+ notAnotBTest16_16_init
+ ; 0x0001 in each halfword; passed to the pixel macros as $ones
+ LDR bitptrs, =0x00010001
+ ; all-zero word; passed to the pixel macros as $zeros
+ MOV map, #0
+ PKHBT ht, ht, LSL #16 ; replicate the constant colours across words
+ PKHBT ht_info, ht_info, LSL #16
+ MEND
+
+; Exit code for notAnotBTest 16bpp/16bpp.
+; Entering from the top sets a1 to 0 and skips to label 95.
+; Label 90 is the target of the BNE %FA90 in the pixel macros; that path
+; adjusts sp and sets a1 to 1.
+ MACRO
+ notAnotBTest16_16_cleanup
+ ; fall-through path: a1 = 0
+ MOV a1, #0
+ B %FT95
+ ; early-exit path: step sp over num_line_saved_regs words, then a1 = 1
+ ; NOTE(review): presumably discards the line variables spilled under
+ ; FLAG_SPILL_LINE_VARS - confirm against the framework
+90 ADD sp, sp, #num_line_saved_regs * 4
+ MOV a1, #1
+95
+ MEND
+
+; Test a single 16-bit pixel pair and branch to label 90 on a hit.
+; A hit means: the upper halfword of $srcA differs from the colour in ht AND
+; the upper halfword of $srcB differs from the colour in ht_info (only bit 16
+; of the combined result is tested).
+; $zeros / $ones are the callers' map / bitptrs, set to 0 / 0x00010001 by _init.
+; Overwrites $srcA and $srcB.
+ MACRO
+ notAnotBTest16_16_1pixel $srcA, $srcB, $zeros, $ones
+ ; a halfword becomes 0 exactly where the pixel equals the colour
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ ; 0 - x per halfword: GE is set only for halfwords where x == 0
+ USUB16 $srcA, $zeros, $srcA
+ ; operands swapped relative to the "match" tests: 0 where the pixel matched,
+ ; 0x0001 where it did not
+ SEL $srcA, $zeros, $ones
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $zeros, $ones
+ ; both conditions must hold for the same halfword
+ AND $srcA, $srcA, $srcB
+ TST $srcA, #0x10000
+ BNE %FA90
+ MEND
+
+; Test two 16-bit pixel pairs held in $srcA and $srcB.
+; Each halfword of $srcA becomes 0x0001 if it differs from the colour in ht,
+; else 0; each halfword of $srcB likewise against ht_info. Their AND is
+; non-zero when some pixel position satisfies both conditions.
+; $first / $last select how that result is consumed:
+;   both true   - tested at once with TST; wk4 is not written
+;   first only  - stored in wk4, starting a new accumulation
+;   otherwise   - ORed into wk4 (ORRS also sets the flags)
+; When $last is true, branch to label 90 if the tested value is non-zero.
+; $zeros / $ones are the callers' map / bitptrs (0 / 0x00010001 after _init).
+; Overwrites $srcA and $srcB.
+ MACRO
+ notAnotBTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+ ; a halfword becomes 0 exactly where the pixel equals the colour
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ ; 0 - x per halfword: GE is set only for halfwords where x == 0
+ USUB16 $srcA, $zeros, $srcA
+ ; 0 in halfwords that matched, 0x0001 in those that did not
+ SEL $srcA, $zeros, $ones
+ USUB16 $srcB, $zeros, $srcB
+ SEL $srcB, $zeros, $ones
+ [ $first :LAND: $last ; avoid touching wk4
+ TST $srcA, $srcB
+ |
+ [ $first
+ AND $wk4, $srcA, $srcB
+ |
+ AND $srcA, $srcA, $srcB
+ ORRS $wk4, $wk4, $srcA
+ ]
+ ]
+ [ $last
+ ; flags come from the TST or ORRS above
+ BNE %FA90
+ ]
+ MEND
+
+; Test four 16-bit pixel pairs as two 2-pixel steps.
+; The first step is never "last" and the second is never "first", so the
+; result is always accumulated in wk4 and the branch to label 90 is emitted
+; only when $last is true.
+ MACRO
+ notAnotBTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+ notAnotBTest16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones
+ notAnotBTest16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones
+ MEND
+
+; 16-bit (single pixel) step: $dst is checked against the colour in ht and
+; $src against the colour in ht_info, via the 1-pixel macro.
+; $fixed_skew is not referenced in the body.
+ MACRO
+ notAnotBTest16_16_16bits $src, $dst, $fixed_skew
+ ; map = zero constant, bitptrs = 0x00010001 (both set by _init)
+ notAnotBTest16_16_1pixel $dst, $src, map, bitptrs
+ MEND
+
+; 32-bit (two pixel) step: read one word from dst and one from src, then test
+; both pixel pairs immediately (first and last both {TRUE}).
+; The $src and $dst parameters are not referenced in the body.
+; NOTE(review): Read1Word is defined outside this excerpt; its second argument
+; presumably selects the work register ($wk0 for dst, $wk1 for src) - confirm.
+ MACRO
+ notAnotBTest16_16_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
+ notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, map, bitptrs
+ MEND
+
+; 64-bit (four pixel) step: read two words from dst and two from src, then run
+; two independent 2-pixel tests, each able to branch to label 90 on its own.
+; NOTE(review): Read2Words is defined outside this excerpt; the pairing
+; ($wk0,$wk2) and ($wk1,$wk3) implies dst -> wk0/wk1 and src -> wk2/wk3 - confirm.
+ MACRO
+ notAnotBTest16_16_64bits $src, $fixed_skew
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, map, bitptrs
+ notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, map, bitptrs
+ MEND
+
+; First half of a 128-bit step: read two words from dst and two from src, start
+; a wk4 accumulation over those four pixel pairs (first={TRUE}, last={FALSE},
+; so no branch is emitted here), then read the next two words of each stream.
+; The matching _tail macro does no reads of its own.
+; $intra_preloads is not referenced in the body.
+; NOTE(review): Read2Words is defined outside this excerpt; dst -> wk0/wk1 and
+; src -> wk2/wk3 is inferred from the argument order - confirm.
+ MACRO
+ notAnotBTest16_16_128bits_head $src, $fixed_skew, $intra_preloads
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ notAnotBTest16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs
+ ; fetch the second 64 bits of each stream for the _tail macro
+ Read2Words dst, 0,, 0
+ Read2Words src, 2, carry, $fixed_skew, skew, scratch
+ MEND
+
+; Second half of a 128-bit step: ORs four more pixel-pair results into wk4
+; (first={FALSE}) and, because last={TRUE}, ends with the branch to label 90
+; when the accumulated value is non-zero. Performs no reads.
+; NOTE(review): presumably operates on the words loaded at the end of the _head
+; macro - confirm against the framework's head/tail sequencing.
+ MACRO
+ notAnotBTest16_16_128bits_tail $src
+ notAnotBTest16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+; Instantiate the notAnotBTest routines for 16bpp source / 16bpp destination.
+; Argument slots, in GenerateFunctions order: src_bpp 16, dst_w_bpp 16, empty
+; qualifier, the three flags ORed together, prefetch distance 2, work registers
+; y/stride_d/stride_s/skew/orig_w, line-saved registers x/y/stride_d/stride_s,
+; orig_w as leading-pixels register, empty preload-offset register, "init" in the
+; $init slot, empty $newline and $reinitwk slots, "cleanup" in the $cleanup slot.
+notAnotBTest GenerateFunctions 16, 16,, \
+ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+ "y,stride_d,stride_s,skew,orig_w", \
+ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+; Setup for notAmatchBTest 16bpp/16bpp: load the two constants that the pixel
+; macros receive as $ones and $zeros, and duplicate the low halfword of each
+; colour register into its high halfword.
+ MACRO
+ notAmatchBTest16_16_init
+ ; 0x0001 in each halfword; passed to the pixel macros as $ones
+ LDR bitptrs, =0x00010001
+ ; all-zero word; passed to the pixel macros as $zeros
+ MOV map, #0
+ PKHBT ht, ht, LSL #16 ; replicate the constant colours across words
+ PKHBT ht_info, ht_info, LSL #16
+ MEND
+
+; Exit code for notAmatchBTest 16bpp/16bpp.
+; Entering from the top sets a1 to 0 and skips to label 95.
+; Label 90 is the target of the BNE %FA90 in the pixel macros; that path
+; adjusts sp and sets a1 to 1.
+ MACRO
+ notAmatchBTest16_16_cleanup
+ ; fall-through path: a1 = 0
+ MOV a1, #0
+ B %FT95
+ ; early-exit path: step sp over num_line_saved_regs words, then a1 = 1
+ ; NOTE(review): presumably discards the line variables spilled under
+ ; FLAG_SPILL_LINE_VARS - confirm against the framework
+90 ADD sp, sp, #num_line_saved_regs * 4
+ MOV a1, #1
+95
+ MEND
+
+; Test a single 16-bit pixel pair and branch to label 90 on a hit.
+; A hit means: the upper halfword of $srcA differs from the colour in ht AND
+; the upper halfword of $srcB equals the colour in ht_info (only bit 16 of the
+; combined result is tested).
+; $zeros / $ones are the caller's map / bitptrs, set to 0 / 0x00010001 by _init.
+; Overwrites $srcA and $srcB.
+ MACRO
+ notAmatchBTest16_16_1pixel $srcA, $srcB, $zeros, $ones
+ ; a halfword becomes 0 exactly where the pixel equals the colour
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ ; 0 - x per halfword: GE is set only for halfwords where x == 0
+ USUB16 $srcA, $zeros, $srcA
+ ; A side is inverted: 0 where it matched, 0x0001 where it did not
+ SEL $srcA, $zeros, $ones
+ USUB16 $srcB, $zeros, $srcB
+ ; B side is direct: 0x0001 where it matched, 0 where it did not
+ SEL $srcB, $ones, $zeros
+ ; both conditions must hold for the same halfword
+ AND $srcA, $srcA, $srcB
+ TST $srcA, #0x10000
+ BNE %FA90
+ MEND
+
+; Test two 16-bit pixel pairs held in $srcA and $srcB.
+; Each halfword of $srcA becomes 0x0001 if it differs from the colour in ht,
+; else 0; each halfword of $srcB becomes 0x0001 if it equals the colour in
+; ht_info, else 0. Their AND is non-zero when some pixel position satisfies
+; both conditions.
+; $first / $last select how that result is consumed:
+;   both true   - tested at once with TST; wk4 is not written
+;   first only  - stored in wk4, starting a new accumulation
+;   otherwise   - ORed into wk4 (ORRS also sets the flags)
+; When $last is true, branch to label 90 if the tested value is non-zero.
+; $zeros / $ones are expected to hold 0 / 0x00010001, as in the 16bits caller.
+; Overwrites $srcA and $srcB.
+ MACRO
+ notAmatchBTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+ ; a halfword becomes 0 exactly where the pixel equals the colour
+ EOR $srcA, $srcA, ht
+ EOR $srcB, $srcB, ht_info
+ ; 0 - x per halfword: GE is set only for halfwords where x == 0
+ USUB16 $srcA, $zeros, $srcA
+ ; A side is inverted: 0 where it matched, 0x0001 where it did not
+ SEL $srcA, $zeros, $ones
+ USUB16 $srcB, $zeros, $srcB
+ ; B side is direct: 0x0001 where it matched, 0 where it did not
+ SEL $srcB, $ones, $zeros
+ [ $first :LAND: $last ; avoid touching wk4
+ TST $srcA, $srcB
+ |
+ [ $first
+ AND $wk4, $srcA, $srcB
+ |
+ AND $srcA, $srcA, $srcB
+ ORRS $wk4, $wk4, $srcA
+ ]
+ ]
+ [ $last
+ ; flags come from the TST or ORRS above
+ BNE %FA90
+ ]
+ MEND
+
+; Test four 16-bit pixel pairs as two 2-pixel steps.
+; The first step is never "last" and the second is never "first", so the
+; result is always accumulated in wk4 and the branch to label 90 is emitted
+; only when $last is true.
+ MACRO
+ notAmatchBTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+ notAmatchBTest16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones
+ notAmatchBTest16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones
+ MEND
+
+; 16-bit (single pixel) step: $dst is checked for differing from the colour in
+; ht and $src for equalling the colour in ht_info, via the 1-pixel macro.
+; $fixed_skew is not referenced in the body.
+ MACRO
+ notAmatchBTest16_16_16bits $src, $dst, $fixed_skew
+ ; map = zero constant, bitptrs = 0x00010001 (both set by _init)
+ notAmatchBTest16_16_1pixel $dst, $src, map, bitptrs
+ MEND
+
+ MACRO
+ notAmatchBTest16_16_32bits $src, $dst, $fixed_skew
+ Read1Word dst, 0,, 0
+ Read1Word src, 1, carry, $fixed_skew, skew, scratch
@@ Diff output truncated at 50000 characters. @@
More information about the Vm-dev
mailing list