Revision: 3188 Author: rowledge Date: 2014-12-16 17:43:47 -0800 (Tue, 16 Dec 2014) Log Message: ----------- updates for ARM related fastblt
Modified Paths: -------------- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c 2014-12-16 23:23:27 UTC (rev 3187) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c 2014-12-17 01:43:47 UTC (rev 3188) @@ -258,17 +258,17 @@ };
#define TALLY_FAST_PATH(op, srcA_bpp, srcB_bpp) \ -extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ -extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ -extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ -static uint32_t tallyFastPath##op##srcA_bpp##_##srcB_bpp(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB) \ +extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ +extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ +extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ +static uint32_t tallyFastPath##op##srcA_bpp##_##srcB_bpp(const compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB) \ { \ IGNORE(log2bppA); \ IGNORE(log2bppB); \ COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t); \ /* Get pointers to initial words */ \ - uint32_t *srcA = srcABits + srcAPitch * srcAY + srcAX * srcA_bpp 
/ 32; \ - uint32_t *srcB = srcBBits + srcBPitch * srcBY + srcBX * srcB_bpp / 32; \ + const uint32_t *srcA = srcABits + srcAPitch * srcAY + srcAX * srcA_bpp / 32; \ + const uint32_t *srcB = srcBBits + srcBPitch * srcBY + srcBX * srcB_bpp / 32; \ /* Get initial pixel offset within words, mangle into pitch if possible */ \ uint32_t bitPtrs = 0; \ uint32_t srcAXpix = srcAX & (31 / srcA_bpp); \ @@ -297,17 +297,141 @@ return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs); \ }
+#define TEST_FAST_PATH(op, srcA_bpp, srcB_bpp) \ +extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_wide (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ +extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ +extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_tiny (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \ +static uint32_t testFastPath##op##srcA_bpp##_##srcB_bpp(const compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB) \ +{ \ + IGNORE(log2bppA); \ + IGNORE(log2bppB); \ + COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t); \ + /* Early termination is most likely in the centre, so start from the */ \ + /* middle and work outwards */ \ + const uint32_t *srcAUp = srcABits + srcAPitch * (srcAY + (height >> 1)) + srcAX * srcA_bpp / 32; \ + const uint32_t *srcBUp = srcBBits + srcBPitch * (srcBY + (height >> 1)) + srcBX * srcB_bpp / 32; \ + const uint32_t *srcADown = srcAUp; \ + const uint32_t *srcBDown = srcBUp; \ + /* Get initial pixel offset within words, mangle into pitch if possible */ \ + uint32_t bitPtrs = 0; \ + uint32_t srcAXpix = srcAX & (31 / srcA_bpp); \ + if (srcA_bpp < 8) \ + bitPtrs = srcAXpix; \ + else if (srcA_bpp == 8 || srcA_bpp == 16) \ + srcAPitch |= srcAXpix << 30; \ + uint32_t srcBXpix = srcBX & (31 / srcB_bpp); \ + if (srcB_bpp < 8) \ + bitPtrs |= srcBXpix << 27; \ + else if (srcB_bpp == 8 || srcB_bpp == 16) \ + srcBPitch |= srcBXpix << 30; \ + /* Work out which width class this operation is. 
*/ \ + /* Rather than re-evaluate this for each line, we want one choice for the */ \ + /* whole operation; this means we can't assume anything about alignment */ \ + /* to sizes larger than 4 bytes, because that's the only guarantee we */ \ + /* have about line stride. */ \ + uint32_t (*testRow)(uint32_t, uint32_t, const uint32_t *, uint32_t, const uint32_t *, uint32_t, uint32_t, uint32_t, void *, uint32_t); \ + if (width > (128-32)/srcA_bpp && (((srcAXpix-1) ^ (srcAXpix+width-(128-32)/srcA_bpp)) &~ (31/srcA_bpp))) \ + testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_wide; \ + else if (srcA_bpp > 8 || (((srcAXpix-1) ^ (srcAXpix+width)) &~ (31/srcA_bpp))) \ + testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_narrow; \ + else \ + testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_tiny; \ + if (height & 1) \ + { \ + height++; \ + goto odd_number_of_rows_remain; \ + } \ + while (height != 0) \ + { \ + srcADown -= srcAPitch; \ + srcBDown -= srcBPitch; \ + if (testRow(width, 1, srcADown, srcAPitch, srcBDown, srcBPitch, colorA, colorB, 0, bitPtrs)) \ + return 1; \ + odd_number_of_rows_remain: \ + if (testRow(width, 1, srcAUp, srcAPitch, srcBUp, srcBPitch, colorA, colorB, 0, bitPtrs)) \ + return 1; \ + srcAUp += srcAPitch; \ + srcBUp += srcBPitch; \ + height -= 2; \ + } \ + return 0; \ +} + #define ADD_TALLY_FN(op, srcA_bpp, srcB_bpp) \ do { compareColorsFns[(((MR_##op * 2) + 1) * 3 + \ (srcA_bpp == 8 ? 0 : srcA_bpp == 16 ? 1 : 2)) * 3 + \ (srcB_bpp == 8 ? 0 : srcB_bpp == 16 ? 1 : 2)] = \ tallyFastPath##op##srcA_bpp##_##srcB_bpp; } while(0)
+#define ADD_TEST_FN(op, srcA_bpp, srcB_bpp) \ + do { compareColorsFns[(((MR_##op * 2) + 0) * 3 + \ + (srcA_bpp == 8 ? 0 : srcA_bpp == 16 ? 1 : 2)) * 3 + \ + (srcB_bpp == 8 ? 0 : srcB_bpp == 16 ? 1 : 2)] = \ + testFastPath##op##srcA_bpp##_##srcB_bpp; } while(0) + TALLY_FAST_PATH(pixelMatch, 32, 32) +TALLY_FAST_PATH(notAnotB, 32, 32) +TALLY_FAST_PATH(notAmatchB, 32, 32) +TEST_FAST_PATH(pixelMatch, 32, 32) +TEST_FAST_PATH(notAnotB, 32, 32) +TEST_FAST_PATH(notAmatchB, 32, 32)
+TALLY_FAST_PATH(pixelMatch, 16, 16) +TALLY_FAST_PATH(notAnotB, 16, 16) +TALLY_FAST_PATH(notAmatchB, 16, 16) +TEST_FAST_PATH(pixelMatch, 16, 16) +TEST_FAST_PATH(notAnotB, 16, 16) +TEST_FAST_PATH(notAmatchB, 16, 16) + +TALLY_FAST_PATH(pixelMatch, 16, 32) +TALLY_FAST_PATH(notAnotB, 16, 32) +TALLY_FAST_PATH(notAmatchB, 16, 32) +TEST_FAST_PATH(pixelMatch, 16, 32) +TEST_FAST_PATH(notAnotB, 16, 32) +TEST_FAST_PATH(notAmatchB, 16, 32) + +TALLY_FAST_PATH(notAmatchB, 32, 16) +TEST_FAST_PATH(notAmatchB, 32, 16) + +TALLY_FAST_PATH(pixelMatch, 8, 8) +TALLY_FAST_PATH(notAnotB, 8, 8) +TALLY_FAST_PATH(notAmatchB, 8, 8) +TEST_FAST_PATH(pixelMatch, 8, 8) +TEST_FAST_PATH(notAnotB, 8, 8) +TEST_FAST_PATH(notAmatchB, 8, 8) + void addArmSimdFastPaths(void) { addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths);
- ADD_TALLY_FN(pixelMatch, 32, 32); + ADD_TALLY_FN(pixelMatch, 32, 32); + ADD_TALLY_FN(notAnotB, 32, 32); + ADD_TALLY_FN(notAmatchB, 32, 32); + ADD_TEST_FN(pixelMatch, 32, 32); + ADD_TEST_FN(notAnotB, 32, 32); + ADD_TEST_FN(notAmatchB, 32, 32); + + ADD_TALLY_FN(pixelMatch, 16, 16); + ADD_TALLY_FN(notAnotB, 16, 16); + ADD_TALLY_FN(notAmatchB, 16, 16); + ADD_TEST_FN(pixelMatch, 16, 16); + ADD_TEST_FN(notAnotB, 16, 16); + ADD_TEST_FN(notAmatchB, 16, 16); + + ADD_TALLY_FN(pixelMatch, 16, 32); + ADD_TALLY_FN(notAnotB, 16, 32); + ADD_TALLY_FN(notAmatchB, 16, 32); + ADD_TEST_FN(pixelMatch, 16, 32); + ADD_TEST_FN(notAnotB, 16, 32); + ADD_TEST_FN(notAmatchB, 16, 32); + + ADD_TALLY_FN(notAmatchB, 32, 16); + ADD_TEST_FN(notAmatchB, 32, 16); + + ADD_TALLY_FN(pixelMatch, 8, 8); + ADD_TALLY_FN(notAnotB, 8, 8); + ADD_TALLY_FN(notAmatchB, 8, 8); + ADD_TEST_FN(pixelMatch, 8, 8); + ADD_TEST_FN(notAnotB, 8, 8); + ADD_TEST_FN(notAmatchB, 8, 8); }
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr 2014-12-16 23:23:27 UTC (rev 3187) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr 2014-12-17 01:43:47 UTC (rev 3188) @@ -1484,7 +1484,7 @@ ]
EXPORT armSimd$prefix._wide -armSimd$prefix._wide +armSimd$prefix._wide ROUT [ src_bpp > 0 :LOR: dst_r_bpp > 0 ; Check whether this is actually a medium-width operation ; (decision made here rather than in C due to availability of @@ -1498,14 +1498,14 @@ BLO armSimd$prefix._medium FunctionPrologue WIDE, (prefetch_distance+2)*pix_per_block 51 - PreloadLeadingStep1 $src_bpp, $preload_src, src + PreloadLeadingStep1 src_bpp, $preload_src, src [ flags :AND: FLAG_NO_PRELOAD_DST = 0 - PreloadLeadingStep1 $dst_r_bpp, $preload_dst, dst + PreloadLeadingStep1 dst_r_bpp, $preload_dst, dst ] CalculateLeadingPixels - PreloadLeadingStep2 $src_bpp, $src_bpp_shift, $preload_src, src, $tmp_leading_pixels, scratch + PreloadLeadingStep2 src_bpp, src_bpp_shift, $preload_src, src, $tmp_leading_pixels, scratch [ flags :AND: FLAG_NO_PRELOAD_DST = 0 - PreloadLeadingStep2 $dst_r_bpp, $dst_bpp_shift, $preload_dst, dst, $tmp_leading_pixels, scratch + PreloadLeadingStep2 dst_r_bpp, dst_bpp_shift, $preload_dst, dst, $tmp_leading_pixels, scratch ] CalculateSkew [ "$newline" <> "" @@ -1556,7 +1556,7 @@ [ src_bpp > 0 [ flags & FLAG_MAX_256BIT_MACRO > 0 ; prefetch distance = 256/bpp, block distance = 256/dst_w_bpp -do_preload IsEndOfGroup subblock, 256/256*dst_w_bpp/src_bpp +do_preload IsEndOfGroup subblock/2, 256/256*dst_w_bpp/src_bpp | ; prefetch distance = 256/bpp, block distance = 128/dst_w_bpp do_preload IsEndOfGroup subblock, 256/128*dst_w_bpp/src_bpp @@ -1607,9 +1607,9 @@ | ADD x, x, #(prefetch_distance + 2) * pix_per_block - 1 ] - PreloadTrailing $src_bpp, $src_bpp_shift, src, x, &$fixed_skew + PreloadTrailing src_bpp, src_bpp_shift, src, x, &$fixed_skew [ flags :AND: FLAG_NO_PRELOAD_DST = 0 - PreloadTrailing $dst_r_bpp, $dst_bpp_shift, dst, x + PreloadTrailing dst_r_bpp, dst_bpp_shift, dst, x ] SUB x, x, #128/dst_w_bpp - 1 ] @@ -1643,7 +1643,7 @@ LTORG
EXPORT armSimd$prefix._medium -armSimd$prefix._medium +armSimd$prefix._medium ROUT ] FunctionPrologue NON_WIDE, 0 51 @@ -1709,7 +1709,8 @@ LTORG
EXPORT armSimd$prefix._narrow -armSimd$prefix._narrow FunctionPrologue NON_WIDE, 0 +armSimd$prefix._narrow ROUT + FunctionPrologue NON_WIDE, 0 [ src_bpp > 0 :LAND: src_bpp < 32 ; Because we're only aiming for 1-word alignment at the destination, ; we can at least have a constant skew for every scanline @@ -1781,7 +1782,8 @@
[ dst_w_bpp <= 8 EXPORT armSimd$prefix._tiny -armSimd$prefix._tiny FunctionPrologue NON_WIDE, 0 +armSimd$prefix._tiny ROUT + FunctionPrologue NON_WIDE, 0 51 PreloadLine src, src_bpp, src_bpp_shift, scratch, carry BIC scratch, dst, #31 ; loading dest is unconditional below
Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s =================================================================== --- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s 2014-12-16 23:23:27 UTC (rev 3187) +++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s 2014-12-17 01:43:47 UTC (rev 3188) @@ -83,6 +83,7 @@
MACRO pixelMatchTally32_32_128bits_tail $src + LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations pixelMatchTally32_32_1pixel $wk0, $wk4 pixelMatchTally32_32_1pixel $wk1, $wk5 pixelMatchTally32_32_1pixel $wk2, $wk6 @@ -93,10 +94,2574 @@ ; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
pixelMatchTally GenerateFunctions 32, 32,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 3, \ + "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \ + "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5 + +; ******************************************************************** + + MACRO + notAnotBTally32_32_init + MOV map, #0 + MEND + + MACRO + notAnotBTally32_32_cleanup + MOV a1, map + MEND + + MACRO + notAnotBTally32_32_1pixel $srcA, $srcB + EORS $srcA, $srcA, ht + MOVNE $srcA, #1 + TEQ $srcB, ht_info + ADDNE map, map, $srcA + MEND + + MACRO + notAnotBTally32_32_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + notAnotBTally32_32_1pixel $wk0, $wk1 + MEND + + MACRO + notAnotBTally32_32_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + notAnotBTally32_32_1pixel $wk0, $wk2 + notAnotBTally32_32_1pixel $wk1, $wk3 + MEND + + MACRO + notAnotBTally32_32_128bits_head $src, $fixed_skew, $intra_preloads + Read4Words dst, 0,, 0 + Read4Words src, 4, carry, $fixed_skew, skew, scratch + MEND + + MACRO + notAnotBTally32_32_128bits_tail $src + LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations + notAnotBTally32_32_1pixel $wk0, $wk4 + notAnotBTally32_32_1pixel $wk1, $wk5 + notAnotBTally32_32_1pixel $wk2, $wk6 + notAnotBTally32_32_1pixel $wk3, $wk7 + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +notAnotBTally GenerateFunctions 32, 32,, \ FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \ "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
; ********************************************************************
+ MACRO + notAmatchBTally32_32_init + MOV map, #0 + MEND + + MACRO + notAmatchBTally32_32_cleanup + MOV a1, map + MEND + + MACRO + notAmatchBTally32_32_1pixel $srcA, $srcB + EORS $srcA, $srcA, ht + MOVNE $srcA, #1 + TEQ $srcB, ht_info + ADDEQ map, map, $srcA + MEND + + MACRO + notAmatchBTally32_32_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + notAmatchBTally32_32_1pixel $wk0, $wk1 + MEND + + MACRO + notAmatchBTally32_32_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + notAmatchBTally32_32_1pixel $wk0, $wk2 + notAmatchBTally32_32_1pixel $wk1, $wk3 + MEND + + MACRO + notAmatchBTally32_32_128bits_head $src, $fixed_skew, $intra_preloads + Read4Words dst, 0,, 0 + Read4Words src, 4, carry, $fixed_skew, skew, scratch + MEND + + MACRO + notAmatchBTally32_32_128bits_tail $src + LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations + notAmatchBTally32_32_1pixel $wk0, $wk4 + notAmatchBTally32_32_1pixel $wk1, $wk5 + notAmatchBTally32_32_1pixel $wk2, $wk6 + notAmatchBTally32_32_1pixel $wk3, $wk7 + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +notAmatchBTally GenerateFunctions 32, 32,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ + "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \ + "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5 + +; ******************************************************************** + + MACRO + pixelMatchTest32_32_cleanup + MOV a1, #0 + B %FT95 +90 ADD sp, sp, #num_line_saved_regs * 4 + MOV a1, #1 +95 + MEND + + MACRO + pixelMatchTest32_32_1pixel $srcA, $srcB + TEQ $srcA, ht + TEQEQ $srcB, ht_info + BEQ %FA90 + MEND + + MACRO + pixelMatchTest32_32_32bits $src, $dst, 
$fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + pixelMatchTest32_32_1pixel $wk0, $wk1 + MEND + + MACRO + pixelMatchTest32_32_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + pixelMatchTest32_32_1pixel $wk0, $wk2 + pixelMatchTest32_32_1pixel $wk1, $wk3 + MEND + + MACRO + pixelMatchTest32_32_128bits_head $src, $fixed_skew, $intra_preloads + Read4Words dst, 0,, 0 + Read4Words src, 4, carry, $fixed_skew, skew, scratch + MEND + + MACRO + pixelMatchTest32_32_128bits_tail $src + LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations + pixelMatchTest32_32_1pixel $wk0, $wk4 + pixelMatchTest32_32_1pixel $wk1, $wk5 + pixelMatchTest32_32_1pixel $wk2, $wk6 + pixelMatchTest32_32_1pixel $wk3, $wk7 + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +pixelMatchTest GenerateFunctions 32, 32,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ + "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \ + "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5 + +; ******************************************************************** + + MACRO + notAnotBTest32_32_cleanup + MOV a1, #0 + B %FT95 +90 ADD sp, sp, #num_line_saved_regs * 4 + MOV a1, #1 +95 + MEND + + MACRO + notAnotBTest32_32_1pixel $srcA, $srcB + TEQ $srcA, ht + TEQNE $srcB, ht_info + BNE %FA90 + MEND + + MACRO + notAnotBTest32_32_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + notAnotBTest32_32_1pixel $wk0, $wk1 + MEND + + MACRO + notAnotBTest32_32_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + notAnotBTest32_32_1pixel $wk0, $wk2 + notAnotBTest32_32_1pixel $wk1, 
$wk3 + MEND + + MACRO + notAnotBTest32_32_128bits_head $src, $fixed_skew, $intra_preloads + Read4Words dst, 0,, 0 + Read4Words src, 4, carry, $fixed_skew, skew, scratch + MEND + + MACRO + notAnotBTest32_32_128bits_tail $src + LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations + notAnotBTest32_32_1pixel $wk0, $wk4 + notAnotBTest32_32_1pixel $wk1, $wk5 + notAnotBTest32_32_1pixel $wk2, $wk6 + notAnotBTest32_32_1pixel $wk3, $wk7 + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +notAnotBTest GenerateFunctions 32, 32,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 3, \ + "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \ + "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5 + +; ******************************************************************** + + MACRO + notAmatchBTest32_32_cleanup + MOV a1, #0 + B %FT95 +90 ADD sp, sp, #num_line_saved_regs * 4 + MOV a1, #1 +95 + MEND + + MACRO + notAmatchBTest32_32_1pixel $srcA, $srcB + TEQ $srcB, ht_info + MOVNE $srcA, ht + TEQ $srcA, ht + BNE %FA90 + MEND + + MACRO + notAmatchBTest32_32_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + notAmatchBTest32_32_1pixel $wk0, $wk1 + MEND + + MACRO + notAmatchBTest32_32_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + notAmatchBTest32_32_1pixel $wk0, $wk2 + notAmatchBTest32_32_1pixel $wk1, $wk3 + MEND + + MACRO + notAmatchBTest32_32_128bits_head $src, $fixed_skew, $intra_preloads + Read4Words dst, 0,, 0 + Read4Words src, 4, carry, $fixed_skew, skew, scratch + MEND + + MACRO + notAmatchBTest32_32_128bits_tail $src + LDR scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations + 
notAmatchBTest32_32_1pixel $wk0, $wk4 + notAmatchBTest32_32_1pixel $wk1, $wk5 + notAmatchBTest32_32_1pixel $wk2, $wk6 + notAmatchBTest32_32_1pixel $wk3, $wk7 + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +notAmatchBTest GenerateFunctions 32, 32,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ + "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \ + "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5 + +; ******************************************************************** + + MACRO + pixelMatchTally16_16_init + LDR bitptrs, =0x00010001 + MOV map, #0 + PKHBT ht, ht, LSL #16 ; replicate the constant colours across words + PKHBT ht_info, ht_info, LSL #16 + MEND + + MACRO + pixelMatchTally16_16_cleanup + MOV a1, map + MEND + + MACRO + pixelMatchTally16_16_1pixel $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $ones, $zeros + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $ones, $zeros + AND $srcA, $srcA, $srcB + UXTAH map, map, $srcA, ROR #16 + MEND + + MACRO + pixelMatchTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $ones, $zeros + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $ones, $zeros + [ $first :LAND: $last ; avoid touching wk4 + AND $srcA, $srcA, $srcB + UXTAH map, map, $srcA + UXTAH map, map, $srcA, ROR #16 + | + [ $first + AND $wk4, $srcA, $srcB + | + AND $srcA, $srcA, $srcB + ADD $wk4, $wk4, $srcA + ] + [ $last + UXTAH map, map, $wk4 + UXTAH map, map, $wk4, ROR #16 + ] + ] + MEND + + MACRO + pixelMatchTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones + pixelMatchTally16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones + 
pixelMatchTally16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones + MEND + + MACRO + pixelMatchTally16_16_16bits $src, $dst, $fixed_skew + MOV scratch, #0 + pixelMatchTally16_16_1pixel $dst, $src, scratch, bitptrs + MEND + + MACRO + pixelMatchTally16_16_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + MOV scratch, #0 + pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs + MEND + + MACRO + pixelMatchTally16_16_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MOV scratch, #0 + pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs + pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs + MEND + + MACRO + pixelMatchTally16_16_128bits_head $src, $fixed_skew, $intra_preloads + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MOV scratch, #0 + pixelMatchTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MEND + + MACRO + pixelMatchTally16_16_128bits_tail $src + MOV scratch, #0 + pixelMatchTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +pixelMatchTally GenerateFunctions 16, 16,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ + "y,stride_d,stride_s,skew,orig_w", \ + "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4 + +; ******************************************************************** + + MACRO + notAnotBTally16_16_init + LDR bitptrs, =0x00010001 + MOV map, #0 + PKHBT ht, ht, LSL #16 ; replicate the constant colours across words + PKHBT ht_info, ht_info, LSL 
#16 + MEND + + MACRO + notAnotBTally16_16_cleanup + MOV a1, map + MEND + + MACRO + notAnotBTally16_16_1pixel $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $zeros, $ones + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $zeros, $ones + AND $srcA, $srcA, $srcB + UXTAH map, map, $srcA, ROR #16 + MEND + + MACRO + notAnotBTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $zeros, $ones + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $zeros, $ones + [ $first :LAND: $last ; avoid touching wk4 + AND $srcA, $srcA, $srcB + UXTAH map, map, $srcA + UXTAH map, map, $srcA, ROR #16 + | + [ $first + AND $wk4, $srcA, $srcB + | + AND $srcA, $srcA, $srcB + ADD $wk4, $wk4, $srcA + ] + [ $last + UXTAH map, map, $wk4 + UXTAH map, map, $wk4, ROR #16 + ] + ] + MEND + + MACRO + notAnotBTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones + notAnotBTally16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones + notAnotBTally16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones + MEND + + MACRO + notAnotBTally16_16_16bits $src, $dst, $fixed_skew + MOV scratch, #0 + notAnotBTally16_16_1pixel $dst, $src, scratch, bitptrs + MEND + + MACRO + notAnotBTally16_16_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + MOV scratch, #0 + notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs + MEND + + MACRO + notAnotBTally16_16_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MOV scratch, #0 + notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs + notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs + MEND + + MACRO + notAnotBTally16_16_128bits_head $src, $fixed_skew, $intra_preloads + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, 
$fixed_skew, skew, scratch + MOV scratch, #0 + notAnotBTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MEND + + MACRO + notAnotBTally16_16_128bits_tail $src + MOV scratch, #0 + notAnotBTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +notAnotBTally GenerateFunctions 16, 16,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ + "y,stride_d,stride_s,skew,orig_w", \ + "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4 + +; ******************************************************************** + + MACRO + notAmatchBTally16_16_init + LDR bitptrs, =0x00010001 + MOV map, #0 + PKHBT ht, ht, LSL #16 ; replicate the constant colours across words + PKHBT ht_info, ht_info, LSL #16 + MEND + + MACRO + notAmatchBTally16_16_cleanup + MOV a1, map + MEND + + MACRO + notAmatchBTally16_16_1pixel $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $zeros, $ones + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $ones, $zeros + AND $srcA, $srcA, $srcB + UXTAH map, map, $srcA, ROR #16 + MEND + + MACRO + notAmatchBTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $zeros, $ones + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $ones, $zeros + [ $first :LAND: $last ; avoid touching wk4 + AND $srcA, $srcA, $srcB + UXTAH map, map, $srcA + UXTAH map, map, $srcA, ROR #16 + | + [ $first + AND $wk4, $srcA, $srcB + | + AND $srcA, $srcA, $srcB + ADD $wk4, $wk4, $srcA + ] + [ $last + UXTAH map, map, $wk4 + UXTAH map, map, $wk4, ROR #16 + ] + ] + 
MEND + + MACRO + notAmatchBTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones + notAmatchBTally16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones + notAmatchBTally16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones + MEND + + MACRO + notAmatchBTally16_16_16bits $src, $dst, $fixed_skew + MOV scratch, #0 + notAmatchBTally16_16_1pixel $dst, $src, scratch, bitptrs + MEND + + MACRO + notAmatchBTally16_16_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + MOV scratch, #0 + notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs + MEND + + MACRO + notAmatchBTally16_16_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MOV scratch, #0 + notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs + notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs + MEND + + MACRO + notAmatchBTally16_16_128bits_head $src, $fixed_skew, $intra_preloads + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MOV scratch, #0 + notAmatchBTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MEND + + MACRO + notAmatchBTally16_16_128bits_tail $src + MOV scratch, #0 + notAmatchBTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +notAmatchBTally GenerateFunctions 16, 16,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ + "y,stride_d,stride_s,skew,orig_w", \ + "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4 + +; 
******************************************************************** + + MACRO + pixelMatchTest16_16_init + LDR bitptrs, =0x00010001 + MOV map, #0 + PKHBT ht, ht, LSL #16 ; replicate the constant colours across words + PKHBT ht_info, ht_info, LSL #16 + MEND + + MACRO + pixelMatchTest16_16_cleanup + MOV a1, #0 + B %FT95 +90 ADD sp, sp, #num_line_saved_regs * 4 + MOV a1, #1 +95 + MEND + + MACRO + pixelMatchTest16_16_1pixel $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $ones, $zeros + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $ones, $zeros + AND $srcA, $srcA, $srcB + TST $srcA, #0x10000 + BNE %FA90 + MEND + + MACRO + pixelMatchTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $ones, $zeros + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $ones, $zeros + [ $first :LAND: $last ; avoid touching wk4 + TST $srcA, $srcB + | + [ $first + AND $wk4, $srcA, $srcB + | + AND $srcA, $srcA, $srcB + ORRS $wk4, $wk4, $srcA + ] + ] + [ $last + BNE %FA90 + ] + MEND + + MACRO + pixelMatchTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones + pixelMatchTest16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones + pixelMatchTest16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones + MEND + + MACRO + pixelMatchTest16_16_16bits $src, $dst, $fixed_skew + pixelMatchTest16_16_1pixel $dst, $src, map, bitptrs + MEND + + MACRO + pixelMatchTest16_16_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, map, bitptrs + MEND + + MACRO + pixelMatchTest16_16_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, map, bitptrs + pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk1, 
$wk3, map, bitptrs + MEND + + MACRO + pixelMatchTest16_16_128bits_head $src, $fixed_skew, $intra_preloads + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + pixelMatchTest16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MEND + + MACRO + pixelMatchTest16_16_128bits_tail $src + pixelMatchTest16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +pixelMatchTest GenerateFunctions 16, 16,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ + "y,stride_d,stride_s,skew,orig_w", \ + "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4 + +; ******************************************************************** + + MACRO + notAnotBTest16_16_init + LDR bitptrs, =0x00010001 + MOV map, #0 + PKHBT ht, ht, LSL #16 ; replicate the constant colours across words + PKHBT ht_info, ht_info, LSL #16 + MEND + + MACRO + notAnotBTest16_16_cleanup + MOV a1, #0 + B %FT95 +90 ADD sp, sp, #num_line_saved_regs * 4 + MOV a1, #1 +95 + MEND + + MACRO + notAnotBTest16_16_1pixel $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $zeros, $ones + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $zeros, $ones + AND $srcA, $srcA, $srcB + TST $srcA, #0x10000 + BNE %FA90 + MEND + + MACRO + notAnotBTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $zeros, $ones + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $zeros, $ones + [ $first :LAND: $last ; avoid touching wk4 + TST $srcA, $srcB + | + [ $first + AND $wk4, $srcA, $srcB + | + AND $srcA, $srcA, 
$srcB + ORRS $wk4, $wk4, $srcA + ] + ] + [ $last + BNE %FA90 + ] + MEND + + MACRO + notAnotBTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones + notAnotBTest16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones + notAnotBTest16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones + MEND + + MACRO + notAnotBTest16_16_16bits $src, $dst, $fixed_skew + notAnotBTest16_16_1pixel $dst, $src, map, bitptrs + MEND + + MACRO + notAnotBTest16_16_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch + notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, map, bitptrs + MEND + + MACRO + notAnotBTest16_16_64bits $src, $fixed_skew + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, map, bitptrs + notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, map, bitptrs + MEND + + MACRO + notAnotBTest16_16_128bits_head $src, $fixed_skew, $intra_preloads + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + notAnotBTest16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs + Read2Words dst, 0,, 0 + Read2Words src, 2, carry, $fixed_skew, skew, scratch + MEND + + MACRO + notAnotBTest16_16_128bits_tail $src + notAnotBTest16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs + MEND + +;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance, +; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup + +notAnotBTest GenerateFunctions 16, 16,, \ + FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \ + "y,stride_d,stride_s,skew,orig_w", \ + "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4 + +; ******************************************************************** + + MACRO + notAmatchBTest16_16_init + LDR bitptrs, =0x00010001 + MOV map, #0 + 
PKHBT ht, ht, LSL #16 ; replicate the constant colours across words + PKHBT ht_info, ht_info, LSL #16 + MEND + + MACRO + notAmatchBTest16_16_cleanup + MOV a1, #0 + B %FT95 +90 ADD sp, sp, #num_line_saved_regs * 4 + MOV a1, #1 +95 + MEND + + MACRO + notAmatchBTest16_16_1pixel $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $zeros, $ones + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $ones, $zeros + AND $srcA, $srcA, $srcB + TST $srcA, #0x10000 + BNE %FA90 + MEND + + MACRO + notAmatchBTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones + EOR $srcA, $srcA, ht + EOR $srcB, $srcB, ht_info + USUB16 $srcA, $zeros, $srcA + SEL $srcA, $zeros, $ones + USUB16 $srcB, $zeros, $srcB + SEL $srcB, $ones, $zeros + [ $first :LAND: $last ; avoid touching wk4 + TST $srcA, $srcB + | + [ $first + AND $wk4, $srcA, $srcB + | + AND $srcA, $srcA, $srcB + ORRS $wk4, $wk4, $srcA + ] + ] + [ $last + BNE %FA90 + ] + MEND + + MACRO + notAmatchBTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones + notAmatchBTest16_16_2pixels $first, {FALSE}, $srcA0, $srcB0, $zeros, $ones + notAmatchBTest16_16_2pixels {FALSE}, $last, $srcA1, $srcB1, $zeros, $ones + MEND + + MACRO + notAmatchBTest16_16_16bits $src, $dst, $fixed_skew + notAmatchBTest16_16_1pixel $dst, $src, map, bitptrs + MEND + + MACRO + notAmatchBTest16_16_32bits $src, $dst, $fixed_skew + Read1Word dst, 0,, 0 + Read1Word src, 1, carry, $fixed_skew, skew, scratch
@@ Diff output truncated at 50000 characters. @@
vm-dev@lists.squeakfoundation.org