[Vm-dev] [commit][3188] updates for ARM related fastblt

commits at squeakvm.org commits at squeakvm.org
Wed Dec 17 01:43:52 UTC 2014


Revision: 3188
Author:   rowledge
Date:     2014-12-16 17:43:47 -0800 (Tue, 16 Dec 2014)
Log Message:
-----------
updates for ARM related fastblt

Modified Paths:
--------------
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c	2014-12-16 23:23:27 UTC (rev 3187)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c	2014-12-17 01:43:47 UTC (rev 3188)
@@ -258,17 +258,17 @@
 };
 
 #define TALLY_FAST_PATH(op, srcA_bpp, srcB_bpp)                                                                                                 \
-extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide  (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
-extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
-extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny  (uint32_t width, uint32_t height, uint32_t *srcA, uint32_t srcAStride, uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
-static uint32_t tallyFastPath##op##srcA_bpp##_##srcB_bpp(compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB)                         \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_wide  (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny  (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+static uint32_t tallyFastPath##op##srcA_bpp##_##srcB_bpp(const compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB)                   \
 {                                                                                                                                               \
     IGNORE(log2bppA);                                                                                                                           \
     IGNORE(log2bppB);                                                                                                                           \
     COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t);                                                                                          \
     /* Get pointers to initial words */                                                                                                         \
-    uint32_t *srcA = srcABits + srcAPitch * srcAY + srcAX * srcA_bpp / 32;                                                                      \
-    uint32_t *srcB = srcBBits + srcBPitch * srcBY + srcBX * srcB_bpp / 32;                                                                      \
+    const uint32_t *srcA = srcABits + srcAPitch * srcAY + srcAX * srcA_bpp / 32;                                                                \
+    const uint32_t *srcB = srcBBits + srcBPitch * srcBY + srcBX * srcB_bpp / 32;                                                                \
     /* Get initial pixel offset within words, mangle into pitch if possible */                                                                  \
     uint32_t bitPtrs = 0;                                                                                                                       \
     uint32_t srcAXpix = srcAX & (31 / srcA_bpp);                                                                                                \
@@ -297,17 +297,141 @@
         return armSimd##op##Tally##srcB_bpp##_##srcA_bpp##_tiny(width, height, srcA, srcAPitch, srcB, srcBPitch, colorA, colorB, 0, bitPtrs);   \
 }
 
+#define TEST_FAST_PATH(op, srcA_bpp, srcB_bpp)                                                                                                  \
+extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_wide  (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_narrow(uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+extern uint32_t armSimd##op##Test##srcB_bpp##_##srcA_bpp##_tiny  (uint32_t width, uint32_t height, const uint32_t *srcA, uint32_t srcAStride, const uint32_t *srcB, uint32_t srcBStride, uint32_t colorA, uint32_t colorB, void *unused, uint32_t bitPtrs); \
+static uint32_t testFastPath##op##srcA_bpp##_##srcB_bpp(const compare_operation_t *op, uint32_t log2bppA, uint32_t log2bppB)                    \
+{                                                                                                                                               \
+    IGNORE(log2bppA);                                                                                                                           \
+    IGNORE(log2bppB);                                                                                                                           \
+    COPY_COMPARE_OP_TO_LOCALS(op, uint32_t, uint32_t);                                                                                          \
+    /* Early termination is most likely in the centre, so start from the      */                                                                \
+    /* middle and work outwards                                               */                                                                \
+    const uint32_t *srcAUp = srcABits + srcAPitch * (srcAY + (height >> 1)) + srcAX * srcA_bpp / 32;                                            \
+    const uint32_t *srcBUp = srcBBits + srcBPitch * (srcBY + (height >> 1)) + srcBX * srcB_bpp / 32;                                            \
+    const uint32_t *srcADown = srcAUp;                                                                                                          \
+    const uint32_t *srcBDown = srcBUp;                                                                                                          \
+    /* Get initial pixel offset within words, mangle into pitch if possible   */                                                                \
+    uint32_t bitPtrs = 0;                                                                                                                       \
+    uint32_t srcAXpix = srcAX & (31 / srcA_bpp);                                                                                                \
+    if (srcA_bpp < 8)                                                                                                                           \
+        bitPtrs = srcAXpix;                                                                                                                     \
+    else if (srcA_bpp == 8 || srcA_bpp == 16)                                                                                                   \
+        srcAPitch |= srcAXpix << 30;                                                                                                            \
+    uint32_t srcBXpix = srcBX & (31 / srcB_bpp);                                                                                                \
+    if (srcB_bpp < 8)                                                                                                                           \
+        bitPtrs |= srcBXpix << 27;                                                                                                              \
+    else if (srcB_bpp == 8 || srcB_bpp == 16)                                                                                                   \
+        srcBPitch |= srcBXpix << 30;                                                                                                            \
+    /* Work out which width class this operation is.                          */                                                                \
+    /* Rather than re-evaluate this for each line, we want one choice for the */                                                                \
+    /* whole operation; this means we can't assume anything about alignment   */                                                                \
+    /* to sizes larger than 4 bytes, because that's the only guarantee we     */                                                                \
+    /* have about line stride.                                                */                                                                \
+    uint32_t (*testRow)(uint32_t, uint32_t, const uint32_t *, uint32_t, const uint32_t *, uint32_t, uint32_t, uint32_t, void *, uint32_t);      \
+    if (width > (128-32)/srcA_bpp && (((srcAXpix-1) ^ (srcAXpix+width-(128-32)/srcA_bpp)) &~ (31/srcA_bpp)))                                    \
+        testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_wide;                                                                              \
+    else if (srcA_bpp > 8 || (((srcAXpix-1) ^ (srcAXpix+width)) &~ (31/srcA_bpp)))                                                              \
+        testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_narrow;                                                                            \
+    else                                                                                                                                        \
+        testRow = armSimd##op##Test##srcB_bpp##_##srcA_bpp##_tiny;                                                                              \
+    if (height & 1)                                                                                                                             \
+    {                                                                                                                                           \
+        height++;                                                                                                                               \
+        goto odd_number_of_rows_remain;                                                                                                         \
+    }                                                                                                                                           \
+    while (height != 0)                                                                                                                         \
+    {                                                                                                                                           \
+        srcADown -= srcAPitch;                                                                                                                  \
+        srcBDown -= srcBPitch;                                                                                                                  \
+        if (testRow(width, 1, srcADown, srcAPitch, srcBDown, srcBPitch, colorA, colorB, 0, bitPtrs))                                            \
+            return 1;                                                                                                                           \
+        odd_number_of_rows_remain:                                                                                                              \
+        if (testRow(width, 1, srcAUp, srcAPitch, srcBUp, srcBPitch, colorA, colorB, 0, bitPtrs))                                                \
+            return 1;                                                                                                                           \
+        srcAUp += srcAPitch;                                                                                                                    \
+        srcBUp += srcBPitch;                                                                                                                    \
+        height -= 2;                                                                                                                            \
+    }                                                                                                                                           \
+    return 0;                                                                                                                                   \
+}
+
 #define ADD_TALLY_FN(op, srcA_bpp, srcB_bpp)                     \
     do { compareColorsFns[(((MR_##op * 2) + 1) * 3 +             \
             (srcA_bpp == 8 ? 0 : srcA_bpp == 16 ? 1 : 2)) * 3 +  \
             (srcB_bpp == 8 ? 0 : srcB_bpp == 16 ? 1 : 2)] =      \
             tallyFastPath##op##srcA_bpp##_##srcB_bpp; } while(0)
 
+#define ADD_TEST_FN(op, srcA_bpp, srcB_bpp)                      \
+    do { compareColorsFns[(((MR_##op * 2) + 0) * 3 +             \
+            (srcA_bpp == 8 ? 0 : srcA_bpp == 16 ? 1 : 2)) * 3 +  \
+            (srcB_bpp == 8 ? 0 : srcB_bpp == 16 ? 1 : 2)] =      \
+            testFastPath##op##srcA_bpp##_##srcB_bpp; } while(0)
+
 TALLY_FAST_PATH(pixelMatch, 32, 32)
+TALLY_FAST_PATH(notAnotB,   32, 32)
+TALLY_FAST_PATH(notAmatchB, 32, 32)
+TEST_FAST_PATH(pixelMatch, 32, 32)
+TEST_FAST_PATH(notAnotB,   32, 32)
+TEST_FAST_PATH(notAmatchB, 32, 32)
 
+TALLY_FAST_PATH(pixelMatch, 16, 16)
+TALLY_FAST_PATH(notAnotB,   16, 16)
+TALLY_FAST_PATH(notAmatchB, 16, 16)
+TEST_FAST_PATH(pixelMatch, 16, 16)
+TEST_FAST_PATH(notAnotB,   16, 16)
+TEST_FAST_PATH(notAmatchB, 16, 16)
+
+TALLY_FAST_PATH(pixelMatch, 16, 32)
+TALLY_FAST_PATH(notAnotB,   16, 32)
+TALLY_FAST_PATH(notAmatchB, 16, 32)
+TEST_FAST_PATH(pixelMatch, 16, 32)
+TEST_FAST_PATH(notAnotB,   16, 32)
+TEST_FAST_PATH(notAmatchB, 16, 32)
+
+TALLY_FAST_PATH(notAmatchB, 32, 16)
+TEST_FAST_PATH(notAmatchB, 32, 16)
+
+TALLY_FAST_PATH(pixelMatch, 8, 8)
+TALLY_FAST_PATH(notAnotB,   8, 8)
+TALLY_FAST_PATH(notAmatchB, 8, 8)
+TEST_FAST_PATH(pixelMatch, 8, 8)
+TEST_FAST_PATH(notAnotB,   8, 8)
+TEST_FAST_PATH(notAmatchB, 8, 8)
+
 void addArmSimdFastPaths(void)
 {
 	addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths);
 
-	ADD_TALLY_FN(pixelMatch, 32, 32);
+    ADD_TALLY_FN(pixelMatch, 32, 32);
+    ADD_TALLY_FN(notAnotB,   32, 32);
+    ADD_TALLY_FN(notAmatchB, 32, 32);
+    ADD_TEST_FN(pixelMatch, 32, 32);
+    ADD_TEST_FN(notAnotB,   32, 32);
+    ADD_TEST_FN(notAmatchB, 32, 32);
+
+    ADD_TALLY_FN(pixelMatch, 16, 16);
+    ADD_TALLY_FN(notAnotB,   16, 16);
+    ADD_TALLY_FN(notAmatchB, 16, 16);
+    ADD_TEST_FN(pixelMatch, 16, 16);
+    ADD_TEST_FN(notAnotB,   16, 16);
+    ADD_TEST_FN(notAmatchB, 16, 16);
+
+    ADD_TALLY_FN(pixelMatch, 16, 32);
+    ADD_TALLY_FN(notAnotB,   16, 32);
+    ADD_TALLY_FN(notAmatchB, 16, 32);
+    ADD_TEST_FN(pixelMatch, 16, 32);
+    ADD_TEST_FN(notAnotB,   16, 32);
+    ADD_TEST_FN(notAmatchB, 16, 32);
+
+    ADD_TALLY_FN(notAmatchB, 32, 16);
+    ADD_TEST_FN(notAmatchB, 32, 16);
+
+    ADD_TALLY_FN(pixelMatch, 8, 8);
+    ADD_TALLY_FN(notAnotB,   8, 8);
+    ADD_TALLY_FN(notAmatchB, 8, 8);
+    ADD_TEST_FN(pixelMatch, 8, 8);
+    ADD_TEST_FN(notAnotB,   8, 8);
+    ADD_TEST_FN(notAmatchB, 8, 8);
 }

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr	2014-12-16 23:23:27 UTC (rev 3187)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr	2014-12-17 01:43:47 UTC (rev 3188)
@@ -1484,7 +1484,7 @@
       ]
 
         EXPORT  armSimd$prefix._wide
-armSimd$prefix._wide
+armSimd$prefix._wide  ROUT
  [ src_bpp > 0 :LOR: dst_r_bpp > 0
         ; Check whether this is actually a medium-width operation
         ; (decision made here rather in C due to availability of
@@ -1498,14 +1498,14 @@
         BLO     armSimd$prefix._medium
         FunctionPrologue  WIDE, (prefetch_distance+2)*pix_per_block
 51
-        PreloadLeadingStep1  $src_bpp, $preload_src, src
+        PreloadLeadingStep1  src_bpp, $preload_src, src
       [ flags :AND: FLAG_NO_PRELOAD_DST = 0
-        PreloadLeadingStep1  $dst_r_bpp, $preload_dst, dst
+        PreloadLeadingStep1  dst_r_bpp, $preload_dst, dst
       ]
         CalculateLeadingPixels
-        PreloadLeadingStep2  $src_bpp, $src_bpp_shift, $preload_src, src, $tmp_leading_pixels, scratch
+        PreloadLeadingStep2  src_bpp, src_bpp_shift, $preload_src, src, $tmp_leading_pixels, scratch
       [ flags :AND: FLAG_NO_PRELOAD_DST = 0
-        PreloadLeadingStep2  $dst_r_bpp, $dst_bpp_shift, $preload_dst, dst, $tmp_leading_pixels, scratch
+        PreloadLeadingStep2  dst_r_bpp, dst_bpp_shift, $preload_dst, dst, $tmp_leading_pixels, scratch
       ]
         CalculateSkew
       [ "$newline" <> ""
@@ -1556,7 +1556,7 @@
      [ src_bpp > 0
       [ flags & FLAG_MAX_256BIT_MACRO > 0
         ; prefetch distance = 256/bpp, block distance = 256/dst_w_bpp
-do_preload IsEndOfGroup subblock, 256/256*dst_w_bpp/src_bpp
+do_preload IsEndOfGroup subblock/2, 256/256*dst_w_bpp/src_bpp
       |
         ; prefetch distance = 256/bpp, block distance = 128/dst_w_bpp
 do_preload IsEndOfGroup subblock, 256/128*dst_w_bpp/src_bpp
@@ -1607,9 +1607,9 @@
       |
         ADD     x, x, #(prefetch_distance + 2) * pix_per_block - 1
       ]
-        PreloadTrailing  $src_bpp, $src_bpp_shift, src, x, &$fixed_skew
+        PreloadTrailing  src_bpp, src_bpp_shift, src, x, &$fixed_skew
       [ flags :AND: FLAG_NO_PRELOAD_DST = 0
-        PreloadTrailing  $dst_r_bpp, $dst_bpp_shift, dst, x
+        PreloadTrailing  dst_r_bpp, dst_bpp_shift, dst, x
       ]
         SUB     x, x, #128/dst_w_bpp - 1
     ]
@@ -1643,7 +1643,7 @@
         LTORG
 
         EXPORT  armSimd$prefix._medium
-armSimd$prefix._medium
+armSimd$prefix._medium  ROUT
  ]
         FunctionPrologue  NON_WIDE, 0
 51
@@ -1709,7 +1709,8 @@
         LTORG
 
         EXPORT  armSimd$prefix._narrow
-armSimd$prefix._narrow  FunctionPrologue  NON_WIDE, 0
+armSimd$prefix._narrow  ROUT
+        FunctionPrologue  NON_WIDE, 0
       [ src_bpp > 0 :LAND: src_bpp < 32
 	; Because we're only aiming for 1-word alignment at the destination,
 	; we can at least have a constant skew for every scanline
@@ -1781,7 +1782,8 @@
 
  [ dst_w_bpp <= 8
         EXPORT  armSimd$prefix._tiny
-armSimd$prefix._tiny  FunctionPrologue  NON_WIDE, 0
+armSimd$prefix._tiny  ROUT
+        FunctionPrologue  NON_WIDE, 0
 51
         PreloadLine src, src_bpp, src_bpp_shift, scratch, carry
         BIC     scratch, dst, #31 ; loading dest is unconditional below

Modified: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s	2014-12-16 23:23:27 UTC (rev 3187)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdCompare.s	2014-12-17 01:43:47 UTC (rev 3188)
@@ -83,6 +83,7 @@
 
         MACRO
         pixelMatchTally32_32_128bits_tail $src
+        LDR     scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
         pixelMatchTally32_32_1pixel $wk0, $wk4
         pixelMatchTally32_32_1pixel $wk1, $wk5
         pixelMatchTally32_32_1pixel $wk2, $wk6
@@ -93,10 +94,2574 @@
 ;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
 
 pixelMatchTally GenerateFunctions 32, 32,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 3, \
+  "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+  "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+        MACRO
+        notAnotBTally32_32_init
+        MOV     map, #0
+        MEND
+
+        MACRO
+        notAnotBTally32_32_cleanup
+        MOV     a1, map
+        MEND
+
+        MACRO
+        notAnotBTally32_32_1pixel $srcA, $srcB
+        EORS    $srcA, $srcA, ht
+        MOVNE   $srcA, #1
+        TEQ     $srcB, ht_info
+        ADDNE   map, map, $srcA
+        MEND
+
+        MACRO
+        notAnotBTally32_32_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        notAnotBTally32_32_1pixel $wk0, $wk1
+        MEND
+
+        MACRO
+        notAnotBTally32_32_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        notAnotBTally32_32_1pixel $wk0, $wk2
+        notAnotBTally32_32_1pixel $wk1, $wk3
+        MEND
+
+        MACRO
+        notAnotBTally32_32_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words dst, 0,, 0
+        Read4Words src, 4, carry, $fixed_skew, skew, scratch
+        MEND
+
+        MACRO
+        notAnotBTally32_32_128bits_tail $src
+        LDR     scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+        notAnotBTally32_32_1pixel $wk0, $wk4
+        notAnotBTally32_32_1pixel $wk1, $wk5
+        notAnotBTally32_32_1pixel $wk2, $wk6
+        notAnotBTally32_32_1pixel $wk3, $wk7
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAnotBTally GenerateFunctions 32, 32,, \
   FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
   "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
   "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
 
 ; ********************************************************************
 
+        MACRO
+        notAmatchBTally32_32_init
+        MOV     map, #0
+        MEND
+
+        MACRO
+        notAmatchBTally32_32_cleanup
+        MOV     a1, map
+        MEND
+
+        MACRO
+        notAmatchBTally32_32_1pixel $srcA, $srcB
+        EORS    $srcA, $srcA, ht
+        MOVNE   $srcA, #1
+        TEQ     $srcB, ht_info
+        ADDEQ   map, map, $srcA
+        MEND
+
+        MACRO
+        notAmatchBTally32_32_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        notAmatchBTally32_32_1pixel $wk0, $wk1
+        MEND
+
+        MACRO
+        notAmatchBTally32_32_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        notAmatchBTally32_32_1pixel $wk0, $wk2
+        notAmatchBTally32_32_1pixel $wk1, $wk3
+        MEND
+
+        MACRO
+        notAmatchBTally32_32_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words dst, 0,, 0
+        Read4Words src, 4, carry, $fixed_skew, skew, scratch
+        MEND
+
+        MACRO
+        notAmatchBTally32_32_128bits_tail $src
+        LDR     scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+        notAmatchBTally32_32_1pixel $wk0, $wk4
+        notAmatchBTally32_32_1pixel $wk1, $wk5
+        notAmatchBTally32_32_1pixel $wk2, $wk6
+        notAmatchBTally32_32_1pixel $wk3, $wk7
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAmatchBTally GenerateFunctions 32, 32,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+  "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+        MACRO
+        pixelMatchTest32_32_cleanup
+        MOV     a1, #0
+        B       %FT95
+90      ADD     sp, sp, #num_line_saved_regs * 4
+        MOV     a1, #1
+95
+        MEND
+
+        MACRO
+        pixelMatchTest32_32_1pixel $srcA, $srcB
+        TEQ     $srcA, ht
+        TEQEQ   $srcB, ht_info
+        BEQ     %FA90
+        MEND
+
+        MACRO
+        pixelMatchTest32_32_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        pixelMatchTest32_32_1pixel $wk0, $wk1
+        MEND
+
+        MACRO
+        pixelMatchTest32_32_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        pixelMatchTest32_32_1pixel $wk0, $wk2
+        pixelMatchTest32_32_1pixel $wk1, $wk3
+        MEND
+
+        MACRO
+        pixelMatchTest32_32_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words dst, 0,, 0
+        Read4Words src, 4, carry, $fixed_skew, skew, scratch
+        MEND
+
+        MACRO
+        pixelMatchTest32_32_128bits_tail $src
+        LDR     scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+        pixelMatchTest32_32_1pixel $wk0, $wk4
+        pixelMatchTest32_32_1pixel $wk1, $wk5
+        pixelMatchTest32_32_1pixel $wk2, $wk6
+        pixelMatchTest32_32_1pixel $wk3, $wk7
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+pixelMatchTest GenerateFunctions 32, 32,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+  "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+        MACRO
+        notAnotBTest32_32_cleanup
+        MOV     a1, #0
+        B       %FT95
+90      ADD     sp, sp, #num_line_saved_regs * 4
+        MOV     a1, #1
+95
+        MEND
+
+        MACRO
+        notAnotBTest32_32_1pixel $srcA, $srcB
+        TEQ     $srcA, ht
+        TEQNE   $srcB, ht_info
+        BNE     %FA90
+        MEND
+
+        MACRO
+        notAnotBTest32_32_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        notAnotBTest32_32_1pixel $wk0, $wk1
+        MEND
+
+        MACRO
+        notAnotBTest32_32_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        notAnotBTest32_32_1pixel $wk0, $wk2
+        notAnotBTest32_32_1pixel $wk1, $wk3
+        MEND
+
+        MACRO
+        notAnotBTest32_32_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words dst, 0,, 0
+        Read4Words src, 4, carry, $fixed_skew, skew, scratch
+        MEND
+
+        MACRO
+        notAnotBTest32_32_128bits_tail $src
+        LDR     scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+        notAnotBTest32_32_1pixel $wk0, $wk4
+        notAnotBTest32_32_1pixel $wk1, $wk5
+        notAnotBTest32_32_1pixel $wk2, $wk6
+        notAnotBTest32_32_1pixel $wk3, $wk7
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAnotBTest GenerateFunctions 32, 32,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 3, \
+  "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+  "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+        MACRO
+        notAmatchBTest32_32_cleanup
+        MOV     a1, #0
+        B       %FT95
+90      ADD     sp, sp, #num_line_saved_regs * 4
+        MOV     a1, #1
+95
+        MEND
+
+        MACRO
+        notAmatchBTest32_32_1pixel $srcA, $srcB
+        TEQ     $srcB, ht_info
+        MOVNE   $srcA, ht
+        TEQ     $srcA, ht
+        BNE     %FA90
+        MEND
+
+        MACRO
+        notAmatchBTest32_32_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        notAmatchBTest32_32_1pixel $wk0, $wk1
+        MEND
+
+        MACRO
+        notAmatchBTest32_32_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        notAmatchBTest32_32_1pixel $wk0, $wk2
+        notAmatchBTest32_32_1pixel $wk1, $wk3
+        MEND
+
+        MACRO
+        notAmatchBTest32_32_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words dst, 0,, 0
+        Read4Words src, 4, carry, $fixed_skew, skew, scratch
+        MEND
+
+        MACRO
+        notAmatchBTest32_32_128bits_tail $src
+        LDR     scratch, [src, #-8] ; reload after it was (in many cases) corrupted by preload calculations
+        notAmatchBTest32_32_1pixel $wk0, $wk4
+        notAmatchBTest32_32_1pixel $wk1, $wk5
+        notAmatchBTest32_32_1pixel $wk2, $wk6
+        notAmatchBTest32_32_1pixel $wk3, $wk7
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAmatchBTest GenerateFunctions 32, 32,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,bitptrs,skew,orig_w,scratch,carry", \
+  "x,y,stride_d,stride_s", orig_w,,,,, cleanup ; leading_pixels_reg = wk5
+
+; ********************************************************************
+
+        MACRO
+        pixelMatchTally16_16_init
+        LDR     bitptrs, =0x00010001
+        MOV     map, #0
+        PKHBT   ht, ht, LSL #16     ; replicate the constant colours across words
+        PKHBT   ht_info, ht_info, LSL #16
+        MEND
+
+        MACRO
+        pixelMatchTally16_16_cleanup
+        MOV     a1, map
+        MEND
+
+        MACRO
+        pixelMatchTally16_16_1pixel $srcA, $srcB, $zeros, $ones
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $ones, $zeros
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $ones, $zeros
+        AND     $srcA, $srcA, $srcB
+        UXTAH   map, map, $srcA, ROR #16
+        MEND
+
+; Compare both 16bpp pixels of $srcA/$srcB at once.  Each halfword becomes 1
+; where $srcA equals the ht colour and $srcB equals the ht_info colour.
+; $first and $last are assembly-time flags:
+;   both set  - the two halfword results are added straight to map;
+;   otherwise - results are summed per halfword in $wk4 ($first starts the
+;               sum, later calls add to it) and only the $last call adds
+;               both halfwords of $wk4 to map.
+; Callers pass $zeros = 0 and $ones = 0x00010001.  Corrupts $srcA, $srcB and
+; the GE flags.
+        MACRO
+        pixelMatchTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0 (a match)
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $ones, $zeros
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $ones, $zeros
+    [ $first :LAND: $last  ; avoid touching wk4
+        AND     $srcA, $srcA, $srcB
+        UXTAH   map, map, $srcA
+        UXTAH   map, map, $srcA, ROR #16
+    |
+      [ $first
+        AND     $wk4, $srcA, $srcB
+      |
+        AND     $srcA, $srcA, $srcB
+        ADD     $wk4, $wk4, $srcA
+      ]
+      [ $last
+        UXTAH   map, map, $wk4
+        UXTAH   map, map, $wk4, ROR #16
+      ]
+    ]
+        MEND
+
+; Four pixels as two 2-pixel steps.  $first is forwarded only to the first
+; step and $last only to the second, so the pair always goes through the
+; $wk4 accumulation path of pixelMatchTally16_16_2pixels.
+        MACRO
+        pixelMatchTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+        pixelMatchTally16_16_2pixels $first,  {FALSE}, $srcA0, $srcB0, $zeros, $ones
+        pixelMatchTally16_16_2pixels {FALSE}, $last,   $srcA1, $srcB1, $zeros, $ones
+        MEND
+
+; 16-bit (single pixel) step: zeroes scratch for use as $zeros and tallies
+; the pixel with $dst as the A operand and $src as the B operand.
+; $fixed_skew is not used here.
+        MACRO
+        pixelMatchTally16_16_16bits $src, $dst, $fixed_skew
+        MOV     scratch, #0
+        pixelMatchTally16_16_1pixel $dst, $src, scratch, bitptrs
+        MEND
+
+; 32-bit (two pixel) step: reads one word from each operand and tallies both
+; pixels directly into map ($wk0 is the A operand, $wk1 the B operand).
+; NOTE(review): Read1Word is defined outside this chunk; presumably the
+; second argument is the index of the work register that receives the word
+; and scratch is only a temporary for the src read - confirm.
+        MACRO
+        pixelMatchTally16_16_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        ; scratch is zeroed after the reads because it doubles as $zeros
+        MOV     scratch, #0
+        pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs
+        MEND
+
+; 64-bit (four pixel) step: reads two words from each operand and tallies
+; them as two independent 2-pixel steps, pairing $wk0 with $wk2 and $wk1
+; with $wk3.
+; NOTE(review): Read2Words is defined outside this chunk; the pairing
+; implies dst lands in $wk0-$wk1 and src in $wk2-$wk3 - confirm.
+        MACRO
+        pixelMatchTally16_16_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        ; scratch is zeroed after the reads because it doubles as $zeros
+        MOV     scratch, #0
+        pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs
+        pixelMatchTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs
+        MEND
+
+; First half of the 128-bit (eight pixel) step: reads and compares the first
+; four pixels, starting a per-halfword sum in $wk4 without touching map,
+; then issues the reads for the next four pixels, which
+; pixelMatchTally16_16_128bits_tail consumes.
+; NOTE(review): Read2Words is defined outside this chunk; register
+; destinations are inferred from its second argument - confirm.
+        MACRO
+        pixelMatchTally16_16_128bits_head $src, $fixed_skew, $intra_preloads
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        MOV     scratch, #0
+        pixelMatchTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        MEND
+
+; Second half of the 128-bit step: compares the four pixels already held in
+; $wk0-$wk3, adds them to the sum in $wk4 and, because $last is set, adds
+; both halfwords of $wk4 to map.
+        MACRO
+        pixelMatchTally16_16_128bits_tail $src
+        MOV     scratch, #0
+        pixelMatchTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+        MEND
+
+; Instantiate the pixelMatchTally operation for 16bpp against 16bpp.  Five
+; work registers are named (orig_w is the fifth, i.e. wk4) and the init and
+; cleanup hooks are supplied; newline and reinitwk are left empty.
+; NOTE(review): GenerateFunctions is defined outside this chunk; it
+; presumably picks up the pixelMatchTally16_16_* macros by name - confirm.
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+pixelMatchTally GenerateFunctions 16, 16,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,skew,orig_w", \
+  "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+; Setup for notAnotBTally16_16: loads 0x0001 into each halfword of bitptrs
+; (passed as $ones by the size macros below), zeroes the running tally in map
+; and copies the low halfword of ht and of ht_info into their high halfwords
+; so each 16-bit colour can be compared against two pixels at once.
+        MACRO
+        notAnotBTally16_16_init
+        LDR     bitptrs, =0x00010001
+        MOV     map, #0
+        PKHBT   ht, ht, LSL #16     ; replicate the constant colours across words
+        PKHBT   ht_info, ht_info, LSL #16
+        MEND
+
+; Cleanup hook for notAnotBTally16_16: copies the tally accumulated in map
+; into a1 (presumably the function result - the hook is expanded by
+; GenerateFunctions, which is not visible here).
+        MACRO
+        notAnotBTally16_16_cleanup
+        MOV     a1, map
+        MEND
+
+; Tally a single 16bpp pixel: adds 1 to map when the upper halfword of $srcA
+; differs from the colour in ht AND the upper halfword of $srcB differs from
+; the colour in ht_info.  The lower halfwords are compared too but their
+; result is discarded.  The only caller in this file passes $zeros = 0 and
+; $ones = 0x00010001.  Corrupts $srcA, $srcB and the GE flags.
+        MACRO
+        notAnotBTally16_16_1pixel $srcA, $srcB, $zeros, $ones
+        ; XOR leaves a zero halfword wherever the pixel equals the colour
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0; SEL then
+        ; picks the halfword from $zeros (match) or from $ones (mismatch)
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $zeros, $ones
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $zeros, $ones
+        AND     $srcA, $srcA, $srcB
+        ; add only the upper halfword of the combined result to the tally
+        UXTAH   map, map, $srcA, ROR #16
+        MEND
+
+; Compare both 16bpp pixels of $srcA/$srcB at once.  Each halfword becomes 1
+; where $srcA differs from the ht colour and $srcB differs from the ht_info
+; colour.  $first and $last are assembly-time flags:
+;   both set  - the two halfword results are added straight to map;
+;   otherwise - results are summed per halfword in $wk4 ($first starts the
+;               sum, later calls add to it) and only the $last call adds
+;               both halfwords of $wk4 to map.
+; Callers pass $zeros = 0 and $ones = 0x00010001.  Corrupts $srcA, $srcB and
+; the GE flags.
+        MACRO
+        notAnotBTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0 (a match)
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $zeros, $ones
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $zeros, $ones
+    [ $first :LAND: $last  ; avoid touching wk4
+        AND     $srcA, $srcA, $srcB
+        UXTAH   map, map, $srcA
+        UXTAH   map, map, $srcA, ROR #16
+    |
+      [ $first
+        AND     $wk4, $srcA, $srcB
+      |
+        AND     $srcA, $srcA, $srcB
+        ADD     $wk4, $wk4, $srcA
+      ]
+      [ $last
+        UXTAH   map, map, $wk4
+        UXTAH   map, map, $wk4, ROR #16
+      ]
+    ]
+        MEND
+
+; Four pixels as two 2-pixel steps.  $first is forwarded only to the first
+; step and $last only to the second, so the pair always goes through the
+; $wk4 accumulation path of notAnotBTally16_16_2pixels.
+        MACRO
+        notAnotBTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+        notAnotBTally16_16_2pixels $first,  {FALSE}, $srcA0, $srcB0, $zeros, $ones
+        notAnotBTally16_16_2pixels {FALSE}, $last,   $srcA1, $srcB1, $zeros, $ones
+        MEND
+
+; 16-bit (single pixel) step: zeroes scratch for use as $zeros and tallies
+; the pixel with $dst as the A operand and $src as the B operand.
+; $fixed_skew is not used here.
+        MACRO
+        notAnotBTally16_16_16bits $src, $dst, $fixed_skew
+        MOV     scratch, #0
+        notAnotBTally16_16_1pixel $dst, $src, scratch, bitptrs
+        MEND
+
+; 32-bit (two pixel) step: reads one word from each operand and tallies both
+; pixels directly into map ($wk0 is the A operand, $wk1 the B operand).
+; NOTE(review): Read1Word is defined outside this chunk; presumably the
+; second argument is the index of the work register that receives the word
+; and scratch is only a temporary for the src read - confirm.
+        MACRO
+        notAnotBTally16_16_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        ; scratch is zeroed after the reads because it doubles as $zeros
+        MOV     scratch, #0
+        notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs
+        MEND
+
+; 64-bit (four pixel) step: reads two words from each operand and tallies
+; them as two independent 2-pixel steps, pairing $wk0 with $wk2 and $wk1
+; with $wk3.
+; NOTE(review): Read2Words is defined outside this chunk; the pairing
+; implies dst lands in $wk0-$wk1 and src in $wk2-$wk3 - confirm.
+        MACRO
+        notAnotBTally16_16_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        ; scratch is zeroed after the reads because it doubles as $zeros
+        MOV     scratch, #0
+        notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs
+        notAnotBTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs
+        MEND
+
+; First half of the 128-bit (eight pixel) step: reads and compares the first
+; four pixels, starting a per-halfword sum in $wk4 without touching map,
+; then issues the reads for the next four pixels, which
+; notAnotBTally16_16_128bits_tail consumes.
+; NOTE(review): Read2Words is defined outside this chunk; register
+; destinations are inferred from its second argument - confirm.
+        MACRO
+        notAnotBTally16_16_128bits_head $src, $fixed_skew, $intra_preloads
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        MOV     scratch, #0
+        notAnotBTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        MEND
+
+; Second half of the 128-bit step: compares the four pixels already held in
+; $wk0-$wk3, adds them to the sum in $wk4 and, because $last is set, adds
+; both halfwords of $wk4 to map.
+        MACRO
+        notAnotBTally16_16_128bits_tail $src
+        MOV     scratch, #0
+        notAnotBTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+        MEND
+
+; Instantiate the notAnotBTally operation for 16bpp against 16bpp.  Five
+; work registers are named (orig_w is the fifth, i.e. wk4) and the init and
+; cleanup hooks are supplied; newline and reinitwk are left empty.
+; NOTE(review): GenerateFunctions is defined outside this chunk; it
+; presumably picks up the notAnotBTally16_16_* macros by name - confirm.
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAnotBTally GenerateFunctions 16, 16,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,skew,orig_w", \
+  "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+; Setup for notAmatchBTally16_16: loads 0x0001 into each halfword of bitptrs
+; (passed as $ones by the size macros below), zeroes the running tally in map
+; and copies the low halfword of ht and of ht_info into their high halfwords
+; so each 16-bit colour can be compared against two pixels at once.
+        MACRO
+        notAmatchBTally16_16_init
+        LDR     bitptrs, =0x00010001
+        MOV     map, #0
+        PKHBT   ht, ht, LSL #16     ; replicate the constant colours across words
+        PKHBT   ht_info, ht_info, LSL #16
+        MEND
+
+; Cleanup hook for notAmatchBTally16_16: copies the tally accumulated in map
+; into a1 (presumably the function result - the hook is expanded by
+; GenerateFunctions, which is not visible here).
+        MACRO
+        notAmatchBTally16_16_cleanup
+        MOV     a1, map
+        MEND
+
+; Tally a single 16bpp pixel: adds 1 to map when the upper halfword of $srcA
+; differs from the colour in ht AND the upper halfword of $srcB equals the
+; colour in ht_info.  The lower halfwords are compared too but their result
+; is discarded.  The only caller in this file passes $zeros = 0 and
+; $ones = 0x00010001.  Corrupts $srcA, $srcB and the GE flags.
+        MACRO
+        notAmatchBTally16_16_1pixel $srcA, $srcB, $zeros, $ones
+        ; XOR leaves a zero halfword wherever the pixel equals the colour
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0.  The SEL
+        ; operand order is swapped between A and B: A yields 1 on a mismatch,
+        ; B yields 1 on a match.
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $zeros, $ones
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $ones, $zeros
+        AND     $srcA, $srcA, $srcB
+        ; add only the upper halfword of the combined result to the tally
+        UXTAH   map, map, $srcA, ROR #16
+        MEND
+
+; Compare both 16bpp pixels of $srcA/$srcB at once.  Each halfword becomes 1
+; where $srcA differs from the ht colour and $srcB equals the ht_info
+; colour.  $first and $last are assembly-time flags:
+;   both set  - the two halfword results are added straight to map;
+;   otherwise - results are summed per halfword in $wk4 ($first starts the
+;               sum, later calls add to it) and only the $last call adds
+;               both halfwords of $wk4 to map.
+; Callers pass $zeros = 0 and $ones = 0x00010001.  Corrupts $srcA, $srcB and
+; the GE flags.
+        MACRO
+        notAmatchBTally16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0 (a match);
+        ; A selects 1 on a mismatch, B selects 1 on a match
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $zeros, $ones
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $ones, $zeros
+    [ $first :LAND: $last  ; avoid touching wk4
+        AND     $srcA, $srcA, $srcB
+        UXTAH   map, map, $srcA
+        UXTAH   map, map, $srcA, ROR #16
+    |
+      [ $first
+        AND     $wk4, $srcA, $srcB
+      |
+        AND     $srcA, $srcA, $srcB
+        ADD     $wk4, $wk4, $srcA
+      ]
+      [ $last
+        UXTAH   map, map, $wk4
+        UXTAH   map, map, $wk4, ROR #16
+      ]
+    ]
+        MEND
+
+; Four pixels as two 2-pixel steps.  $first is forwarded only to the first
+; step and $last only to the second, so the pair always goes through the
+; $wk4 accumulation path of notAmatchBTally16_16_2pixels.
+        MACRO
+        notAmatchBTally16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+        notAmatchBTally16_16_2pixels $first,  {FALSE}, $srcA0, $srcB0, $zeros, $ones
+        notAmatchBTally16_16_2pixels {FALSE}, $last,   $srcA1, $srcB1, $zeros, $ones
+        MEND
+
+; 16-bit (single pixel) step: zeroes scratch for use as $zeros and tallies
+; the pixel with $dst as the A operand and $src as the B operand.
+; $fixed_skew is not used here.
+        MACRO
+        notAmatchBTally16_16_16bits $src, $dst, $fixed_skew
+        MOV     scratch, #0
+        notAmatchBTally16_16_1pixel $dst, $src, scratch, bitptrs
+        MEND
+
+; 32-bit (two pixel) step: reads one word from each operand and tallies both
+; pixels directly into map ($wk0 is the A operand, $wk1 the B operand).
+; NOTE(review): Read1Word is defined outside this chunk; presumably the
+; second argument is the index of the work register that receives the word
+; and scratch is only a temporary for the src read - confirm.
+        MACRO
+        notAmatchBTally16_16_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        ; scratch is zeroed after the reads because it doubles as $zeros
+        MOV     scratch, #0
+        notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, scratch, bitptrs
+        MEND
+
+; 64-bit (four pixel) step: reads two words from each operand and tallies
+; them as two independent 2-pixel steps, pairing $wk0 with $wk2 and $wk1
+; with $wk3.
+; NOTE(review): Read2Words is defined outside this chunk; the pairing
+; implies dst lands in $wk0-$wk1 and src in $wk2-$wk3 - confirm.
+        MACRO
+        notAmatchBTally16_16_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        ; scratch is zeroed after the reads because it doubles as $zeros
+        MOV     scratch, #0
+        notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, scratch, bitptrs
+        notAmatchBTally16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, scratch, bitptrs
+        MEND
+
+; First half of the 128-bit (eight pixel) step: reads and compares the first
+; four pixels, starting a per-halfword sum in $wk4 without touching map,
+; then issues the reads for the next four pixels, which
+; notAmatchBTally16_16_128bits_tail consumes.
+; NOTE(review): Read2Words is defined outside this chunk; register
+; destinations are inferred from its second argument - confirm.
+        MACRO
+        notAmatchBTally16_16_128bits_head $src, $fixed_skew, $intra_preloads
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        MOV     scratch, #0
+        notAmatchBTally16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        MEND
+
+; Second half of the 128-bit step: compares the four pixels already held in
+; $wk0-$wk3, adds them to the sum in $wk4 and, because $last is set, adds
+; both halfwords of $wk4 to map.
+        MACRO
+        notAmatchBTally16_16_128bits_tail $src
+        MOV     scratch, #0
+        notAmatchBTally16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, scratch, bitptrs
+        MEND
+
+; Instantiate the notAmatchBTally operation for 16bpp against 16bpp.  Five
+; work registers are named (orig_w is the fifth, i.e. wk4) and the init and
+; cleanup hooks are supplied; newline and reinitwk are left empty.
+; NOTE(review): GenerateFunctions is defined outside this chunk; it
+; presumably picks up the notAmatchBTally16_16_* macros by name - confirm.
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAmatchBTally GenerateFunctions 16, 16,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,skew,orig_w", \
+  "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+; Setup for pixelMatchTest16_16: loads 0x0001 into each halfword of bitptrs
+; (passed as $ones by the size macros below), sets map to 0 (the size macros
+; pass map as $zeros and nothing in this family writes it afterwards) and
+; copies the low halfword of ht and of ht_info into their high halfwords so
+; each 16-bit colour can be compared against two pixels at once.
+        MACRO
+        pixelMatchTest16_16_init
+        LDR     bitptrs, =0x00010001
+        MOV     map, #0
+        PKHBT   ht, ht, LSL #16     ; replicate the constant colours across words
+        PKHBT   ht_info, ht_info, LSL #16
+        MEND
+
+; Cleanup hook for pixelMatchTest16_16.  Falling into this macro means no
+; pixel macro branched out, so a1 is set to 0.  Label 90 is the target of
+; the "BNE %FA90" in the pixel macros: it adds num_line_saved_regs words to
+; sp and sets a1 to 1.
+; NOTE(review): the sp adjustment presumably discards the line variables
+; spilled because of FLAG_SPILL_LINE_VARS - confirm against GenerateFunctions.
+        MACRO
+        pixelMatchTest16_16_cleanup
+        MOV     a1, #0
+        B       %FT95
+90      ADD     sp, sp, #num_line_saved_regs * 4
+        MOV     a1, #1
+95
+        MEND
+
+; Test a single 16bpp pixel: branches to label 90 when the upper halfword of
+; $srcA equals the colour in ht AND the upper halfword of $srcB equals the
+; colour in ht_info.  The only caller in this file passes $zeros = map
+; (zero) and $ones = 0x00010001.  Corrupts $srcA, $srcB and the flags.
+        MACRO
+        pixelMatchTest16_16_1pixel $srcA, $srcB, $zeros, $ones
+        ; XOR leaves a zero halfword wherever the pixel equals the colour
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0; SEL then
+        ; picks the halfword from $ones (match) or from $zeros (no match)
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $ones, $zeros
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $ones, $zeros
+        AND     $srcA, $srcA, $srcB
+        ; bit 16 is the 1 belonging to the upper halfword
+        TST     $srcA, #0x10000
+        BNE     %FA90
+        MEND
+
+; Test both 16bpp pixels of $srcA/$srcB at once.  Each halfword becomes 1
+; where $srcA equals the ht colour and $srcB equals the ht_info colour.
+; $first and $last are assembly-time flags:
+;   both set  - TST checks the two results directly;
+;   $first    - $wk4 is loaded with the combined result (flags untouched);
+;   otherwise - the combined result is ORed into $wk4, setting the flags
+;               from everything collected so far.
+; When $last is set the macro branches to label 90 if the flags are non-zero.
+; Callers pass $zeros = map (zero) and $ones = 0x00010001.  Corrupts $srcA,
+; $srcB and the flags.
+        MACRO
+        pixelMatchTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0 (a match)
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $ones, $zeros
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $ones, $zeros
+    [ $first :LAND: $last  ; avoid touching wk4
+        TST     $srcA, $srcB
+    |
+      [ $first
+        AND     $wk4, $srcA, $srcB
+      |
+        AND     $srcA, $srcA, $srcB
+        ORRS    $wk4, $wk4, $srcA
+      ]
+    ]
+      [ $last
+        BNE     %FA90
+      ]
+        MEND
+
+; Four pixels as two 2-pixel steps.  $first is forwarded only to the first
+; step and $last only to the second, so the pair always collects its result
+; in $wk4 and at most the second step branches.
+        MACRO
+        pixelMatchTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+        pixelMatchTest16_16_2pixels $first,  {FALSE}, $srcA0, $srcB0, $zeros, $ones
+        pixelMatchTest16_16_2pixels {FALSE}, $last,   $srcA1, $srcB1, $zeros, $ones
+        MEND
+
+; 16-bit (single pixel) step: tests the pixel with $dst as the A operand and
+; $src as the B operand, using map as the zero register.  $fixed_skew is
+; not used here.
+        MACRO
+        pixelMatchTest16_16_16bits $src, $dst, $fixed_skew
+        pixelMatchTest16_16_1pixel $dst, $src, map, bitptrs
+        MEND
+
+; 32-bit (two pixel) step: reads one word from each operand and tests both
+; pixels, branching to label 90 on a hit ($wk0 is the A operand, $wk1 the B
+; operand).
+; NOTE(review): Read1Word is defined outside this chunk; presumably the
+; second argument is the index of the work register that receives the word
+; - confirm.
+        MACRO
+        pixelMatchTest16_16_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, map, bitptrs
+        MEND
+
+; 64-bit (four pixel) step: reads two words from each operand and tests them
+; as two independent 2-pixel steps, pairing $wk0 with $wk2 and $wk1 with
+; $wk3; either step may branch to label 90.
+; NOTE(review): Read2Words is defined outside this chunk; the pairing
+; implies dst lands in $wk0-$wk1 and src in $wk2-$wk3 - confirm.
+        MACRO
+        pixelMatchTest16_16_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, map, bitptrs
+        pixelMatchTest16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, map, bitptrs
+        MEND
+
+; First half of the 128-bit (eight pixel) step: reads and compares the first
+; four pixels, collecting the result in $wk4 without branching, then issues
+; the reads for the next four pixels, which pixelMatchTest16_16_128bits_tail
+; consumes.
+; NOTE(review): Read2Words is defined outside this chunk; register
+; destinations are inferred from its second argument - confirm.
+        MACRO
+        pixelMatchTest16_16_128bits_head $src, $fixed_skew, $intra_preloads
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        pixelMatchTest16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        MEND
+
+; Second half of the 128-bit step: compares the four pixels already held in
+; $wk0-$wk3, ORs the results into $wk4 and, because $last is set, branches
+; to label 90 if anything collected in $wk4 is non-zero.
+        MACRO
+        pixelMatchTest16_16_128bits_tail $src
+        pixelMatchTest16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs
+        MEND
+
+; Instantiate the pixelMatchTest operation for 16bpp against 16bpp.  Five
+; work registers are named (orig_w is the fifth, i.e. wk4) and the init and
+; cleanup hooks are supplied; newline and reinitwk are left empty.
+; NOTE(review): GenerateFunctions is defined outside this chunk; it
+; presumably picks up the pixelMatchTest16_16_* macros by name - confirm.
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+pixelMatchTest GenerateFunctions 16, 16,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,skew,orig_w", \
+  "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+; Setup for notAnotBTest16_16: loads 0x0001 into each halfword of bitptrs
+; (passed as $ones by the size macros below), sets map to 0 (the size macros
+; pass map as $zeros and nothing in this family writes it afterwards) and
+; copies the low halfword of ht and of ht_info into their high halfwords so
+; each 16-bit colour can be compared against two pixels at once.
+        MACRO
+        notAnotBTest16_16_init
+        LDR     bitptrs, =0x00010001
+        MOV     map, #0
+        PKHBT   ht, ht, LSL #16     ; replicate the constant colours across words
+        PKHBT   ht_info, ht_info, LSL #16
+        MEND
+
+; Cleanup hook for notAnotBTest16_16.  Falling into this macro means no
+; pixel macro branched out, so a1 is set to 0.  Label 90 is the target of
+; the "BNE %FA90" in the pixel macros: it adds num_line_saved_regs words to
+; sp and sets a1 to 1.
+; NOTE(review): the sp adjustment presumably discards the line variables
+; spilled because of FLAG_SPILL_LINE_VARS - confirm against GenerateFunctions.
+        MACRO
+        notAnotBTest16_16_cleanup
+        MOV     a1, #0
+        B       %FT95
+90      ADD     sp, sp, #num_line_saved_regs * 4
+        MOV     a1, #1
+95
+        MEND
+
+; Test a single 16bpp pixel: branches to label 90 when the upper halfword of
+; $srcA differs from the colour in ht AND the upper halfword of $srcB
+; differs from the colour in ht_info.  The only caller in this file passes
+; $zeros = map (zero) and $ones = 0x00010001.  Corrupts $srcA, $srcB and the
+; flags.
+        MACRO
+        notAnotBTest16_16_1pixel $srcA, $srcB, $zeros, $ones
+        ; XOR leaves a zero halfword wherever the pixel equals the colour
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0; SEL then
+        ; picks the halfword from $zeros (match) or from $ones (mismatch)
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $zeros, $ones
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $zeros, $ones
+        AND     $srcA, $srcA, $srcB
+        ; bit 16 is the 1 belonging to the upper halfword
+        TST     $srcA, #0x10000
+        BNE     %FA90
+        MEND
+
+; Test both 16bpp pixels of $srcA/$srcB at once.  Each halfword becomes 1
+; where $srcA differs from the ht colour and $srcB differs from the ht_info
+; colour.  $first and $last are assembly-time flags:
+;   both set  - TST checks the two results directly;
+;   $first    - $wk4 is loaded with the combined result (flags untouched);
+;   otherwise - the combined result is ORed into $wk4, setting the flags
+;               from everything collected so far.
+; When $last is set the macro branches to label 90 if the flags are non-zero.
+; Callers pass $zeros = map (zero) and $ones = 0x00010001.  Corrupts $srcA,
+; $srcB and the flags.
+        MACRO
+        notAnotBTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0 (a match)
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $zeros, $ones
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $zeros, $ones
+    [ $first :LAND: $last  ; avoid touching wk4
+        TST     $srcA, $srcB
+    |
+      [ $first
+        AND     $wk4, $srcA, $srcB
+      |
+        AND     $srcA, $srcA, $srcB
+        ORRS    $wk4, $wk4, $srcA
+      ]
+    ]
+      [ $last
+        BNE     %FA90
+      ]
+        MEND
+
+; Four pixels as two 2-pixel steps.  $first is forwarded only to the first
+; step and $last only to the second, so the pair always collects its result
+; in $wk4 and at most the second step branches.
+        MACRO
+        notAnotBTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+        notAnotBTest16_16_2pixels $first,  {FALSE}, $srcA0, $srcB0, $zeros, $ones
+        notAnotBTest16_16_2pixels {FALSE}, $last,   $srcA1, $srcB1, $zeros, $ones
+        MEND
+
+; 16-bit (single pixel) step: tests the pixel with $dst as the A operand and
+; $src as the B operand, using map as the zero register.  $fixed_skew is
+; not used here.
+        MACRO
+        notAnotBTest16_16_16bits $src, $dst, $fixed_skew
+        notAnotBTest16_16_1pixel $dst, $src, map, bitptrs
+        MEND
+
+; 32-bit (two pixel) step: reads one word from each operand and tests both
+; pixels, branching to label 90 on a hit ($wk0 is the A operand, $wk1 the B
+; operand).
+; NOTE(review): Read1Word is defined outside this chunk; presumably the
+; second argument is the index of the work register that receives the word
+; - confirm.
+        MACRO
+        notAnotBTest16_16_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch
+        notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk1, map, bitptrs
+        MEND
+
+; 64-bit (four pixel) step: reads two words from each operand and tests them
+; as two independent 2-pixel steps, pairing $wk0 with $wk2 and $wk1 with
+; $wk3; either step may branch to label 90.
+; NOTE(review): Read2Words is defined outside this chunk; the pairing
+; implies dst lands in $wk0-$wk1 and src in $wk2-$wk3 - confirm.
+        MACRO
+        notAnotBTest16_16_64bits $src, $fixed_skew
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk0, $wk2, map, bitptrs
+        notAnotBTest16_16_2pixels {TRUE}, {TRUE}, $wk1, $wk3, map, bitptrs
+        MEND
+
+; First half of the 128-bit (eight pixel) step: reads and compares the first
+; four pixels, collecting the result in $wk4 without branching, then issues
+; the reads for the next four pixels, which notAnotBTest16_16_128bits_tail
+; consumes.
+; NOTE(review): Read2Words is defined outside this chunk; register
+; destinations are inferred from its second argument - confirm.
+        MACRO
+        notAnotBTest16_16_128bits_head $src, $fixed_skew, $intra_preloads
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        notAnotBTest16_16_4pixels {TRUE}, {FALSE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs
+        Read2Words dst, 0,, 0
+        Read2Words src, 2, carry, $fixed_skew, skew, scratch
+        MEND
+
+; Second half of the 128-bit step: compares the four pixels already held in
+; $wk0-$wk3, ORs the results into $wk4 and, because $last is set, branches
+; to label 90 if anything collected in $wk4 is non-zero.
+        MACRO
+        notAnotBTest16_16_128bits_tail $src
+        notAnotBTest16_16_4pixels {FALSE}, {TRUE}, $wk0, $wk1, $wk2, $wk3, map, bitptrs
+        MEND
+
+; Instantiate the notAnotBTest operation for 16bpp against 16bpp.  Five
+; work registers are named (orig_w is the fifth, i.e. wk4) and the init and
+; cleanup hooks are supplied; newline and reinitwk are left empty.
+; NOTE(review): GenerateFunctions is defined outside this chunk; it
+; presumably picks up the notAnotBTest16_16_* macros by name - confirm.
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $reinitwk, $cleanup
+
+notAnotBTest GenerateFunctions 16, 16,, \
+  FLAG_VECTOR_HALFTONE :OR: FLAG_DST_READONLY :OR: FLAG_SPILL_LINE_VARS, 2, \
+  "y,stride_d,stride_s,skew,orig_w", \
+  "x,y,stride_d,stride_s", orig_w,, init,,, cleanup ; leading_pixels_reg = wk4
+
+; ********************************************************************
+
+; Setup for notAmatchBTest16_16: loads 0x0001 into each halfword of bitptrs
+; (passed as $ones by the size macros below), sets map to 0 (the size macros
+; visible here pass map as $zeros) and copies the low halfword of ht and of
+; ht_info into their high halfwords so each 16-bit colour can be compared
+; against two pixels at once.
+        MACRO
+        notAmatchBTest16_16_init
+        LDR     bitptrs, =0x00010001
+        MOV     map, #0
+        PKHBT   ht, ht, LSL #16     ; replicate the constant colours across words
+        PKHBT   ht_info, ht_info, LSL #16
+        MEND
+
+; Cleanup hook for notAmatchBTest16_16.  Falling into this macro means no
+; pixel macro branched out, so a1 is set to 0.  Label 90 is the target of
+; the "BNE %FA90" in the pixel macros: it adds num_line_saved_regs words to
+; sp and sets a1 to 1.
+; NOTE(review): the sp adjustment presumably discards spilled line
+; variables - confirm against GenerateFunctions, which is not visible here.
+        MACRO
+        notAmatchBTest16_16_cleanup
+        MOV     a1, #0
+        B       %FT95
+90      ADD     sp, sp, #num_line_saved_regs * 4
+        MOV     a1, #1
+95
+        MEND
+
+; Test a single 16bpp pixel: branches to label 90 when the upper halfword of
+; $srcA differs from the colour in ht AND the upper halfword of $srcB equals
+; the colour in ht_info.  The only caller in this file passes $zeros = map
+; (zero) and $ones = 0x00010001.  Corrupts $srcA, $srcB and the flags.
+        MACRO
+        notAmatchBTest16_16_1pixel $srcA, $srcB, $zeros, $ones
+        ; XOR leaves a zero halfword wherever the pixel equals the colour
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0.  The SEL
+        ; operand order is swapped between A and B: A yields 1 on a mismatch,
+        ; B yields 1 on a match.
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $zeros, $ones
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $ones, $zeros
+        AND     $srcA, $srcA, $srcB
+        ; bit 16 is the 1 belonging to the upper halfword
+        TST     $srcA, #0x10000
+        BNE     %FA90
+        MEND
+
+; Test both 16bpp pixels of $srcA/$srcB at once.  Each halfword becomes 1
+; where $srcA differs from the ht colour and $srcB equals the ht_info
+; colour.  $first and $last are assembly-time flags:
+;   both set  - TST checks the two results directly;
+;   $first    - $wk4 is loaded with the combined result (flags untouched);
+;   otherwise - the combined result is ORed into $wk4, setting the flags
+;               from everything collected so far.
+; When $last is set the macro branches to label 90 if the flags are non-zero.
+; Callers pass $zeros = map (zero) and $ones = 0x00010001.  Corrupts $srcA,
+; $srcB and the flags.
+        MACRO
+        notAmatchBTest16_16_2pixels $first, $last, $srcA, $srcB, $zeros, $ones
+        EOR     $srcA, $srcA, ht
+        EOR     $srcB, $srcB, ht_info
+        ; 0 - x sets the halfword's GE flags only when x == 0 (a match);
+        ; A selects 1 on a mismatch, B selects 1 on a match
+        USUB16  $srcA, $zeros, $srcA
+        SEL     $srcA, $zeros, $ones
+        USUB16  $srcB, $zeros, $srcB
+        SEL     $srcB, $ones, $zeros
+    [ $first :LAND: $last  ; avoid touching wk4
+        TST     $srcA, $srcB
+    |
+      [ $first
+        AND     $wk4, $srcA, $srcB
+      |
+        AND     $srcA, $srcA, $srcB
+        ORRS    $wk4, $wk4, $srcA
+      ]
+    ]
+      [ $last
+        BNE     %FA90
+      ]
+        MEND
+
+; Four pixels as two 2-pixel steps.  $first is forwarded only to the first
+; step and $last only to the second, so the pair always collects its result
+; in $wk4 and at most the second step branches.
+        MACRO
+        notAmatchBTest16_16_4pixels $first, $last, $srcA0, $srcA1, $srcB0, $srcB1, $zeros, $ones
+        notAmatchBTest16_16_2pixels $first,  {FALSE}, $srcA0, $srcB0, $zeros, $ones
+        notAmatchBTest16_16_2pixels {FALSE}, $last,   $srcA1, $srcB1, $zeros, $ones
+        MEND
+
+; 16-bit (single pixel) step: tests the pixel with $dst as the A operand and
+; $src as the B operand, using map as the zero register.  $fixed_skew is
+; not used here.
+        MACRO
+        notAmatchBTest16_16_16bits $src, $dst, $fixed_skew
+        notAmatchBTest16_16_1pixel $dst, $src, map, bitptrs
+        MEND
+
+        MACRO
+        notAmatchBTest16_16_32bits $src, $dst, $fixed_skew
+        Read1Word dst, 0,, 0
+        Read1Word src, 1, carry, $fixed_skew, skew, scratch

@@ Diff output truncated at 50000 characters. @@


More information about the Vm-dev mailing list