[Vm-dev] [commit][2766] add optimised BitBlt support

Thu Aug 15 14:17:31 UTC 2013

Revision: 2766
Author:   piumarta
Date:     2013-08-15 07:17:30 -0700 (Thu, 15 Aug 2013)
Log Message:
-----------
add optimised BitBlt support

Modified Paths:
--------------
    trunk/platforms/unix/vm-display-X11/config.cmake
    trunk/platforms/unix/vm-display-X11/sqUnixX11.c

Added Paths:
-----------
    trunk/platforms/unix/vm-display-X11/sqUnixX11Arm.S

Modified: trunk/platforms/unix/vm-display-X11/config.cmake
===================================================================

--- trunk/platforms/unix/vm-display-X11/config.cmake	2013-08-15 00:03:27 UTC (rev 2765)
+++ trunk/platforms/unix/vm-display-X11/config.cmake	2013-08-15 14:17:30 UTC (rev 2766)
@@ -22,6 +22,18 @@
       SET (USE_X11_GLX 1)
     ENDIF (HAVE_GL_GL_H AND HAVE_GL_GLX_H)
   ENDIF (OPENGL_FOUND)
+  IF (DEFINED ENABLE-FAST-BLT)
+    SET (USE_FAST_BLT 1)
+    IF (vm-host-cpu MATCHES "arm")
+      ENABLE_LANGUAGE (ASM)
+      SET (CMAKE_ASM_COMPILE_OBJECT "asasm -cpu 6 -I ${cross}/plugins/BitBltPlugin -o <OBJECT> <SOURCE>")
+      SET (${plugin}_sources ${${plugin}_sources} "${unix}/${plugin}/sqUnixX11Arm.S")
+    ELSE ()
+      MESSAGE (FATAL_ERROR "
+  --enableFastBlt is not supported on this platform
+"     )
+    ENDIF ()
+  ENDIF (DEFINED ENABLE-FAST-BLT)
   PLUGIN_INCLUDE_DIRECTORIES (${cross}/plugins/B3DAcceleratorPlugin ${cross}/plugins/FilePlugin)
   PLUGIN_INCLUDE_DIRECTORIES (${X11_INCLUDE_DIR} ${OPENGL_INCLUDE_DIR})
   PLUGIN_LINK_LIBRARIES (${X11_LIBRARIES} ${OPENGL_LIBRARIES})
@@ -29,6 +41,7 @@
 
 CONFIG_DEFINE (USE_X11)
 CONFIG_DEFINE (USE_X11_GLX)
+CONFIG_DEFINE (USE_FAST_BLT)
 
 CONFIG_DEFINE (HAVE_LIBXEXT)
 CONFIG_DEFINE (HAVE_LIBXRENDER)

Modified: trunk/platforms/unix/vm-display-X11/sqUnixX11.c
===================================================================
--- trunk/platforms/unix/vm-display-X11/sqUnixX11.c	2013-08-15 00:03:27 UTC (rev 2765)
+++ trunk/platforms/unix/vm-display-X11/sqUnixX11.c	2013-08-15 14:17:30 UTC (rev 2766)
@@ -27,7 +27,7 @@
 
 /* Author: Ian Piumarta <ian.piumarta at squeakland.org>
  *
- * Last edited: 2012-06-27 05:07:55 by piumarta on ubuntu32-1204
+ * Last edited: 2013-08-11 23:12:08 by piumarta on emilia
  *
  * Support for more intelligent CLIPBOARD selection handling contributed by:
  *	Ned Konz <ned at bike-nomad.com>
@@ -65,6 +65,15 @@
 #undef HAVE_OPENGL_GL_H		/* don't include Quartz OpenGL if configured */
 #include "SqDisplay.h"
 
+#if defined(USE_FAST_BLT)
+  /* XXX referring to plugin variables *requires* BitBltPlugin to be included by VMM as an internal plugin */
+# if defined(__arm__)
+#   include "../../../Cross/plugins/BitBltPlugin/BitBltArm.h"
+# else
+#   error configuration error
+# endif
+#endif
+
 #if defined(ioMSecs)
 # undef ioMSecs
 #endif
@@ -5028,17 +5037,68 @@
     }
 }
 
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+
+    extern void armSimdConvert_x888_8_LEPacking32_8_wide(unsigned int width, unsigned int height,
+							 unsigned int *dst, unsigned int dstStride,
+							 unsigned int *src, unsigned int srcStride,
+							 unsigned int halftone, unsigned int halftoneInfo,
+							 unsigned int *colourMap);
+
+    extern void armSimdConvert_x888_8_LEPacking32_8_narrow(unsigned int width, unsigned int height,
+							   unsigned int *dst, unsigned int dstStride,
+							   unsigned int *src, unsigned int srcStride,
+							   unsigned int halftone, unsigned int halftoneInfo,
+							   unsigned int *colourMap);
+
+    static void armSimdCopyImage32To8(int *fromImageData, int *toImageData, int width, int height,
+				      int affectedL, int affectedT, int affectedR, int affectedB,
+				      unsigned int *downGradingColors)
+    {
+      /* Find image strides in 32-bit words */
+      unsigned int srcStride= width;
+      unsigned int dstStride= (width + 3) >> 2;
+      /* Round affected region out to encompass complete words in both images */
+      affectedL &= ~3;
+      affectedR= (affectedR + 3) &~ 3;
+      width=  affectedR - affectedL;
+      height= affectedB - affectedT;
+      /* Find first words */
+      fromImageData += srcStride * affectedT + affectedL;
+      toImageData += dstStride * affectedT + (affectedL >> 2);
+      /* Adjust strides to remove number of words read/written */
+      srcStride -= affectedR - affectedL;
+      dstStride -= (affectedR - affectedL) >> 2;
+      /* Work out which width class this operation is. */
+      if (width > (128 - 32) / 8 && ((-1 ^ (width -(128 - 32) / 8)) & ~(31 / 8)))
+	armSimdConvert_x888_8_LEPacking32_8_wide(width, height, toImageData, dstStride, fromImageData, srcStride, 0, 0, downGradingColors);
+      else
+	armSimdConvert_x888_8_LEPacking32_8_narrow(width, height, toImageData, dstStride, fromImageData, srcStride, 0, 0, downGradingColors);
+    }
+# else
+#   error configuration error
+# endif
+#endif
+
 void copyImage32To8(int *fromImageData, int *toImageData, int width, int height,
 		    int affectedL, int affectedT, int affectedR, int affectedB)
 {
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+    armSimdCopyImage32To8(fromImageData, toImageData, width, height, affectedL, affectedT, affectedR, affectedB, stDownGradingColors);
+# else
+#   error configuration error
+# endif
+#else
   int scanLine32, firstWord32, lastWord32;
   int scanLine8, firstWord8;
   int line;
 
-#define map32To8(w) (col= (w), stDownGradingColors[\
-  (((col >> (16+(8-3))) & 0x7) << 5) | \
-  (((col >> ( 8+(8-3))) & 0x7) << 2) | \
-   ((col >> ( 0+(8-2))) & 0x7)])
+# define map32To8(w) (col= (w), stDownGradingColors[\
+    (((col >> (16+(8-3))) & 0x7) << 5) | \
+    (((col >> ( 8+(8-3))) & 0x7) << 2) | \
+     ((col >> ( 0+(8-2))) & 0x7)])
 
   scanLine32= bytesPerLine(width, 32);
   firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
@@ -5061,8 +5121,9 @@
     firstWord32+= scanLine32;
     lastWord32+= scanLine32;
     firstWord8+= scanLine8;
+# undef map32To8
   }
-#undef map32To8
+#endif /* !USE_FAST_BLT */
 }
 
 void copyImage16To32(int *fromImageData, int *toImageData, int width, int height,
@@ -5187,47 +5248,96 @@
 #undef map16To24
 }
 
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
 
+    extern void armSimdConvert_x888_0565_LEPacking32_16_wide(unsigned int width, unsigned int height,
+							     unsigned int *dst, unsigned int dstStride,
+							     unsigned int *src, unsigned int srcStride);
+
+    extern void armSimdConvert_x888_0565_LEPacking32_16_narrow(unsigned int width, unsigned int height,
+							       unsigned int *dst, unsigned int dstStride,
+							       unsigned int *src, unsigned int srcStride);
+    static void armSimdCopyImage32To16(int *fromImageData, int *toImageData, int width, int height,
+				       int affectedL, int affectedT, int affectedR, int affectedB)
+    {
+      /* Find image strides in 32-bit words */
+      unsigned int srcStride= width;
+      unsigned int dstStride= (width + 1) >> 1;
+      /* Round affected region out to encompass complete words in both images */
+      affectedL &= ~1;
+      affectedR += affectedR & 1;
+      width=  affectedR - affectedL;
+      height= affectedB - affectedT;
+      /* Find first words */
+      fromImageData += srcStride * affectedT + affectedL;
+      toImageData += dstStride * affectedT + (affectedL >> 1);
+      /* Adjust strides to remove number of words read/written */
+      srcStride -= affectedR - affectedL;
+      dstStride -= (affectedR - affectedL) >> 1;
+      /* Work out which width class this operation is. */
+      if (width > (128 - 32) / 16 && ((-1 ^ (width - (128 - 32) / 16)) & ~(31 / 16)))
+	  armSimdConvert_x888_0565_LEPacking32_16_wide(width, height, toImageData, dstStride, fromImageData, srcStride);
+      else
+	  armSimdConvert_x888_0565_LEPacking32_16_narrow(width, height, toImageData, dstStride, fromImageData, srcStride);
+    }
+
+# else
+#   error configuration error
+# endif
+#endif
+
 void copyImage32To16(int *fromImageData, int *toImageData, int width, int height,
 		     int affectedL, int affectedT, int affectedR, int affectedB)
 {
-  int scanLine32, firstWord32, lastWord32;
-  int scanLine16, firstWord16;
-  int line;
-  int rshift, gshift, bshift;
-  register unsigned int col;
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+  if (stRNMask == 5 && stRShift == 11 && stGNMask == 6 && stGShift == 5 && stBNMask == 5 && stBShift == 0)
+    armSimdCopyImage32To16(fromImageData, toImageData, width, height, affectedL, affectedT, affectedR, affectedB);
+  else
+# else
+#  error configuration error
+# endif
+#endif
+  {
+    int scanLine32, firstWord32, lastWord32;
+    int scanLine16, firstWord16;
+    int line;
+    int rshift, gshift, bshift;
+    register unsigned int col;
 
-  rshift= stRNMask-5 + stRShift;
-  gshift= stGNMask-5 + stGShift;
-  bshift= stBNMask-5 + stBShift;
+    rshift= stRNMask-5 + stRShift;
+    gshift= stGNMask-5 + stGShift;
+    bshift= stBNMask-5 + stBShift;
 
-#define map32To16(w) (col= (w), \
-  (((col >> 19) & 0x1f) << rshift) | \
-  (((col >> 11) & 0x1f) << gshift) | \
-  (((col >>  3) & 0x1f) << bshift))
+#   define map32To16(w) (col= (w), \
+      (((col >> 19) & 0x1f) << rshift) | \
+      (((col >> 11) & 0x1f) << gshift) | \
+      (((col >>  3) & 0x1f) << bshift))
 
-  scanLine32= bytesPerLine(width, 32);
-  firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
-  lastWord32= scanLine32*affectedT + bytesPerLine(affectedR, 32);
-  scanLine16= bytesPerLine(width, 16);
-  firstWord16= scanLine16*affectedT + (bytesPerLineRD(affectedL, 32) >> 1);
+    scanLine32= bytesPerLine(width, 32);
+    firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
+    lastWord32= scanLine32*affectedT + bytesPerLine(affectedR, 32);
+    scanLine16= bytesPerLine(width, 16);
+    firstWord16= scanLine16*affectedT + (bytesPerLineRD(affectedL, 32) >> 1);
 
-  for (line= affectedT; line < affectedB; line++)
-    {
-      register unsigned int *from= (unsigned int *)((long)fromImageData+firstWord32);
-      register unsigned int *limit= (unsigned int *)((long)fromImageData+lastWord32);
-      register unsigned short *to= (unsigned short *)((long)toImageData+firstWord16);
-      while (from < limit)
-	{
-	  to[0]= map32To16(from[0]);
-	  from++;
-	  to++;
-	}
-      firstWord32+= scanLine32;
-      lastWord32+= scanLine32;
-      firstWord16+= scanLine16;
-    }
-#undef map32To16
+    for (line= affectedT; line < affectedB; line++)
+      {
+	register unsigned int *from= (unsigned int *)((long)fromImageData+firstWord32);
+	register unsigned int *limit= (unsigned int *)((long)fromImageData+lastWord32);
+	register unsigned short *to= (unsigned short *)((long)toImageData+firstWord16);
+	while (from < limit)
+	  {
+	    to[0]= map32To16(from[0]);
+	    from++;
+	    to++;
+	  }
+	firstWord32+= scanLine32;
+	lastWord32+= scanLine32;
+	firstWord16+= scanLine16;
+      }
+# undef map32To16
+  }
 }
 
 void copyImage16To16(int *fromImageData, int *toImageData, int width, int height,
@@ -5274,42 +5384,85 @@
 #undef map16To16
 }
 
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+    extern void armSimdConvert_x888_x888BGR_LEPacking32_32_wide(unsigned int width, unsigned int height,
+								unsigned int *dst, unsigned int dstStride,
+								unsigned int *src, unsigned int srcStride);
+
+    extern void armSimdConvert_x888_x888BGR_LEPacking32_32_narrow(unsigned int width, unsigned int height,
+								  unsigned int *dst, unsigned int dstStride,
+								  unsigned int *src, unsigned int srcStride);
+
+    static void armSimdCopyImage32To32(int *fromImageData, int *toImageData, int width, int height,
+				       int affectedL, int affectedT, int affectedR, int affectedB)
+    {
+      unsigned int stride= width;
+      width=  affectedR - affectedL;
+      height= affectedB - affectedT;
+      /* Find first words */
+      fromImageData += stride * affectedT + affectedL;
+      toImageData   += stride * affectedT + affectedL;
+      /* Adjust stride to remove number of words read/written */
+      stride -= width;
+      /* Work out which width class this operation is. */
+      if (width > (128 - 32) / 32 && (-1 ^ (width - (128 - 32) / 32)))
+	armSimdConvert_x888_x888BGR_LEPacking32_32_wide(width, height, toImageData, stride, fromImageData, stride);
+      else
+	armSimdConvert_x888_x888BGR_LEPacking32_32_narrow(width, height, toImageData, stride, fromImageData, stride);
+    }
+# else
+#   error configuration error
+# endif
+#endif
+
 void copyImage32To32(int *fromImageData, int *toImageData, int width, int height,
 		     int affectedL, int affectedT, int affectedR, int affectedB)
 {
-  int scanLine32, firstWord32, lastWord32;
-  int line;
-  int rshift, gshift, bshift;
-  register unsigned int col;
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+    if ((armCpuFeatures & ARM_V6) && stRNMask == 8 && stRShift == 0 && stGNMask == 8 && stGShift == 8 && stBNMask == 8 && stBShift == 16)
+      armSimdCopyImage32To32(fromImageData, toImageData, width, height, affectedL, affectedT, affectedR, affectedB);
+    else
+# else
+#  error unsupported use of ENABLE_FAST_BLT
+# endif
+#endif
+  {
+    int scanLine32, firstWord32, lastWord32;
+    int line;
+    int rshift, gshift, bshift;
+    register unsigned int col;
 
-  rshift= stRNMask-8 + stRShift;
-  gshift= stGNMask-8 + stGShift;
-  bshift= stBNMask-8 + stBShift;
+    rshift= stRNMask-8 + stRShift;
+    gshift= stGNMask-8 + stGShift;
+    bshift= stBNMask-8 + stBShift;
 
-#define map32To32(w) (col= (w), \
-  (((col >> 16) & 0xff) << rshift) | \
-  (((col >> 8)  & 0xff) << gshift) | \
-   ((col & 0xff) << bshift))
+#  define map32To32(w) (col= (w), \
+    (((col >> 16) & 0xff) << rshift) | \
+    (((col >> 8)  & 0xff) << gshift) | \
+     ((col & 0xff) << bshift))
 
-  scanLine32= bytesPerLine(width, 32);
-  firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
-  lastWord32= scanLine32*affectedT + bytesPerLine(affectedR, 32);
+    scanLine32= bytesPerLine(width, 32);
+    firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
+    lastWord32= scanLine32*affectedT + bytesPerLine(affectedR, 32);
 
-  for (line= affectedT; line < affectedB; line++)
-    {
-      register unsigned int *from= (unsigned int *)((long)fromImageData+firstWord32);
-      register unsigned int *limit= (unsigned int *)((long)fromImageData+lastWord32);
-      register unsigned int *to= (unsigned int *)((long)toImageData+firstWord32);
-      while (from < limit)
-	{
-	  *to= map32To32(*from);
-	  from++;
-	  to++;
-	}
-      firstWord32+= scanLine32;
-      lastWord32+= scanLine32;
-    }
-#undef map32To32
+    for (line= affectedT; line < affectedB; line++)
+      {
+	register unsigned int *from= (unsigned int *)((long)fromImageData+firstWord32);
+	register unsigned int *limit= (unsigned int *)((long)fromImageData+lastWord32);
+	register unsigned int *to= (unsigned int *)((long)toImageData+firstWord32);
+	while (from < limit)
+	  {
+	    *to= map32To32(*from);
+	    from++;
+	    to++;
+	  }
+	firstWord32+= scanLine32;
+	lastWord32+= scanLine32;
+      }
+# undef map32To32
+  }
 }
 
 void copyImage32To32Same(int *fromImageData, int *toImageData,

Added: trunk/platforms/unix/vm-display-X11/sqUnixX11Arm.S
===================================================================
--- trunk/platforms/unix/vm-display-X11/sqUnixX11Arm.S	                        (rev 0)
+++ trunk/platforms/unix/vm-display-X11/sqUnixX11Arm.S	2013-08-15 14:17:30 UTC (rev 2766)
@@ -0,0 +1,246 @@
+;
+; Copyright © 2013 Raspberry Pi Foundation
+; Copyright © 2013 RISC OS Open Ltd
+;
+; Permission to use, copy, modify, distribute, and sell this software and its
+; documentation for any purpose is hereby granted without fee, provided that
+; the above copyright notice appear in all copies and that both that
+; copyright notice and this permission notice appear in supporting
+; documentation, and that the name of the copyright holders not be used in
+; advertising or publicity pertaining to distribution of the software without
+; specific, written prior permission.  The copyright holders make no
+; representations about the suitability of this software for any purpose.  It
+; is provided "as is" without express or implied warranty.
+;
+; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+; SOFTWARE.
+;
+
+; Debug options
+                GBLL    DebugData
+;DebugData       SETL    {TRUE}
+                GBLL    DebugPld
+;DebugPld        SETL    {TRUE}
+                GBLL    VerboseBuild
+;VerboseBuild    SETL    {TRUE}
+
+        GET     BitBltArmSimdAsm.hdr
+
+        AREA    |sqUnixX11Arm$$Code|, CODE, READONLY
+        ARM
+
+; ********************************************************************
+
+        MACRO
+        Convert_x888_8_LEPacking_1pixel $src, $dst
+        AND     $dst, ht, $src, LSR #6     ; 00000000000000rrr0000000000000bb
+        AND     $src, $src, #&E000         ; 0000000000000000ggg0000000000000
+        ORR     $dst, $dst, $dst, LSR #10  ; 00000000000000rrr0000000rrr000bb
+        ORR     $dst, $dst, $src, LSR #11  ; 00000000000000rrr0000000rrrgggbb
+        AND     $dst, $dst, #&FF           ; 000000000000000000000000rrrgggbb
+        LDR     $dst, [map, $dst, LSL #2]
+        MEND
+
+        MACRO
+        Convert_x888_8_LEPacking_4pixels $src0, $src1, $src2, $src3, $dst
+        Convert_x888_8_LEPacking_1pixel $src0, $dst
+        Convert_x888_8_LEPacking_1pixel $src1, $src0
+        Convert_x888_8_LEPacking_1pixel $src2, $src1
+        Convert_x888_8_LEPacking_1pixel $src3, $src2
+        ORR     $dst, $dst, $src0, LSL #8
+        ORR     $dst, $dst, $src1, LSL #16
+        ORR     $dst, $dst, $src2, LSL #24
+        MEND
+
+        MACRO
+        Convert_x888_8_LEPacking32_8_init
+        LDR     ht, =&38003
+        B       %FT00
+        LTORG
+00
+        MEND
+
+        MACRO
+        Convert_x888_8_LEPacking32_8_8bits $src, $dst, $fixed_skew
+        ; This code should never be executed. It's for handling stray
+        ; pixels at the start or end of the row, but for now the
+        ; assembler framework only supports packing these big-endian
+        ; into words, which isn't what we want.
+        MEND
+
+        MACRO
+        Convert_x888_8_LEPacking32_8_16bits $src, $dst, $fixed_skew
+        ; This code should never be executed. It's for handling stray
+        ; pixels at the start or end of the row, but for now the
+        ; assembler framework only supports packing these big-endian
+        ; into words, which isn't what we want.
+        MEND
+
+        MACRO
+        Convert_x888_8_LEPacking32_8_32bits $src, $dst, $fixed_skew
+        Read4Words src, 4, carry, $fixed_skew, skew, unused
+        Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk0
+        Write1Word dst, 0
+        MEND
+
+        MACRO
+        Convert_x888_8_LEPacking32_8_64bits $src, $fixed_skew
+        Read4Words src, 4, carry, $fixed_skew, skew, unused
+        Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk0
+        Read4Words src, 4, carry, $fixed_skew, skew, unused
+        Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk1
+        Write2Words dst, 0
+        MEND
+
+        MACRO
+        Convert_x888_8_LEPacking32_8_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words src, 4, carry, $fixed_skew, skew, unused
+        Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk0
+        Read4Words src, 4, carry, $fixed_skew, skew, unused
+      [ "$intra_preloads" <> ""
+        PreloadMiddle
+      ]
+        Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk1
+        Read4Words src, 4, carry, $fixed_skew, skew, unused
+        Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk2
+        Read4Words src, 4, carry, $fixed_skew, skew, unused
+        MEND
+
+        MACRO
+        Convert_x888_8_LEPacking32_8_128bits_tail $src
+        Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk3
+        Write4Words dst, 0
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $cleanup
+
+Convert_x888_8_LEPacking GenerateFunctions 32, 8,, \
+  FLAG_COLOUR_MAP :OR: FLAG_DST_WRITEONLY :OR: FLAG_SPILL_LINE_VARS, 3, \
+  "y,stride_d,stride_s,ht_info,bitptrs,skew,orig_w,carry", \
+  "x,y,stride_d,stride_s", ht_info,, init ; leading_pixels_reg = wk3
+
+; ********************************************************************
+
+        MACRO
+        Convert_x888_0565_LEPacking_2pixels $src0, $src1, $dst
+        AND     $dst, ht, $src1, LSR #3     ; 00000000000rrrrr00000000000bbbbb
+        AND     $src1, $src1, #&FC00        ; 0000000000000000gggggg0000000000
+        ORR     $dst, $dst, $dst, LSR #5    ; 00000000000rrrrrrrrrr000000bbbbb
+        ORR     $src1, $dst, $src1, LSR #5  ; 00000000000rrrrrrrrrrggggggbbbbb
+        AND     $dst, ht, $src0, LSR #3     ; 00000000000RRRRR00000000000BBBBB
+        AND     $src0, $src0, #&FC00        ; 0000000000000000GGGGGG0000000000
+        ORR     $dst, $dst, $dst, LSR #5    ; 00000000000RRRRRRRRRR000000BBBBB
+        ORR     $src0, $dst, $src0, LSR #5  ; 00000000000RRRRRRRRRRGGGGGGBBBBB
+        BIC     $src0, $src0, #&1F0000      ; 0000000000000000RRRRRGGGGGGBBBBB
+        ORR     $dst, $src0, $src1, LSL #16 ; rrrrrggggggbbbbbRRRRRGGGGGGBBBBB
+        MEND
+
+        MACRO
+        Convert_x888_0565_LEPacking32_16_init
+        LDR     ht, =&001F001F
+        B       %FT00
+        LTORG
+00
+        MEND
+
+        MACRO
+        Convert_x888_0565_LEPacking32_16_16bits $src, $dst, $fixed_skew
+        ; This code should never be executed. It's for handling stray
+        ; pixels at the start or end of the row, but for now the
+        ; assembler framework only supports packing these big-endian
+        ; into words, which isn't what we want.
+        MEND
+
+        MACRO
+        Convert_x888_0565_LEPacking32_16_32bits $src, $dst, $fixed_skew
+        Read2Words src, 3, carry, $fixed_skew, skew, unused
+        Convert_x888_0565_LEPacking_2pixels $wk3, $wk4, $wk0
+        Write1Word dst, 0
+        MEND
+
+        MACRO
+        Convert_x888_0565_LEPacking32_16_64bits $src, $fixed_skew
+        Read4Words src, 3, carry, $fixed_skew, skew, unused
+        Convert_x888_0565_LEPacking_2pixels $wk3, $wk4, $wk0
+        Convert_x888_0565_LEPacking_2pixels $wk5, $wk6, $wk1
+        Write2Words dst, 0
+        MEND
+
+        MACRO
+        Convert_x888_0565_LEPacking32_16_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words src, 3, carry, $fixed_skew, skew, unused
+        Convert_x888_0565_LEPacking_2pixels $wk3, $wk4, $wk0
+        Convert_x888_0565_LEPacking_2pixels $wk5, $wk6, $wk1
+        Read4Words src, 3, carry, $fixed_skew, skew, unused
+        MEND
+
+        MACRO
+        Convert_x888_0565_LEPacking32_16_128bits_tail $src
+        Convert_x888_0565_LEPacking_2pixels $wk3, $wk4, $wk2
+        Convert_x888_0565_LEPacking_2pixels $wk5, $wk6, $wk3
+        Write4Words dst, 0
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $cleanup
+
+Convert_x888_0565_LEPacking GenerateFunctions 32, 16,, \
+  FLAG_DST_WRITEONLY :OR: FLAG_SPILL_LINE_VARS, 3, \
+  "stride_s,ht_info,map,bitptrs,skew,orig_w,carry", \
+  "x,stride_s", map, scratch, init ; leading_pixels_reg = wk2
+
+; ********************************************************************
+
+        MACRO
+        Convert_x888_x888BGR_LEPacking32_32_32bits $src, $dst, $fixed_skew
+        SETEND  BE
+        Read1Word src, 0, carry, $fixed_skew, skew, unused
+        SETEND  LE
+        MOV     $wk0, $wk0, LSR #8
+        Write1Word dst, 0
+        MEND
+
+        MACRO
+        Convert_x888_x888BGR_LEPacking32_32_64bits $src, $fixed_skew
+        SETEND  BE
+        Read2Words src, 0, carry, $fixed_skew, skew, unused
+        SETEND  LE
+        MOV     $wk0, $wk0, LSR #8
+        MOV     $wk1, $wk1, LSR #8
+        Write2Words dst, 0
+        MEND
+
+        MACRO
+        Convert_x888_x888BGR_LEPacking32_32_128bits_head $src, $fixed_skew, $intra_preloads
+        SETEND  BE
+        Read4Words src, 0, carry, $fixed_skew, skew, unused
+        MEND
+
+        MACRO
+        Convert_x888_x888BGR_LEPacking32_32_128bits_tail $src
+        SETEND  LE
+        MOV     $wk0, $wk0, LSR #8
+        MOV     $wk1, $wk1, LSR #8
+        MOV     $wk2, $wk2, LSR #8
+        MOV     $wk3, $wk3, LSR #8
+        Write4Words dst, 0
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $cleanup
+
+Convert_x888_x888BGR_LEPacking GenerateFunctions 32, 32,, \
+  FLAG_DST_WRITEONLY, 2, \
+  "ht,ht_info,map,bitptrs", \
+  "", skew, scratch
+
+; ********************************************************************
+
+        END