[Vm-dev] [commit][2766] add optimised BitBlt support
commits at squeakvm.org
commits at squeakvm.org
Thu Aug 15 14:17:31 UTC 2013
Revision: 2766
Author: piumarta
Date: 2013-08-15 07:17:30 -0700 (Thu, 15 Aug 2013)
Log Message:
-----------
add optimised BitBlt support
Modified Paths:
--------------
trunk/platforms/unix/vm-display-X11/config.cmake
trunk/platforms/unix/vm-display-X11/sqUnixX11.c
Added Paths:
-----------
trunk/platforms/unix/vm-display-X11/sqUnixX11Arm.S
Modified: trunk/platforms/unix/vm-display-X11/config.cmake
===================================================================
--- trunk/platforms/unix/vm-display-X11/config.cmake 2013-08-15 00:03:27 UTC (rev 2765)
+++ trunk/platforms/unix/vm-display-X11/config.cmake 2013-08-15 14:17:30 UTC (rev 2766)
@@ -22,6 +22,18 @@
SET (USE_X11_GLX 1)
ENDIF (HAVE_GL_GL_H AND HAVE_GL_GLX_H)
ENDIF (OPENGL_FOUND)
+ IF (DEFINED ENABLE-FAST-BLT)
+ SET (USE_FAST_BLT 1)
+ IF (vm-host-cpu MATCHES "arm")
+ ENABLE_LANGUAGE (ASM)
+ SET (CMAKE_ASM_COMPILE_OBJECT "asasm -cpu 6 -I ${cross}/plugins/BitBltPlugin -o <OBJECT> <SOURCE>")
+ SET (${plugin}_sources ${${plugin}_sources} "${unix}/${plugin}/sqUnixX11Arm.S")
+ ELSE ()
+ MESSAGE (FATAL_ERROR "
+ --enableFastBlt is not supported on this platform
+" )
+ ENDIF ()
+ ENDIF (DEFINED ENABLE-FAST-BLT)
PLUGIN_INCLUDE_DIRECTORIES (${cross}/plugins/B3DAcceleratorPlugin ${cross}/plugins/FilePlugin)
PLUGIN_INCLUDE_DIRECTORIES (${X11_INCLUDE_DIR} ${OPENGL_INCLUDE_DIR})
PLUGIN_LINK_LIBRARIES (${X11_LIBRARIES} ${OPENGL_LIBRARIES})
@@ -29,6 +41,7 @@
CONFIG_DEFINE (USE_X11)
CONFIG_DEFINE (USE_X11_GLX)
+CONFIG_DEFINE (USE_FAST_BLT)
CONFIG_DEFINE (HAVE_LIBXEXT)
CONFIG_DEFINE (HAVE_LIBXRENDER)
Modified: trunk/platforms/unix/vm-display-X11/sqUnixX11.c
===================================================================
--- trunk/platforms/unix/vm-display-X11/sqUnixX11.c 2013-08-15 00:03:27 UTC (rev 2765)
+++ trunk/platforms/unix/vm-display-X11/sqUnixX11.c 2013-08-15 14:17:30 UTC (rev 2766)
@@ -27,7 +27,7 @@
/* Author: Ian Piumarta <ian.piumarta at squeakland.org>
*
- * Last edited: 2012-06-27 05:07:55 by piumarta on ubuntu32-1204
+ * Last edited: 2013-08-11 23:12:08 by piumarta on emilia
*
* Support for more intelligent CLIPBOARD selection handling contributed by:
* Ned Konz <ned at bike-nomad.com>
@@ -65,6 +65,15 @@
#undef HAVE_OPENGL_GL_H /* don't include Quartz OpenGL if configured */
#include "SqDisplay.h"
+#if defined(USE_FAST_BLT)
+ /* XXX referring to plugin variables *requires* BitBltPlugin to be included by VMM as an internal plugin */
+# if defined(__arm__)
+# include "../../../Cross/plugins/BitBltPlugin/BitBltArm.h"
+# else
+# error configuration error
+# endif
+#endif
+
#if defined(ioMSecs)
# undef ioMSecs
#endif
@@ -5028,17 +5037,68 @@
}
}
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+
+ extern void armSimdConvert_x888_8_LEPacking32_8_wide(unsigned int width, unsigned int height,
+ unsigned int *dst, unsigned int dstStride,
+ unsigned int *src, unsigned int srcStride,
+ unsigned int halftone, unsigned int halftoneInfo,
+ unsigned int *colourMap);
+
+ extern void armSimdConvert_x888_8_LEPacking32_8_narrow(unsigned int width, unsigned int height,
+ unsigned int *dst, unsigned int dstStride,
+ unsigned int *src, unsigned int srcStride,
+ unsigned int halftone, unsigned int halftoneInfo,
+ unsigned int *colourMap);
+
+ static void armSimdCopyImage32To8(int *fromImageData, int *toImageData, int width, int height,
+ int affectedL, int affectedT, int affectedR, int affectedB,
+ unsigned int *downGradingColors)
+ {
+ /* Find image strides in 32-bit words */
+ unsigned int srcStride= width;
+ unsigned int dstStride= (width + 3) >> 2;
+ /* Round affected region out to encompass complete words in both images */
+ affectedL &= ~3;
+ affectedR= (affectedR + 3) &~ 3;
+ width= affectedR - affectedL;
+ height= affectedB - affectedT;
+ /* Find first words */
+ fromImageData += srcStride * affectedT + affectedL;
+ toImageData += dstStride * affectedT + (affectedL >> 2);
+ /* Adjust strides to remove number of words read/written */
+ srcStride -= affectedR - affectedL;
+ dstStride -= (affectedR - affectedL) >> 2;
+ /* Work out which width class this operation is. */
+ if (width > (128 - 32) / 8 && ((-1 ^ (width -(128 - 32) / 8)) & ~(31 / 8)))
+ armSimdConvert_x888_8_LEPacking32_8_wide(width, height, toImageData, dstStride, fromImageData, srcStride, 0, 0, downGradingColors);
+ else
+ armSimdConvert_x888_8_LEPacking32_8_narrow(width, height, toImageData, dstStride, fromImageData, srcStride, 0, 0, downGradingColors);
+ }
+# else
+# error configuration error
+# endif
+#endif
+
void copyImage32To8(int *fromImageData, int *toImageData, int width, int height,
int affectedL, int affectedT, int affectedR, int affectedB)
{
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+ armSimdCopyImage32To8(fromImageData, toImageData, width, height, affectedL, affectedT, affectedR, affectedB, stDownGradingColors);
+# else
+# error configuration error
+# endif
+#else
int scanLine32, firstWord32, lastWord32;
int scanLine8, firstWord8;
int line;
-#define map32To8(w) (col= (w), stDownGradingColors[\
- (((col >> (16+(8-3))) & 0x7) << 5) | \
- (((col >> ( 8+(8-3))) & 0x7) << 2) | \
- ((col >> ( 0+(8-2))) & 0x7)])
+# define map32To8(w) (col= (w), stDownGradingColors[\
+ (((col >> (16+(8-3))) & 0x7) << 5) | \
+ (((col >> ( 8+(8-3))) & 0x7) << 2) | \
+ ((col >> ( 0+(8-2))) & 0x7)])
scanLine32= bytesPerLine(width, 32);
firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
@@ -5061,8 +5121,9 @@
firstWord32+= scanLine32;
lastWord32+= scanLine32;
firstWord8+= scanLine8;
+# undef map32To8
}
-#undef map32To8
+#endif /* !USE_FAST_BLT */
}
void copyImage16To32(int *fromImageData, int *toImageData, int width, int height,
@@ -5187,47 +5248,96 @@
#undef map16To24
}
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+ extern void armSimdConvert_x888_0565_LEPacking32_16_wide(unsigned int width, unsigned int height,
+ unsigned int *dst, unsigned int dstStride,
+ unsigned int *src, unsigned int srcStride);
+
+ extern void armSimdConvert_x888_0565_LEPacking32_16_narrow(unsigned int width, unsigned int height,
+ unsigned int *dst, unsigned int dstStride,
+ unsigned int *src, unsigned int srcStride);
+ static void armSimdCopyImage32To16(int *fromImageData, int *toImageData, int width, int height,
+ int affectedL, int affectedT, int affectedR, int affectedB)
+ {
+ /* Find image strides in 32-bit words */
+ unsigned int srcStride= width;
+ unsigned int dstStride= (width + 1) >> 1;
+ /* Round affected region out to encompass complete words in both images */
+ affectedL &= ~1;
+ affectedR += affectedR & 1;
+ width= affectedR - affectedL;
+ height= affectedB - affectedT;
+ /* Find first words */
+ fromImageData += srcStride * affectedT + affectedL;
+ toImageData += dstStride * affectedT + (affectedL >> 1);
+ /* Adjust strides to remove number of words read/written */
+ srcStride -= affectedR - affectedL;
+ dstStride -= (affectedR - affectedL) >> 1;
+ /* Work out which width class this operation is. */
+ if (width > (128 - 32) / 16 && ((-1 ^ (width - (128 - 32) / 16)) & ~(31 / 16)))
+ armSimdConvert_x888_0565_LEPacking32_16_wide(width, height, toImageData, dstStride, fromImageData, srcStride);
+ else
+ armSimdConvert_x888_0565_LEPacking32_16_narrow(width, height, toImageData, dstStride, fromImageData, srcStride);
+ }
+
+# else
+# error configuration error
+# endif
+#endif
+
void copyImage32To16(int *fromImageData, int *toImageData, int width, int height,
int affectedL, int affectedT, int affectedR, int affectedB)
{
- int scanLine32, firstWord32, lastWord32;
- int scanLine16, firstWord16;
- int line;
- int rshift, gshift, bshift;
- register unsigned int col;
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+ if (stRNMask == 5 && stRShift == 11 && stGNMask == 6 && stGShift == 5 && stBNMask == 5 && stBShift == 0)
+ armSimdCopyImage32To16(fromImageData, toImageData, width, height, affectedL, affectedT, affectedR, affectedB);
+ else
+# else
+# error configuration error
+# endif
+#endif
+ {
+ int scanLine32, firstWord32, lastWord32;
+ int scanLine16, firstWord16;
+ int line;
+ int rshift, gshift, bshift;
+ register unsigned int col;
- rshift= stRNMask-5 + stRShift;
- gshift= stGNMask-5 + stGShift;
- bshift= stBNMask-5 + stBShift;
+ rshift= stRNMask-5 + stRShift;
+ gshift= stGNMask-5 + stGShift;
+ bshift= stBNMask-5 + stBShift;
-#define map32To16(w) (col= (w), \
- (((col >> 19) & 0x1f) << rshift) | \
- (((col >> 11) & 0x1f) << gshift) | \
- (((col >> 3) & 0x1f) << bshift))
+# define map32To16(w) (col= (w), \
+ (((col >> 19) & 0x1f) << rshift) | \
+ (((col >> 11) & 0x1f) << gshift) | \
+ (((col >> 3) & 0x1f) << bshift))
- scanLine32= bytesPerLine(width, 32);
- firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
- lastWord32= scanLine32*affectedT + bytesPerLine(affectedR, 32);
- scanLine16= bytesPerLine(width, 16);
- firstWord16= scanLine16*affectedT + (bytesPerLineRD(affectedL, 32) >> 1);
+ scanLine32= bytesPerLine(width, 32);
+ firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
+ lastWord32= scanLine32*affectedT + bytesPerLine(affectedR, 32);
+ scanLine16= bytesPerLine(width, 16);
+ firstWord16= scanLine16*affectedT + (bytesPerLineRD(affectedL, 32) >> 1);
- for (line= affectedT; line < affectedB; line++)
- {
- register unsigned int *from= (unsigned int *)((long)fromImageData+firstWord32);
- register unsigned int *limit= (unsigned int *)((long)fromImageData+lastWord32);
- register unsigned short *to= (unsigned short *)((long)toImageData+firstWord16);
- while (from < limit)
- {
- to[0]= map32To16(from[0]);
- from++;
- to++;
- }
- firstWord32+= scanLine32;
- lastWord32+= scanLine32;
- firstWord16+= scanLine16;
- }
-#undef map32To16
+ for (line= affectedT; line < affectedB; line++)
+ {
+ register unsigned int *from= (unsigned int *)((long)fromImageData+firstWord32);
+ register unsigned int *limit= (unsigned int *)((long)fromImageData+lastWord32);
+ register unsigned short *to= (unsigned short *)((long)toImageData+firstWord16);
+ while (from < limit)
+ {
+ to[0]= map32To16(from[0]);
+ from++;
+ to++;
+ }
+ firstWord32+= scanLine32;
+ lastWord32+= scanLine32;
+ firstWord16+= scanLine16;
+ }
+# undef map32To16
+ }
}
void copyImage16To16(int *fromImageData, int *toImageData, int width, int height,
@@ -5274,42 +5384,85 @@
#undef map16To16
}
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+ extern void armSimdConvert_x888_x888BGR_LEPacking32_32_wide(unsigned int width, unsigned int height,
+ unsigned int *dst, unsigned int dstStride,
+ unsigned int *src, unsigned int srcStride);
+
+ extern void armSimdConvert_x888_x888BGR_LEPacking32_32_narrow(unsigned int width, unsigned int height,
+ unsigned int *dst, unsigned int dstStride,
+ unsigned int *src, unsigned int srcStride);
+
+ static void armSimdCopyImage32To32(int *fromImageData, int *toImageData, int width, int height,
+ int affectedL, int affectedT, int affectedR, int affectedB)
+ {
+ unsigned int stride= width;
+ width= affectedR - affectedL;
+ height= affectedB - affectedT;
+ /* Find first words */
+ fromImageData += stride * affectedT + affectedL;
+ toImageData += stride * affectedT + affectedL;
+ /* Adjust stride to remove number of words read/written */
+ stride -= width;
+ /* Work out which width class this operation is. */
+ if (width > (128 - 32) / 32 && (-1 ^ (width - (128 - 32) / 32)))
+ armSimdConvert_x888_x888BGR_LEPacking32_32_wide(width, height, toImageData, stride, fromImageData, stride);
+ else
+ armSimdConvert_x888_x888BGR_LEPacking32_32_narrow(width, height, toImageData, stride, fromImageData, stride);
+ }
+# else
+# error configuration error
+# endif
+#endif
+
void copyImage32To32(int *fromImageData, int *toImageData, int width, int height,
int affectedL, int affectedT, int affectedR, int affectedB)
{
- int scanLine32, firstWord32, lastWord32;
- int line;
- int rshift, gshift, bshift;
- register unsigned int col;
+#if defined(USE_FAST_BLT)
+# if defined(__arm__)
+ if ((armCpuFeatures & ARM_V6) && stRNMask == 8 && stRShift == 0 && stGNMask == 8 && stGShift == 8 && stBNMask == 8 && stBShift == 16)
+ armSimdCopyImage32To32(fromImageData, toImageData, width, height, affectedL, affectedT, affectedR, affectedB);
+ else
+# else
+# error unsupported use of ENABLE_FAST_BLT
+# endif
+#endif
+ {
+ int scanLine32, firstWord32, lastWord32;
+ int line;
+ int rshift, gshift, bshift;
+ register unsigned int col;
- rshift= stRNMask-8 + stRShift;
- gshift= stGNMask-8 + stGShift;
- bshift= stBNMask-8 + stBShift;
+ rshift= stRNMask-8 + stRShift;
+ gshift= stGNMask-8 + stGShift;
+ bshift= stBNMask-8 + stBShift;
-#define map32To32(w) (col= (w), \
- (((col >> 16) & 0xff) << rshift) | \
- (((col >> 8) & 0xff) << gshift) | \
- ((col & 0xff) << bshift))
+# define map32To32(w) (col= (w), \
+ (((col >> 16) & 0xff) << rshift) | \
+ (((col >> 8) & 0xff) << gshift) | \
+ ((col & 0xff) << bshift))
- scanLine32= bytesPerLine(width, 32);
- firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
- lastWord32= scanLine32*affectedT + bytesPerLine(affectedR, 32);
+ scanLine32= bytesPerLine(width, 32);
+ firstWord32= scanLine32*affectedT + bytesPerLineRD(affectedL, 32);
+ lastWord32= scanLine32*affectedT + bytesPerLine(affectedR, 32);
- for (line= affectedT; line < affectedB; line++)
- {
- register unsigned int *from= (unsigned int *)((long)fromImageData+firstWord32);
- register unsigned int *limit= (unsigned int *)((long)fromImageData+lastWord32);
- register unsigned int *to= (unsigned int *)((long)toImageData+firstWord32);
- while (from < limit)
- {
- *to= map32To32(*from);
- from++;
- to++;
- }
- firstWord32+= scanLine32;
- lastWord32+= scanLine32;
- }
-#undef map32To32
+ for (line= affectedT; line < affectedB; line++)
+ {
+ register unsigned int *from= (unsigned int *)((long)fromImageData+firstWord32);
+ register unsigned int *limit= (unsigned int *)((long)fromImageData+lastWord32);
+ register unsigned int *to= (unsigned int *)((long)toImageData+firstWord32);
+ while (from < limit)
+ {
+ *to= map32To32(*from);
+ from++;
+ to++;
+ }
+ firstWord32+= scanLine32;
+ lastWord32+= scanLine32;
+ }
+# undef map32To32
+ }
}
void copyImage32To32Same(int *fromImageData, int *toImageData,
Added: trunk/platforms/unix/vm-display-X11/sqUnixX11Arm.S
===================================================================
--- trunk/platforms/unix/vm-display-X11/sqUnixX11Arm.S (rev 0)
+++ trunk/platforms/unix/vm-display-X11/sqUnixX11Arm.S 2013-08-15 14:17:30 UTC (rev 2766)
@@ -0,0 +1,246 @@
+;
+; Copyright © 2013 Raspberry Pi Foundation
+; Copyright © 2013 RISC OS Open Ltd
+;
+; Permission to use, copy, modify, distribute, and sell this software and its
+; documentation for any purpose is hereby granted without fee, provided that
+; the above copyright notice appear in all copies and that both that
+; copyright notice and this permission notice appear in supporting
+; documentation, and that the name of the copyright holders not be used in
+; advertising or publicity pertaining to distribution of the software without
+; specific, written prior permission. The copyright holders make no
+; representations about the suitability of this software for any purpose. It
+; is provided "as is" without express or implied warranty.
+;
+; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+; SOFTWARE.
+;
+
+; Debug options
+ GBLL DebugData
+;DebugData SETL {TRUE}
+ GBLL DebugPld
+;DebugPld SETL {TRUE}
+ GBLL VerboseBuild
+;VerboseBuild SETL {TRUE}
+
+ GET BitBltArmSimdAsm.hdr
+
+ AREA |sqUnixX11Arm$$Code|, CODE, READONLY
+ ARM
+
+; ********************************************************************
+
+ MACRO
+ Convert_x888_8_LEPacking_1pixel $src, $dst
+ AND $dst, ht, $src, LSR #6 ; 00000000000000rrr0000000000000bb
+ AND $src, $src, #&E000 ; 0000000000000000ggg0000000000000
+ ORR $dst, $dst, $dst, LSR #10 ; 00000000000000rrr0000000rrr000bb
+ ORR $dst, $dst, $src, LSR #11 ; 00000000000000rrr0000000rrrgggbb
+ AND $dst, $dst, #&FF ; 000000000000000000000000rrrgggbb
+ LDR $dst, [map, $dst, LSL #2]
+ MEND
+
+ MACRO
+ Convert_x888_8_LEPacking_4pixels $src0, $src1, $src2, $src3, $dst
+ Convert_x888_8_LEPacking_1pixel $src0, $dst
+ Convert_x888_8_LEPacking_1pixel $src1, $src0
+ Convert_x888_8_LEPacking_1pixel $src2, $src1
+ Convert_x888_8_LEPacking_1pixel $src3, $src2
+ ORR $dst, $dst, $src0, LSL #8
+ ORR $dst, $dst, $src1, LSL #16
+ ORR $dst, $dst, $src2, LSL #24
+ MEND
+
+ MACRO
+ Convert_x888_8_LEPacking32_8_init
+ LDR ht, =&38003
+ B %FT00
+ LTORG
+00
+ MEND
+
+ MACRO
+ Convert_x888_8_LEPacking32_8_8bits $src, $dst, $fixed_skew
+ ; This code should never be executed. It's for handling stray
+ ; pixels at the start or end of the row, but for now the
+ ; assembler framework only supports packing these big-endian
+ ; into words, which isn't what we want.
+ MEND
+
+ MACRO
+ Convert_x888_8_LEPacking32_8_16bits $src, $dst, $fixed_skew
+ ; This code should never be executed. It's for handling stray
+ ; pixels at the start or end of the row, but for now the
+ ; assembler framework only supports packing these big-endian
+ ; into words, which isn't what we want.
+ MEND
+
+ MACRO
+ Convert_x888_8_LEPacking32_8_32bits $src, $dst, $fixed_skew
+ Read4Words src, 4, carry, $fixed_skew, skew, unused
+ Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk0
+ Write1Word dst, 0
+ MEND
+
+ MACRO
+ Convert_x888_8_LEPacking32_8_64bits $src, $fixed_skew
+ Read4Words src, 4, carry, $fixed_skew, skew, unused
+ Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk0
+ Read4Words src, 4, carry, $fixed_skew, skew, unused
+ Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk1
+ Write2Words dst, 0
+ MEND
+
+ MACRO
+ Convert_x888_8_LEPacking32_8_128bits_head $src, $fixed_skew, $intra_preloads
+ Read4Words src, 4, carry, $fixed_skew, skew, unused
+ Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk0
+ Read4Words src, 4, carry, $fixed_skew, skew, unused
+ [ "$intra_preloads" <> ""
+ PreloadMiddle
+ ]
+ Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk1
+ Read4Words src, 4, carry, $fixed_skew, skew, unused
+ Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk2
+ Read4Words src, 4, carry, $fixed_skew, skew, unused
+ MEND
+
+ MACRO
+ Convert_x888_8_LEPacking32_8_128bits_tail $src
+ Convert_x888_8_LEPacking_4pixels $wk4, $wk5, $wk6, $wk7, $wk3
+ Write4Words dst, 0
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $cleanup
+
+Convert_x888_8_LEPacking GenerateFunctions 32, 8,, \
+ FLAG_COLOUR_MAP :OR: FLAG_DST_WRITEONLY :OR: FLAG_SPILL_LINE_VARS, 3, \
+ "y,stride_d,stride_s,ht_info,bitptrs,skew,orig_w,carry", \
+ "x,y,stride_d,stride_s", ht_info,, init ; leading_pixels_reg = wk3
+
+; ********************************************************************
+
+ MACRO
+ Convert_x888_0565_LEPacking_2pixels $src0, $src1, $dst
+ AND $dst, ht, $src1, LSR #3 ; 00000000000rrrrr00000000000bbbbb
+ AND $src1, $src1, #&FC00 ; 0000000000000000gggggg0000000000
+ ORR $dst, $dst, $dst, LSR #5 ; 00000000000rrrrrrrrrr000000bbbbb
+ ORR $src1, $dst, $src1, LSR #5 ; 00000000000rrrrrrrrrrggggggbbbbb
+ AND $dst, ht, $src0, LSR #3 ; 00000000000RRRRR00000000000BBBBB
+ AND $src0, $src0, #&FC00 ; 0000000000000000GGGGGG0000000000
+ ORR $dst, $dst, $dst, LSR #5 ; 00000000000RRRRRRRRRR000000BBBBB
+ ORR $src0, $dst, $src0, LSR #5 ; 00000000000RRRRRRRRRRGGGGGGBBBBB
+ BIC $src0, $src0, #&1F0000 ; 0000000000000000RRRRRGGGGGGBBBBB
+ ORR $dst, $src0, $src1, LSL #16 ; rrrrrggggggbbbbbRRRRRGGGGGGBBBBB
+ MEND
+
+ MACRO
+ Convert_x888_0565_LEPacking32_16_init
+ LDR ht, =&001F001F
+ B %FT00
+ LTORG
+00
+ MEND
+
+ MACRO
+ Convert_x888_0565_LEPacking32_16_16bits $src, $dst, $fixed_skew
+ ; This code should never be executed. It's for handling stray
+ ; pixels at the start or end of the row, but for now the
+ ; assembler framework only supports packing these big-endian
+ ; into words, which isn't what we want.
+ MEND
+
+ MACRO
+ Convert_x888_0565_LEPacking32_16_32bits $src, $dst, $fixed_skew
+ Read2Words src, 3, carry, $fixed_skew, skew, unused
+ Convert_x888_0565_LEPacking_2pixels $wk3, $wk4, $wk0
+ Write1Word dst, 0
+ MEND
+
+ MACRO
+ Convert_x888_0565_LEPacking32_16_64bits $src, $fixed_skew
+ Read4Words src, 3, carry, $fixed_skew, skew, unused
+ Convert_x888_0565_LEPacking_2pixels $wk3, $wk4, $wk0
+ Convert_x888_0565_LEPacking_2pixels $wk5, $wk6, $wk1
+ Write2Words dst, 0
+ MEND
+
+ MACRO
+ Convert_x888_0565_LEPacking32_16_128bits_head $src, $fixed_skew, $intra_preloads
+ Read4Words src, 3, carry, $fixed_skew, skew, unused
+ Convert_x888_0565_LEPacking_2pixels $wk3, $wk4, $wk0
+ Convert_x888_0565_LEPacking_2pixels $wk5, $wk6, $wk1
+ Read4Words src, 3, carry, $fixed_skew, skew, unused
+ MEND
+
+ MACRO
+ Convert_x888_0565_LEPacking32_16_128bits_tail $src
+ Convert_x888_0565_LEPacking_2pixels $wk3, $wk4, $wk2
+ Convert_x888_0565_LEPacking_2pixels $wk5, $wk6, $wk3
+ Write4Words dst, 0
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $cleanup
+
+Convert_x888_0565_LEPacking GenerateFunctions 32, 16,, \
+ FLAG_DST_WRITEONLY :OR: FLAG_SPILL_LINE_VARS, 3, \
+ "stride_s,ht_info,map,bitptrs,skew,orig_w,carry", \
+ "x,stride_s", map, scratch, init ; leading_pixels_reg = wk2
+
+; ********************************************************************
+
+ MACRO
+ Convert_x888_x888BGR_LEPacking32_32_32bits $src, $dst, $fixed_skew
+ SETEND BE
+ Read1Word src, 0, carry, $fixed_skew, skew, unused
+ SETEND LE
+ MOV $wk0, $wk0, LSR #8
+ Write1Word dst, 0
+ MEND
+
+ MACRO
+ Convert_x888_x888BGR_LEPacking32_32_64bits $src, $fixed_skew
+ SETEND BE
+ Read2Words src, 0, carry, $fixed_skew, skew, unused
+ SETEND LE
+ MOV $wk0, $wk0, LSR #8
+ MOV $wk1, $wk1, LSR #8
+ Write2Words dst, 0
+ MEND
+
+ MACRO
+ Convert_x888_x888BGR_LEPacking32_32_128bits_head $src, $fixed_skew, $intra_preloads
+ SETEND BE
+ Read4Words src, 0, carry, $fixed_skew, skew, unused
+ MEND
+
+ MACRO
+ Convert_x888_x888BGR_LEPacking32_32_128bits_tail $src
+ SETEND LE
+ MOV $wk0, $wk0, LSR #8
+ MOV $wk1, $wk1, LSR #8
+ MOV $wk2, $wk2, LSR #8
+ MOV $wk3, $wk3, LSR #8
+ Write4Words dst, 0
+ MEND
+
+;$op GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+; $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $cleanup
+
+Convert_x888_x888BGR_LEPacking GenerateFunctions 32, 32,, \
+ FLAG_DST_WRITEONLY, 2, \
+ "ht,ht_info,map,bitptrs", \
+ "", skew, scratch
+
+; ********************************************************************
+
+ END
More information about the Vm-dev
mailing list