[Vm-dev] [commit][2744] add fast bitblt support files

commits at squeakvm.org commits at squeakvm.org
Tue Jun 18 23:13:52 UTC 2013


Revision: 2744
Author:   rowledge
Date:     2013-06-18 16:13:50 -0700 (Tue, 18 Jun 2013)
Log Message:
-----------
add fast bitblt support files

Modified Paths:
--------------
    trunk/platforms/Cross/vm/sqMemoryAccess.h

Added Paths:
-----------
    trunk/platforms/Cross/plugins/BitBltPlugin/
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.h
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmLinux.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmOther.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.h
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAlphaBlend.s
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdBitLogical.s
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdPixPaint.s
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdSourceWord.s
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltDispatch.h
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.c
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltGeneric.h
    trunk/platforms/Cross/plugins/BitBltPlugin/BitBltInternal.h

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.c	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.c	2013-06-18 23:13:50 UTC (rev 2744)
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2013 Raspberry Pi Foundation
+ * Copyright © 2013 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ */
+
+#include "BitBltArm.h"
+#include "BitBltArmSimd.h"
+
+arm_cpu_features_t armCpuFeatures; /* feature mask; assigned by addArmFastPaths() below */
+
+void addArmFastPaths(void) /* detect CPU features, then add the SIMD fast paths if ARM_V6 is reported */
+{
+	armCpuFeatures = detectCpuFeatures(); /* a separate implementation exists per OS (see BitBltArm.h) */
+	if (armCpuFeatures & ARM_V6) /* without the ARM_V6 bit nothing is registered here */
+		addArmSimdFastPaths();
+}

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.h
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.h	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArm.h	2013-06-18 23:13:50 UTC (rev 2744)
@@ -0,0 +1,44 @@
+/*
+ * Copyright © 2013 Raspberry Pi Foundation
+ * Copyright © 2013 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ */
+
+#ifndef BITBLTARM_H_
+#define BITBLTARM_H_
+
+typedef enum { /* single-bit flags; detectCpuFeatures() ORs them together into one mask */
+	ARM_V7     = (1 << 0),
+	ARM_V6     = (1 << 1), /* addArmFastPaths() tests this bit before adding the SIMD fast paths */
+	ARM_VFP    = (1 << 2),
+	ARM_NEON   = (1 << 3),
+	ARM_IWMMXT = (1 << 4)
+} arm_cpu_features_t;
+
+extern arm_cpu_features_t armCpuFeatures; /* defined in BitBltArm.c; assigned by addArmFastPaths() */
+
+/* There's a separate implementation of this function for each OS; it returns the detected feature mask (0 when nothing is detected) */
+arm_cpu_features_t detectCpuFeatures(void);
+
+void addArmFastPaths(void); /* stores the detected mask in armCpuFeatures and adds the SIMD fast paths when ARM_V6 is set */
+
+#endif /* BITBLTARM_H_ */

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmLinux.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmLinux.c	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmLinux.c	2013-06-18 23:13:50 UTC (rev 2744)
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2000 SuSE, Inc.
+ * Copyright © 2007 Red Hat, Inc.
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of SuSE not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  SuSE makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
+ * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+* WRT the usage in the Squeak Smalltalk system -
+* This file provides a function to discover the cpu features supported at runtime; we assume
+* you understand that 'arm_cpu_features' is meaningful only on ARM cpu machines. 
+* An equivalent file will be required for other ARM platforms; see BitBltArmOther.c in this directory
+ */
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <elf.h>
+
+#include "BitBltArm.h"
+
+arm_cpu_features_t detectCpuFeatures(void) /* Linux: build the feature mask from the records in /proc/self/auxv */
+{
+    arm_cpu_features_t features = 0; /* stays 0 if the file cannot be opened or holds no matching record */
+    Elf32_auxv_t aux;
+    int fd;
+
+    fd = open ("/proc/self/auxv", O_RDONLY);
+    if (fd >= 0)
+    {
+	while (read (fd, &aux, sizeof(Elf32_auxv_t)) == sizeof(Elf32_auxv_t)) /* one whole record per pass; any other read result (EOF, error, short read) ends the scan */
+	{
+	    if (aux.a_type == AT_HWCAP)
+	    {
+		uint32_t hwcap = aux.a_un.a_val; /* NOTE(review): <stdint.h> is not included directly here; presumably uint32_t arrives via another header -- confirm */
+
+		/* hardcode these values to avoid depending on specific
+		 * versions of the hwcap header, e.g. HWCAP_NEON
+		 */
+		if ((hwcap & 64) != 0) /* bit 6; presumably HWCAP_VFP -- confirm against the kernel hwcap header */
+		    features |= ARM_VFP;
+		if ((hwcap & 512) != 0) /* bit 9; presumably HWCAP_IWMMXT -- confirm */
+		    features |= ARM_IWMMXT;
+		/* this flag is only present on kernel 2.6.29 */
+		if ((hwcap & 4096) != 0) /* bit 12; the HWCAP_NEON value referred to above */
+		    features |= ARM_NEON;
+	    }
+	    else if (aux.a_type == AT_PLATFORM)
+	    {
+		const char *plat = (const char*) aux.a_un.a_val; /* the record's value is used as the address of a string */
+
+		if (strncmp (plat, "v7l", 3) == 0) /* only the first three characters are compared; "v7l" also sets ARM_V6 */
+		    features |= (ARM_V7 | ARM_V6);
+		else if (strncmp (plat, "v6l", 3) == 0) /* any other platform string sets neither bit */
+		    features |= ARM_V6;
+	    }
+	}
+	close (fd);
+    }
+
+    return features;
+}

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmOther.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmOther.c	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmOther.c	2013-06-18 23:13:50 UTC (rev 2744)
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2013 Raspberry Pi Foundation
+ * Copyright © 2013 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * WRT the usage in the Squeak Smalltalk system -
+* This file provides a function to discover the cpu features supported at runtime; we assume
+* you understand that 'arm_cpu_features' is meaningful only on ARM cpu machines. 
+* Obviously, this is a null function and a suitable equivalent file will be required for actual ARM platforms; 
+* see BitBltArmLinux.c in this directory as an example
+ */
+
+#include "BitBltArm.h"
+
+/* There is no OS-neutral way of determining which type of ARM this is, so this fallback reports an empty feature mask; addArmFastPaths() then finds no ARM_V6 bit and adds no SIMD fast paths */
+
+arm_cpu_features_t detectCpuFeatures(void)
+{
+	return 0; /* no feature bits set */
+}

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.c	2013-06-18 23:13:50 UTC (rev 2744)
@@ -0,0 +1,263 @@
+/*
+ * Copyright © 2013 Raspberry Pi Foundation
+ * Copyright © 2013 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "BitBltInternal.h"
+
+enum {
+	HALFTONE_NONE,
+	HALFTONE_SCALAR,
+	HALFTONE_VECTOR
+};
+
+//typedef void (*armSimdAsmFn)(uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...);
+
+#define FAST_PATH(op, src_bpp, dst_bpp, qualifier, halftone_type)                                                                                             \
+extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_wide  (uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \
+extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_narrow(uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \
+extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_tiny  (uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \
+static void fastPath##op##src_bpp##_##dst_bpp##qualifier(operation_t *op, uint32_t flags)                                                                     \
+{                                                                                                                                                             \
+	IGNORE(flags);                                                                                                                                            \
+	/* Copy certain parts of the operation structure to locals to help compiler. NB: the macro argument "op" is also substituted into the parameter name above, so after expansion the pointer is named after the operation (e.g. SourceWord); every use below is renamed consistently */ \
+	uint32_t *srcBits = op->src.bits;                                                                                                                         \
+	uint32_t srcPitch = op->src.pitch / sizeof (uint32_t);                                                                                                    \
+	uint32_t srcX     = op->src.x;                                                                                                                            \
+	uint32_t srcY     = op->src.y;                                                                                                                            \
+	uint32_t *dstBits = op->dest.bits;                                                                                                                        \
+	uint32_t dstPitch = op->dest.pitch / sizeof (uint32_t);                                                                                                   \
+	uint32_t dstX     = op->dest.x;                                                                                                                           \
+	uint32_t dstY     = op->dest.y;                                                                                                                           \
+	uint32_t width    = op->width;                                                                                                                            \
+	uint32_t height   = op->height;                                                                                                                           \
+	uint32_t *cmLookupTable = *op->cmLookupTable;                                                                                                             \
+	uint32_t halftoneHeight = op->halftoneHeight;                                                                                                             \
+	uint32_t *halftoneBase  = (uint32_t *) *op->halftoneBase;                                                                                                 \
+	/* Get pointers to initial words */                                                                                                                       \
+	uint32_t *src = 0;                                                                                                                                        \
+	if (src_bpp > 0)                                                                                                                                          \
+		src = srcBits + srcPitch * srcY + srcX * src_bpp / 32;                                                                                                \
+	uint32_t *dst = dstBits + dstPitch * dstY + dstX * dst_bpp / 32;                                                                                          \
+	/* Get initial pixel offset within words, mangle into pitch if possible: depths below 8 bpp go into bitPtrs (source index shifted to bit 27 and up, destination index in the low bits); 8 and 16 bpp go into bits 30-31 of the respective pitch; 32 bpp needs no offset */ \
+	uint32_t bitPtrs = 0;                                                                                                                                     \
+	uint32_t srcXpix = 0;                                                                                                                                     \
+	if (src_bpp > 0) {                                                                                                                                        \
+		srcXpix = srcX & (31 / (src_bpp == 0 ? 1 : src_bpp)); /* ?: to avoid compiler warning on GCC! */                                                      \
+		if (src_bpp < 8)                                                                                                                                      \
+			bitPtrs = srcXpix << 27;                                                                                                                          \
+		else if (src_bpp == 8 || src_bpp == 16)                                                                                                               \
+			srcPitch |= srcXpix << 30;                                                                                                                        \
+	}                                                                                                                                                         \
+	uint32_t dstXpix = dstX & (31/dst_bpp);                                                                                                                   \
+	if (dst_bpp < 8)                                                                                                                                          \
+		bitPtrs |= dstXpix;                                                                                                                                   \
+	else if (dst_bpp == 8 || dst_bpp == 16)                                                                                                                   \
+		dstPitch |= dstXpix << 30;                                                                                                                            \
+	/* Adjust strides to remove number of words partially or wholly read/written */                                                                           \
+	if (src_bpp > 0)                                                                                                                                          \
+		srcPitch -= (src_bpp * (srcXpix + width) + 31) / 32;                                                                                                  \
+	dstPitch -= (dst_bpp * (dstXpix + width) + 31) / 32;                                                                                                      \
+	/* Deal with halftoning: SCALAR passes the first halftone word by value; VECTOR passes the address just past the last halftone word (cast to 32 bits) plus halftoneInfo, which packs ((dstY % height) - height) << 17 with the low 15 bits of -height */ \
+	uint32_t halftone = 0;                                                                                                                                    \
+	uint32_t halftoneInfo = 0;                                                                                                                                \
+	if (halftone_type == HALFTONE_SCALAR)                                                                                                                     \
+		halftone = halftoneBase[0];                                                                                                                           \
+	else if (halftone_type == HALFTONE_VECTOR) {                                                                                                              \
+		halftone = (uint32_t) (halftoneBase + halftoneHeight);                                                                                                \
+		halftoneInfo = (((dstY % halftoneHeight) - halftoneHeight) << 17) | (-halftoneHeight & 0x7FFF);                                                       \
+	}                                                                                                                                                         \
+	/* Work out which width class this operation is.                                                                                                          \
+	 * Rather than re-evaluate this for each line, we want one choice                                                                                         \
+	 * for the whole operation; this means we can't assume anything about                                                                                     \
+	 * alignment to sizes larger than 4 bytes, because that's the only                                                                                        \
+	 * guarantee we have about line stride. */                                                                                                                \
+	if (width > (128-32)/dst_bpp && (((dstXpix-1) ^ (dstXpix+width-(128-32)/dst_bpp)) &~ (31/dst_bpp)))                                                       \
+		armSimd##op##src_bpp##_##dst_bpp##qualifier##_wide(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs);      \
+	else if (dst_bpp > 8 || (((dstXpix-1) ^ (dstXpix+width)) &~ (31/dst_bpp)))                                                                                \
+		armSimd##op##src_bpp##_##dst_bpp##qualifier##_narrow(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs);    \
+	else                                                                                                                                                      \
+		armSimd##op##src_bpp##_##dst_bpp##qualifier##_tiny(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs);      \
+}
+
+FAST_PATH(SourceWord,1,32,,HALFTONE_NONE) /* arguments: op, src_bpp, dst_bpp, qualifier (empty here), halftone_type; each line defines one static fastPath<op><src>_<dst><qualifier>() wrapper */
+
+FAST_PATH(SourceWord,1,16,,HALFTONE_NONE) /* the SourceWord groups run from dst_bpp/src_bpp = 32 down to 1/32; this group is 16x */
+FAST_PATH(SourceWord,2,32,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,1,8,,HALFTONE_NONE)
+FAST_PATH(SourceWord,2,16,,HALFTONE_NONE)
+FAST_PATH(SourceWord,4,32,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,1,4,,HALFTONE_NONE)
+FAST_PATH(SourceWord,2,8,,HALFTONE_NONE)
+FAST_PATH(SourceWord,4,16,,HALFTONE_NONE)
+FAST_PATH(SourceWord,8,32,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,1,2,,HALFTONE_NONE)
+FAST_PATH(SourceWord,2,4,,HALFTONE_NONE)
+FAST_PATH(SourceWord,4,8,,HALFTONE_NONE)
+FAST_PATH(SourceWord,8,16,,HALFTONE_NONE)
+FAST_PATH(SourceWord,16,32,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,1,1,,HALFTONE_NONE) /* source and destination depth equal */
+FAST_PATH(SourceWord,2,2,,HALFTONE_NONE)
+FAST_PATH(SourceWord,4,4,,HALFTONE_NONE)
+FAST_PATH(SourceWord,8,8,,HALFTONE_NONE)
+FAST_PATH(SourceWord,16,16,,HALFTONE_NONE)
+FAST_PATH(SourceWord,32,32,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,2,1,,HALFTONE_NONE) /* from here on the destination is shallower than the source */
+FAST_PATH(SourceWord,4,2,,HALFTONE_NONE)
+FAST_PATH(SourceWord,8,4,,HALFTONE_NONE)
+FAST_PATH(SourceWord,16,8,,HALFTONE_NONE)
+FAST_PATH(SourceWord,32,16,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,4,1,,HALFTONE_NONE)
+FAST_PATH(SourceWord,8,2,,HALFTONE_NONE)
+FAST_PATH(SourceWord,16,4,,HALFTONE_NONE)
+FAST_PATH(SourceWord,32,8,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,8,1,,HALFTONE_NONE)
+FAST_PATH(SourceWord,16,2,,HALFTONE_NONE)
+FAST_PATH(SourceWord,32,4,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,16,1,,HALFTONE_NONE)
+FAST_PATH(SourceWord,32,2,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,32,1,,HALFTONE_NONE)
+
+FAST_PATH(SourceWord,0,1,,HALFTONE_NONE) /* src_bpp of 0: the generated wrapper leaves src at 0 and skips all source offset/stride work */
+FAST_PATH(SourceWord,0,1,_scalar,HALFTONE_SCALAR) /* _scalar variants pass halftoneBase[0] as the halftone argument */
+FAST_PATH(SourceWord,0,2,,HALFTONE_NONE)
+FAST_PATH(SourceWord,0,2,_scalar,HALFTONE_SCALAR)
+FAST_PATH(SourceWord,0,4,,HALFTONE_NONE)
+FAST_PATH(SourceWord,0,4,_scalar,HALFTONE_SCALAR)
+FAST_PATH(SourceWord,0,8,,HALFTONE_NONE)
+FAST_PATH(SourceWord,0,8,_scalar,HALFTONE_SCALAR)
+FAST_PATH(SourceWord,0,16,,HALFTONE_NONE)
+FAST_PATH(SourceWord,0,16,_scalar,HALFTONE_SCALAR)
+FAST_PATH(SourceWord,0,32,,HALFTONE_NONE)
+FAST_PATH(SourceWord,0,32,_scalar,HALFTONE_SCALAR)
+
+FAST_PATH(PixPaint,1,1,,HALFTONE_NONE) /* PixPaint is generated for equal source/destination depths only */
+FAST_PATH(PixPaint,2,2,,HALFTONE_NONE)
+FAST_PATH(PixPaint,4,4,,HALFTONE_NONE)
+FAST_PATH(PixPaint,8,8,,HALFTONE_NONE)
+FAST_PATH(PixPaint,16,16,,HALFTONE_NONE)
+FAST_PATH(PixPaint,32,32,,HALFTONE_NONE)
+
+FAST_PATH(AlphaBlend,32,32,,HALFTONE_NONE) /* AlphaBlend is generated for 32 bpp to 32 bpp only */
+
+FAST_PATH(BitAnd,1,1,,HALFTONE_NONE) /* BitAnd is generated for equal source/destination depths only */
+FAST_PATH(BitAnd,2,2,,HALFTONE_NONE)
+FAST_PATH(BitAnd,4,4,,HALFTONE_NONE)
+FAST_PATH(BitAnd,8,8,,HALFTONE_NONE)
+FAST_PATH(BitAnd,16,16,,HALFTONE_NONE)
+FAST_PATH(BitAnd,32,32,,HALFTONE_NONE)
+
+static fast_path_t fastPaths[] = { /* one entry per wrapper generated by FAST_PATH above: { function, CR_* constant, STD_FLAGS*(...) value }; field meanings are presumably defined with fast_path_t via BitBltInternal.h -- confirm there */
+		{ fastPathSourceWord1_32,        CR_sourceWord, STD_FLAGS(1,32,DIRECT,NO) },
+
+		{ fastPathSourceWord1_16,        CR_sourceWord, STD_FLAGS(1,16,DIRECT,NO) },
+		{ fastPathSourceWord2_32,        CR_sourceWord, STD_FLAGS(2,32,DIRECT,NO) },
+
+		{ fastPathSourceWord1_8,         CR_sourceWord, STD_FLAGS(1,8,DIRECT,NO) },
+		{ fastPathSourceWord2_16,        CR_sourceWord, STD_FLAGS(2,16,DIRECT,NO) },
+		{ fastPathSourceWord4_32,        CR_sourceWord, STD_FLAGS(4,32,DIRECT,NO) },
+
+		{ fastPathSourceWord1_4,         CR_sourceWord, STD_FLAGS(1,4,DIRECT,NO) },
+		{ fastPathSourceWord2_8,         CR_sourceWord, STD_FLAGS(2,8,DIRECT,NO) },
+		{ fastPathSourceWord4_16,        CR_sourceWord, STD_FLAGS(4,16,DIRECT,NO) },
+		{ fastPathSourceWord8_32,        CR_sourceWord, STD_FLAGS(8,32,DIRECT,NO) },
+
+		{ fastPathSourceWord1_2,         CR_sourceWord, STD_FLAGS(1,2,DIRECT,NO) },
+		{ fastPathSourceWord2_4,         CR_sourceWord, STD_FLAGS(2,4,DIRECT,NO) },
+		{ fastPathSourceWord4_8,         CR_sourceWord, STD_FLAGS(4,8,DIRECT,NO) },
+		{ fastPathSourceWord8_16,        CR_sourceWord, STD_FLAGS(8,16,DIRECT,NO) },
+		{ fastPathSourceWord16_32,       CR_sourceWord, STD_FLAGS(16,32,NO,NO) }, /* NOTE(review): third argument is NO here (and for 32->16 below) where the other depth-changing entries use DIRECT or 15BIT; presumably intentional -- confirm */
+
+		{ fastPathSourceWord1_1,         CR_sourceWord, STD_FLAGS(1,1,NO,NO) },
+		{ fastPathSourceWord2_2,         CR_sourceWord, STD_FLAGS(2,2,NO,NO) },
+		{ fastPathSourceWord4_4,         CR_sourceWord, STD_FLAGS(4,4,NO,NO) },
+		{ fastPathSourceWord8_8,         CR_sourceWord, STD_FLAGS(8,8,NO,NO) },
+		{ fastPathSourceWord16_16,       CR_sourceWord, STD_FLAGS(16,16,NO,NO) },
+		{ fastPathSourceWord32_32,       CR_sourceWord, STD_FLAGS(32,32,NO,NO) },
+
+		{ fastPathSourceWord2_1,         CR_sourceWord, STD_FLAGS(2,1,DIRECT,NO) },
+		{ fastPathSourceWord4_2,         CR_sourceWord, STD_FLAGS(4,2,DIRECT,NO) },
+		{ fastPathSourceWord8_4,         CR_sourceWord, STD_FLAGS(8,4,DIRECT,NO) },
+		{ fastPathSourceWord16_8,        CR_sourceWord, STD_FLAGS(16,8,DIRECT,NO) },
+		{ fastPathSourceWord32_16,       CR_sourceWord, STD_FLAGS(32,16,NO,NO) },
+
+		{ fastPathSourceWord4_1,         CR_sourceWord, STD_FLAGS(4,1,DIRECT,NO) },
+		{ fastPathSourceWord8_2,         CR_sourceWord, STD_FLAGS(8,2,DIRECT,NO) },
+		{ fastPathSourceWord16_4,        CR_sourceWord, STD_FLAGS(16,4,DIRECT,NO) },
+		{ fastPathSourceWord32_8,        CR_sourceWord, STD_FLAGS(32,8,15BIT,NO) },
+
+		{ fastPathSourceWord8_1,         CR_sourceWord, STD_FLAGS(8,1,DIRECT,NO) },
+		{ fastPathSourceWord16_2,        CR_sourceWord, STD_FLAGS(16,2,DIRECT,NO) },
+		{ fastPathSourceWord32_4,        CR_sourceWord, STD_FLAGS(32,4,15BIT,NO) },
+
+		{ fastPathSourceWord16_1,        CR_sourceWord, STD_FLAGS(16,1,DIRECT,NO) },
+		{ fastPathSourceWord32_2,        CR_sourceWord, STD_FLAGS(32,2,15BIT,NO) },
+
+		{ fastPathSourceWord32_1,        CR_sourceWord, STD_FLAGS(32,1,15BIT,NO) },
+
+		{ fastPathSourceWord0_1,         CR_sourceWord, STD_FLAGS_NO_SOURCE(1,NO) }, /* the src_bpp 0 wrappers use STD_FLAGS_NO_SOURCE(dst_bpp, halftone) instead of STD_FLAGS */
+		{ fastPathSourceWord0_1_scalar,  CR_sourceWord, STD_FLAGS_NO_SOURCE(1,SCALAR) },
+		{ fastPathSourceWord0_2,         CR_sourceWord, STD_FLAGS_NO_SOURCE(2,NO) },
+		{ fastPathSourceWord0_2_scalar,  CR_sourceWord, STD_FLAGS_NO_SOURCE(2,SCALAR) },
+		{ fastPathSourceWord0_4,         CR_sourceWord, STD_FLAGS_NO_SOURCE(4,NO) },
+		{ fastPathSourceWord0_4_scalar,  CR_sourceWord, STD_FLAGS_NO_SOURCE(4,SCALAR) },
+		{ fastPathSourceWord0_8,         CR_sourceWord, STD_FLAGS_NO_SOURCE(8,NO) },
+		{ fastPathSourceWord0_8_scalar,  CR_sourceWord, STD_FLAGS_NO_SOURCE(8,SCALAR) },
+		{ fastPathSourceWord0_16,        CR_sourceWord, STD_FLAGS_NO_SOURCE(16,NO) },
+		{ fastPathSourceWord0_16_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(16,SCALAR) },
+		{ fastPathSourceWord0_32,        CR_sourceWord, STD_FLAGS_NO_SOURCE(32,NO) },
+		{ fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) },
+
+		{ fastPathPixPaint1_1,           CR_pixPaint,   STD_FLAGS(1,1,NO,NO) },
+		{ fastPathPixPaint2_2,           CR_pixPaint,   STD_FLAGS(2,2,NO,NO) },
+		{ fastPathPixPaint4_4,           CR_pixPaint,   STD_FLAGS(4,4,NO,NO) },
+		{ fastPathPixPaint8_8,           CR_pixPaint,   STD_FLAGS(8,8,NO,NO) },
+		{ fastPathPixPaint16_16,         CR_pixPaint,   STD_FLAGS(16,16,NO,NO) },
+		{ fastPathPixPaint32_32,         CR_pixPaint,   STD_FLAGS(32,32,NO,NO) },
+
+		{ fastPathAlphaBlend32_32,       CR_alphaBlend, STD_FLAGS(32,32,NO,NO) },
+
+		{ fastPathBitAnd1_1,             CR_bitAnd,     STD_FLAGS(1,1,NO,NO) },
+		{ fastPathBitAnd2_2,             CR_bitAnd,     STD_FLAGS(2,2,NO,NO) },
+		{ fastPathBitAnd4_4,             CR_bitAnd,     STD_FLAGS(4,4,NO,NO) },
+		{ fastPathBitAnd8_8,             CR_bitAnd,     STD_FLAGS(8,8,NO,NO) },
+		{ fastPathBitAnd16_16,           CR_bitAnd,     STD_FLAGS(16,16,NO,NO) },
+		{ fastPathBitAnd32_32,           CR_bitAnd,     STD_FLAGS(32,32,NO,NO) },
+};
+
+void addArmSimdFastPaths(void) /* hands the whole fastPaths[] table to addFastPaths() */
+{
+	addFastPaths(fastPaths, sizeof fastPaths / sizeof *fastPaths); /* second argument is the number of table entries */
+}

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.h
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.h	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimd.h	2013-06-18 23:13:50 UTC (rev 2744)
@@ -0,0 +1,31 @@
+/*
+ * Copyright © 2013 Raspberry Pi Foundation
+ * Copyright © 2013 RISC OS Open Ltd
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of the copyright holders not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  The copyright holders make no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ */
+
+#ifndef BITBLTARMSIMD_H_
+#define BITBLTARMSIMD_H_
+
+void addArmSimdFastPaths(void); /* passes the ARM SIMD fastPaths table to addFastPaths() */
+
+#endif /* BITBLTARMSIMD_H_ */

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAlphaBlend.s
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAlphaBlend.s	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAlphaBlend.s	2013-06-18 23:13:50 UTC (rev 2744)
@@ -0,0 +1,141 @@
+;
+; Copyright © 2013 Raspberry Pi Foundation
+; Copyright © 2013 RISC OS Open Ltd
+;
+; Permission to use, copy, modify, distribute, and sell this software and its
+; documentation for any purpose is hereby granted without fee, provided that
+; the above copyright notice appear in all copies and that both that
+; copyright notice and this permission notice appear in supporting
+; documentation, and that the name of the copyright holders not be used in
+; advertising or publicity pertaining to distribution of the software without
+; specific, written prior permission.  The copyright holders make no
+; representations about the suitability of this software for any purpose.  It
+; is provided "as is" without express or implied warranty.
+;
+; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+; SOFTWARE.
+;
+
+; Debug options - each switch is {FALSE} unless the SETL line beneath it is uncommented
+                GBLL    DebugData       ; tested by the Print macro when its switch argument is Data
+;DebugData       SETL    {TRUE}
+                GBLL    DebugPld        ; tested by the Print macro when its switch argument is Pld
+;DebugPld        SETL    {TRUE}
+                GBLL    VerboseBuild
+;VerboseBuild    SETL    {TRUE}
+
+        GET     BitBltArmSimdAsm.hdr    ; shared flag values, assembly-time variables and helper macros
+
+        AREA    |BitBltArmSimdAlphaBlend$$Code|, CODE, READONLY
+        ARM
+
+; ********************************************************************
+
+        MACRO ; load the two constants that AlphaBlend32_32_1pixel reads: ht and ht_info
+        AlphaBlend32_32_init
+        MOV     ht_info, #1
+        MOV     ht, #0 ; all-zero operand for the SEL instructions
+        ORR     ht_info, ht_info, ht_info, LSL #16 ; &10001 - a 1 in each halfword
+        MEND
+
+        MACRO ; blend one source pixel over one destination pixel; the result is left in the src argument
+        AlphaBlend32_32_1pixel $src, $dst, $tmp0, $tmp1, $tmp2, $known_not_transp
+      [ "$known_not_transp" = "" ; when the flag is given, tmp2 must already hold the source alpha
+        MOVS    $tmp2, $src, LSR #24      ; s_a
+        BEQ     %FT09 ; fully transparent - use dst
+      ]
+        TEQ     $tmp2, #&FF
+        BEQ     %FT10 ; fully opaque - use src
+        UXTB    $tmp0, $src, ROR #8       ; s_ag (only bits 8-15 of src are extracted here)
+        ORR     $tmp0, $tmp0, #&FF0000    ; upper lane is set to &FF in place of the source alpha
+        UXTB16  $tmp1, $src               ; s_rb
+        MUL     $tmp0, $tmp0, $tmp2       ; scale both 16-bit lanes by s_a
+        MUL     $tmp1, $tmp1, $tmp2
+        RSB     $tmp2, $tmp2, #&FF        ; &FF - s_a, the weight applied to dst
+        UXTB16  $src, $dst, ROR #8        ; d_ag
+        UXTB16  $dst, $dst                ; d_rb
+        MLA     $src, $src, $tmp2, $tmp0  ; ag
+        MLA     $dst, $dst, $tmp2, $tmp1  ; rb
+        USUB16  $tmp0, $src, ht_info      ; result discarded; sets the GE flags for each lane that is nonzero
+        UXTAB16 $src, $src, $src, ROR #8  ; each lane x becomes x + (x >> 8)
+        SEL     $tmp1, ht_info, ht        ; 1 in each lane that was nonzero, 0 otherwise
+        UXTAB16 $src, $tmp1, $src, ROR #8 ; lane = (lane >> 8) + that 0 or 1, giving an 8-bit value
+        USUB16  $tmp0, $dst, ht_info      ; same reduction repeated for the rb lanes
+        UXTAB16 $dst, $dst, $dst, ROR #8
+        SEL     $tmp1, ht_info, ht
+        UXTAB16 $dst, $tmp1, $dst, ROR #8
+        ORR     $src, $dst, $src, LSL #8  ; recombine: rb in the even bytes, ag moved up to the odd bytes
+        B       %FT10
+09      MOV     $src, $dst ; transparent source: the result is the destination pixel
+10
+        MEND
+
+        MACRO ; one-pixel step. NOTE(review): takes three parameters where the 64bits macro takes two - confirm the caller matches
+        AlphaBlend32_32_32bits $src, $dst, $fixed_skew
+        Read1Word src, 0, carry, $fixed_skew, skew, scratch
+        ADD     dst, dst, #1*4 ; advance first; the pixel is then addressed at a negative offset
+        MOVS    $wk7, $wk0, LSR #24 ; source alpha; doubles as the tmp2 input of the 1pixel macro
+        BEQ     %FT01 ; all pixels fully transparent - don't touch destination
+        LDR     $wk4, [dst, #-1*4] ; current destination pixel
+        AlphaBlend32_32_1pixel $wk0, $wk4, $wk5, $wk6, $wk7, known_not_transp
+        Write1Word dst, 0
+01
+        MEND
+
+        MACRO ; two-pixel step; the write-back is skipped when both source alphas are zero
+        AlphaBlend32_32_64bits $src, $fixed_skew
+        Read2Words src, 0, carry, $fixed_skew, skew, scratch
+        ADD     dst, dst, #2*4 ; advance first; pixels are then addressed at negative offsets
+        MOVS    $wk7, $wk0, LSR #24
+        MOVEQS  $wk7, $wk1, LSR #24 ; only evaluated while every earlier alpha was zero
+        BEQ     %FT01 ; all pixels fully transparent - don't touch destination
+        LDR     $wk4, [dst, #-2*4] ; destination pixel 0
+        AlphaBlend32_32_1pixel $wk0, $wk4, $wk5, $wk6, $wk7
+        LDR     $wk4, [dst, #-1*4] ; destination pixel 1
+        AlphaBlend32_32_1pixel $wk1, $wk4, $wk5, $wk6, $wk7
+        Write2Words dst, 0
+01
+        MEND
+
+        MACRO ; first half of the four-pixel step: only the source read; intra_preloads is not used here
+        AlphaBlend32_32_128bits_head $src, $fixed_skew, $intra_preloads
+        Read4Words src, 0, carry, $fixed_skew, skew, scratch
+        MEND
+
+        MACRO ; second half of the four-pixel step: blend wk0-wk3 against the destination and write back
+        AlphaBlend32_32_128bits_tail $src
+        ADD     dst, dst, #4*4 ; advance first; pixels are then addressed at negative offsets
+        MOVS    $wk7, $wk0, LSR #24
+        MOVEQS  $wk7, $wk1, LSR #24 ; each MOVEQS only runs while every earlier alpha was zero
+        MOVEQS  $wk7, $wk2, LSR #24
+        MOVEQS  $wk7, $wk3, LSR #24
+        BEQ     %FT01 ; all pixels fully transparent - don't touch destination
+        LDR     $wk4, [dst, #-4*4] ; destination pixel 0
+        AlphaBlend32_32_1pixel $wk0, $wk4, $wk5, $wk6, $wk7
+        LDR     $wk4, [dst, #-3*4] ; destination pixel 1
+        AlphaBlend32_32_1pixel $wk1, $wk4, $wk5, $wk6, $wk7
+        LDR     $wk4, [dst, #-2*4] ; destination pixel 2
+        AlphaBlend32_32_1pixel $wk2, $wk4, $wk5, $wk6, $wk7
+        LDR     $wk4, [dst, #-1*4] ; destination pixel 3
+        AlphaBlend32_32_1pixel $wk3, $wk4, $wk5, $wk6, $wk7
+        Write4Words dst, 0
+01
+        MEND
+
+;$op     GenerateFunctions $src_bpp, $dst_w_bpp, $qualifier, $flags, $prefetch_distance,
+;        $work_regs, $line_saved_regs, $leading_pixels_reg, $preload_offset_reg, $init, $newline, $cleanup
+
+AlphaBlend GenerateFunctions 32, 32,, \
+  FLAG_DST_READWRITE :OR: FLAG_SPILL_LINE_VARS :OR: FLAG_PROCESS_PARALLEL :OR: FLAG_NO_PRELOAD_DST, 1, \
+  "stride_d,stride_s,map,bitptrs,skew,orig_w,scratch,carry", \
+  "x,stride_d,stride_s", bitptrs,, init ; leading_pixels_reg = bitptrs (4th work register, wk3); qualifier and preload_offset_reg left empty; newline and cleanup not supplied
+
+; ********************************************************************
+
+        END

Added: trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr
===================================================================
--- trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr	                        (rev 0)
+++ trunk/platforms/Cross/plugins/BitBltPlugin/BitBltArmSimdAsm.hdr	2013-06-18 23:13:50 UTC (rev 2744)
@@ -0,0 +1,1939 @@
+;
+; Copyright © 2013 Raspberry Pi Foundation
+; Copyright © 2013 RISC OS Open Ltd
+;
+; Permission to use, copy, modify, distribute, and sell this software and its
+; documentation for any purpose is hereby granted without fee, provided that
+; the above copyright notice appear in all copies and that both that
+; copyright notice and this permission notice appear in supporting
+; documentation, and that the name of the copyright holders not be used in
+; advertising or publicity pertaining to distribution of the software without
+; specific, written prior permission.  The copyright holders make no
+; representations about the suitability of this software for any purpose.  It
+; is provided "as is" without express or implied warranty.
+;
+; THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+; SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+; FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+; SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+; WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+; AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+; OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+; SOFTWARE.
+;
+
+      [ :LNOT: :DEF: DebugData ; declare the switch only if the including source has not already done so
+        GBLL    DebugData
+      ]
+      [ :LNOT: :DEF: DebugPld ; as above
+        GBLL    DebugPld
+      ]
+      [ :LNOT: :DEF: VerboseBuild ; as above
+        GBLL    VerboseBuild
+      ]
+
+; Flag bitfield definitions - combined with :OR: to form the flags argument of GenerateFunctions
+FLAG_NO_HALFTONE                *       0 :SHL: 0
+FLAG_SCALAR_HALFTONE            *       1 :SHL: 0
+FLAG_VECTOR_HALFTONE            *       2 :SHL: 0
+FLAG_NO_COLOUR_MAP              *       0 :SHL: 2
+FLAG_COLOUR_MAP                 *       1 :SHL: 2
+
+FLAG_DST_WRITEONLY              *       0 :SHL: 3
+FLAG_DST_READWRITE              *       1 :SHL: 3
+FLAG_SPILL_NO_LINE_VARS         *       0 :SHL: 4
+FLAG_SPILL_LINE_VARS_WIDE       *       1 :SHL: 4
+FLAG_SPILL_LINE_VARS_NON_WIDE   *       2 :SHL: 4
+FLAG_SPILL_LINE_VARS            *       3 :SHL: 4  ; both the WIDE and NON_WIDE bits together
+FLAG_EXPAND_SKEW                *       0 :SHL: 6
+FLAG_NO_EXPAND_SKEW             *       1 :SHL: 6
+FLAG_PROCESS_SERIAL             *       0 :SHL: 7  ; sub-word data is presented MS-aligned, and results are expected LS-aligned
+FLAG_PROCESS_PARALLEL           *       1 :SHL: 7  ; sub-word data retains its original alignment throughout (only useful if src & dest depths same)
+FLAG_MAX_128BIT_MACRO           *       0 :SHL: 8
+FLAG_MAX_256BIT_MACRO           *       1 :SHL: 8  ; particularly tight loops can sometimes benefit from being unrolled to allow 2x 128-bit blocks to be staggered
+FLAG_PRELOAD_DST                *       0 :SHL: 9
+FLAG_NO_PRELOAD_DST             *       1 :SHL: 9
+
+; Offsets into stack (byte offsets, held in assembly-time variables)
+        GBLA    args_stack_offset
+args_stack_offset       SETA    9*4 ; NOTE(review): presumably the size of the register block pushed on entry - confirm against the prologue
+        GBLA    locals_stack_offset
+locals_stack_offset     SETA    0
+
+; Top-level macro arguments are held in variables for convenience
+        GBLA    src_bpp
+        GBLA    dst_w_bpp
+        GBLA    flags
+        GBLA    prefetch_distance
+        GBLS    leading_pixels_reg
+        GBLS    preload_offset_reg
+        GBLS    line_saved_regs
+        GBLS    init
+        GBLS    newline
+        GBLS    reinitwk
+        GBLS    cleanup
+; Derived values
+        GBLS    prefix
+        GBLA    dst_r_bpp
+        GBLA    src_bpp_shift
+        GBLA    dst_bpp_shift
+        GBLL    sub_byte
+        GBLA    num_line_saved_regs
+        GBLA    pix_per_block
+
+; Work registers - variables so they can be reassigned between functions
+; (should always be assigned in increasing register number though)
+; AssignWk sets each wkN_num to a register number, or to -1 when slot N is unused
+        GBLA    wk0_num
+        GBLA    wk1_num
+        GBLA    wk2_num
+        GBLA    wk3_num
+        GBLA    wk4_num
+        GBLA    wk5_num
+        GBLA    wk6_num
+        GBLA    wk7_num
+        GBLA    wk8_num
+        GBLA    wk9_num
+        GBLA    wk10_num
+; String versions of the same
+        GBLS    wk0
+        GBLS    wk1
+        GBLS    wk2
+        GBLS    wk3
+        GBLS    wk4
+        GBLS    wk5
+        GBLS    wk6
+        GBLS    wk7
+        GBLS    wk8
+        GBLS    wk9
+        GBLS    wk10
+
+
+ [ DebugData :LOR: DebugPld
+        IMPORT  printf ; branched to by the Print macro
+ ]
+        GBLL    PrintAtStartOfLine ; {TRUE} when the next Print should emit its switch-name prefix
+PrintAtStartOfLine SETL {TRUE}
+        MACRO ; debug printf of up to three registers; assembles to nothing unless the matching Debug switch is {TRUE}
+        Print$cond $switch, $fmt, $reg0, $reg1, $reg2
+ [ Debug$switch
+      [ "$cond" <> "" :LAND: "$cond" <> "AL"
+        LCLS    opp
+opp     SETS    :REVERSE_CC: "$cond"
+        B$opp   %FT82 ; conditional form: skip the whole call on the opposite condition
+      ]
+        PUSH    {r12,r14}
+        PUSH    {r0-r12} ; stack now holds r0-r12 in words 0-12, followed by the r12,r14 pair
+        ADD     ip, sp, #15*4 ; the value sp had before the two pushes
+        STR     ip, [sp, #13*4] ; replace the duplicate r12 so that word N holds register N for N = 0-14
+        MRS     v1, CPSR ; flags are restored from v1 at label 81 after the call
+      [ "$reg0" <> ""
+        LDR     a2, [sp, #:RCONST:$reg0 * 4] ; saved value of the first register to print
+      ]
+      [ "$reg1" <> ""
+        LDR     a3, [sp, #:RCONST:$reg1 * 4] ; saved value of the second register to print
+      ]
+      [ "$reg2" <> ""
+        LDR     a4, [sp, #:RCONST:$reg2 * 4] ; saved value of the third register to print
+      ]
+        ADR     a1, %FT80 ; format string assembled inline below
+        ADR     lr, %FT81 ; return address placed after the inline string
+        B       printf
+80
+      [ PrintAtStartOfLine
+        =       "$switch: "
+      ]
+        =       "$fmt", 0
+PrintAtStartOfLine SETL "$fmt" :RIGHT: 1 = "\n" ; the next Print is prefixed only if this format ended the line
+        ALIGN
+81      MSR     CPSR_cxsf, v1
+        POP     {r0-r12}
+        ADD     sp, sp, #4 ; step over the word that held the saved sp
+        POP     {r14}
+82
+ ]
+        MEND
+
+ [ :LNOT: :DEF: |objasm$version| :LAND: :LNOT: :DEF: |ads$version|
+ ; Assume asasm, which is lacking a number of key opcodes; each macro below builds the instruction word with DCI
+ ; Note there's a bug in asasm, the CC_ENCODING value shouldn't need shifting
+
+        MACRO ; SEL: condition, Rn, Rd and Rm fields ORed into base word &06800FB0
+$label  SEL$cond $Rd, $Rn, $Rm
+$label  DCI     :CC_ENCODING:"$cond":SHL:28 :OR: &06800FB0 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0)
+        MEND
+
+        MACRO ; UADD8: same field layout, base word &06500F90
+$label  UADD8$cond $Rd, $Rn, $Rm
+$label  DCI     :CC_ENCODING:"$cond":SHL:28 :OR: &06500F90 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0)
+        MEND
+
+        MACRO ; USUB8: same field layout, base word &06500FF0
+$label  USUB8$cond $Rd, $Rn, $Rm
+$label  DCI     :CC_ENCODING:"$cond":SHL:28 :OR: &06500FF0 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0)
+        MEND
+
+        MACRO ; USUB16: same field layout, base word &06500F70
+$label  USUB16$cond $Rd, $Rn, $Rm
+$label  DCI     :CC_ENCODING:"$cond":SHL:28 :OR: &06500F70 :OR: (:RCONST:$Rn :SHL: 16) :OR: (:RCONST:$Rd :SHL: 12) :OR: (:RCONST:$Rm :SHL: 0)
+        MEND
+
+        MACRO ; SETEND: accepts only LE or BE and emits a fixed word for each; anything else is reported
+$label  SETEND $endian
+        IF "$endian" = "LE"
+$label  DCI     &F1010000
+        ELIF "$endian" = "BE"
+$label  DCI     &F1010200
+        ELSE
+        !       1, "Unrecognised SETEND endianness"
+        ENDIF
+        MEND
+
+ ]
+
+; Add a constant, using a minimal number of ARM instructions
+; Doesn't handle cases where bit 31 of constant is set, but we're not expecting any of those
+; A constant of zero is emitted as a single ADD of #0, so dst still receives src
+        MACRO
+$lab    AddL    $dst, $src, $const
+        LCLA    tmp
+tmp     SETA    $const
+        ; Widen every set bit to its aligned bit pair, so chunks start on an even bit position
+tmp     SETA    tmp :OR: (((tmp :AND: &55555555) :SHL: 1) + ((tmp :AND: &AAAAAAAA) :SHR: 1))
+        LCLA    lsb
+        ; Lowest set bit of the widened value: where the first 8-bit chunk starts
+lsb     SETA    tmp :AND::NOT: (tmp-1)
+        ; Smear the highest set bit downwards so that every lower bit is set too
+tmp     SETA    tmp :OR: (tmp :SHR: 2)
+tmp     SETA    tmp :OR: (tmp :SHR: 4)
+tmp     SETA    tmp :OR: (tmp :SHR: 8)
+tmp     SETA    tmp :OR: (tmp :SHR: 16)
+        LCLA    msb
+        ; The single bit just above the highest set bit: the loop bound
+msb     SETA    (tmp+1) :AND::NOT: tmp
+        LCLS    reg
+reg     SETS    "$src"
+$lab
+      [ lsb = 0
+        ; Zero constant: lsb is 0 and msb is 1, and multiplying lsb by 256 leaves it at 0,
+        ; so the loop below would never terminate. Emit its one instruction and skip it.
+        ADD     $dst, $src, #0
+lsb     SETA    msb
+      ]
+        ; One ADD per 8-bit chunk; after the first, the running total is read back from dst
+        WHILE   lsb < msb
+        ADD     $dst, $reg, #($const) :AND: (lsb * &FF)
+lsb     SETA    lsb * 256
+reg     SETS    "$dst"
+        WEND
+        MEND
+
+; Find log2 of a variable, rounded down; an input of 0 gives -1
+        MACRO
+$out    Log2    $in
+      [ $in = 0
+$out    SETA    -1
+      |
+        LCLA    tmp
+tmp     SETA    $in
+$out    SETA    0
+        WHILE   tmp > 1 ; count the halvings needed to bring the value down to 1
+tmp     SETA    tmp / 2
+$out    SETA    $out + 1
+        WEND
+      ]
+        MEND
+
+; Find max of two numbers; the result is stored in the arithmetic variable named by the label
+        MACRO
+$out    Max     $a, $b
+      [ $a > $b
+$out    SETA    $a
+      |
+$out    SETA    $b
+      ]
+        MEND
+
+; Find if an integer is the last member of its group, where each group is a run of
+; size consecutive integers starting at a multiple of size, and size is a power of 2
+        MACRO
+$result IsEndOfGroup $index, $size
+        LCLA    index
+index   SETA    $index
+        LCLA    size
+size    SETA    $size
+      [ size < 2 ; groups of one: every index ends its group
+$result SETL    {TRUE}
+      |
+$result SETL    (index :AND::NOT: (index + 1)) :AND: (size / 2) > 0 ; the first operand isolates the trailing 1 bits of index
+      ]
+        MEND
+
+; Convert an integer to a decimal string, stored in the string variable named by the label
+        MACRO
+$str    DecimalStr $num
+        LCLA    n
+n       SETA    $num
+$str    SETS    ""
+        WHILE   n <> 0 ; peel off digits from the least significant end
+$str    SETS    :CHR:(48 + n % 10) :CC: $str
+n       SETA    n / 10
+        WEND
+     IF :LEN: $str = 0 ; the loop emits nothing for zero, so supply the digit here
+$str    SETS    "0"
+  ENDIF
+        MEND
+
+; Convert a wk register index into the name of the physical register
+; (the current contents of the matching global string variable wk0 - wk10)
+        MACRO
+$str    LookupWk $index
+        LCLS    wk
+wk      DecimalStr $index
+wk      SETS    "wk$wk"
+$str    SETS    $wk
+        MEND
+
+; Assign the wk registers from a comma-separated list of registers: the Nth entry sets
+; wkN_num to its register number and wkN to the matching r-name; unused slots up to wk10 are poisoned
+        MACRO
+        AssignWk $list
+        LCLA    wk_num
+        LCLS    wk
+        LCLS    tail
+        LCLS    reg
+wk_num  SETA    0
+tail    SETS    "$list,"
+        WHILE   :LEN: tail > 0 ; one pass per list entry
+wk      DecimalStr wk_num
+wk_num  SETA    wk_num + 1
+reg     SETS    ""
+        WHILE   tail :LEFT: 1 <> "," ; move characters from tail to reg up to the next comma
+reg     SETS    reg :CC: (tail :LEFT: 1)
+tail    SETS    tail :RIGHT: (:LEN:tail - 1)
+        WEND
+tail    SETS    tail :RIGHT: (:LEN:tail - 1)
+wk$wk._num SETA :RCONST: $reg
+wk$wk   DecimalStr wk$wk._num
+wk$wk   SETS    "r" :CC: wk$wk
+        WEND
+        ; Ensure the remaining ones aren't used
+        WHILE   wk_num <= 10
+wk      DecimalStr wk_num
+wk_num  SETA    wk_num + 1
+wk$wk._num SETA -1
+wk$wk   SETS    "invalid_register_wk$wk"
+        WEND
+        MEND
+
+; See if a given register name is in a comma-separated list of registers
+; (exact textual match of a whole list entry; the result goes in the logical variable named by the label)
+        MACRO
+$out    RegIsInList $reg, $list
+        LCLS    tail
+tail    SETS    "$list,"
+        WHILE   :LEN: tail > 0
+     [ :LEN: "$reg," <= :LEN: tail ; guard so :LEFT: is never asked for more characters than remain
+      [ "$reg," = tail :LEFT: :LEN: "$reg,"
+$out    SETL    {TRUE}
+        MEXIT
+      ]
+     ]
+        WHILE   tail :LEFT: 1 <> "," ; discard the rest of the current entry
+tail    SETS    tail :RIGHT: (:LEN:tail - 1)
+        WEND
+tail    SETS    tail :RIGHT: (:LEN:tail - 1)
+        WEND
+$out    SETL    {FALSE}
+        MEND
+
+; Count how many registers are in a comma-separated list of registers
+; (computed as the number of commas plus one, so an empty list also gives 1)
+        MACRO
+$out    CountRegsInList $list
+$out    SETA    1
+        LCLS    tail
+tail    SETS    "$list"
+        WHILE   :LEN: tail > 0 ; consume one character per pass
+      [ tail :LEFT: 1 = ","
+$out    SETA    $out + 1
+      ]
+tail    SETS    tail :RIGHT: (:LEN:tail -1)
+        WEND
+        MEND
+
+; Data read macros
+
+        MACRO ; conditionally load up to two words from base (post-incremented by 4 each) and merge them by the skew
+$lab    ReadFirstSubWord $base, $data, $carry, $pixels, $fixed_skew, $skew, $tmp
+$lab
+ [ src_bpp > 0 :LAND: src_bpp < 32
+        LCLS    reg0
+reg0    LookupWk $data
+     IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0
+      [ "$pixels" <> "#0"
+        AND     $tmp, $pixels, #32/src_bpp - 1 ; pixel count modulo the number of pixels in a word
+        CMP     $tmp, $skew, LSR #src_bpp_shift ; run-time skew variant: the skew comes from a register
+        PrintHI Data, "ReadFirstSubWord: left@%p", $base
+        LDRHI   $reg0, [$base], #4
+        PrintHI Data, " %08X\n", $reg0
+      ]
+        CMP     $skew, #0
+        PrintHI Data, "ReadFirstSubWord: right@%p", $base
+        LDRHI   $carry, [$base], #4 ; the right-hand word is only fetched for a nonzero skew
+        PrintHI Data, " %08X\n", $carry
+        CMP     $tmp, #0 ; NOTE(review): when pixels is the literal #0 the AND above is not assembled, so tmp is whatever the caller left - confirm
+        BEQ     %FT01
+        RSB     $tmp, $skew, #32
+        MOV     $reg0, $reg0, LSL $skew
+        ORR     $reg0, $reg0, $carry, LSR $tmp ; left word shifted up, right word filling in from below
+        Print   Data, "ReadFirstSubWord: skew %u -> %08X\n", $skew, $reg0
+      [ flags :AND: FLAG_PROCESS_PARALLEL = 0 :LAND: "$pixels" <> "#0"
+        AND     $tmp, $pixels, #32/src_bpp - 1
+        MOV     $tmp, $tmp, LSL #src_bpp_shift
+        MOV     $reg0, $reg0, ROR $tmp ; serial processing only: rotate right by tmp shifted left by src_bpp_shift
+      ]
+01
+   ELIF $fixed_skew == 0
+     [ "$pixels" <> "#0"
+        ANDS    $tmp, $pixels, #32/src_bpp - 1 ; zero-skew variant: at most one word is loaded, and no carry
+      [ flags :AND: FLAG_PROCESS_PARALLEL = 0
+        BEQ     %FT01
+        Print   Data, "ReadFirstSubWord: left@%p", $base
+        LDR     $reg0, [$base], #4
+        Print   Data, " %08X\n", $reg0
+        MOV     $tmp, $tmp, LSL #src_bpp_shift
+        MOV     $reg0, $reg0, ROR $tmp
+01
+      |
+        PrintNE Data, "ReadFirstSubWord: left@%p", $base
+        LDRNE   $reg0, [$base], #4 ; parallel processing: same conditional load, but no rotate
+        PrintNE Data, " %08X\n", $reg0
+      ]
+     ]
+   ELSE
+      [ "$pixels" <> "#0"
+        AND     $tmp, $pixels, #32/src_bpp - 1
+        CMP     $tmp, #$fixed_skew/src_bpp ; fixed nonzero skew variant: the skew is an assembly-time constant
+        PrintHI Data, "ReadFirstSubWord: left@%p", $base
+        LDRHI   $reg0, [$base], #4
+        PrintHI Data, " %08X\n", $reg0
+      ]
+        Print   Data, "ReadFirstSubWord: right@%p", $base
+        LDR     $carry, [$base], #4 ; always fetched in this variant
+        Print   Data, " %08X\n", $carry
+        CMP     $tmp, #0 ; NOTE(review): tmp is likewise not loaded here when pixels is the literal #0 - confirm
+        BEQ     %FT01
+        MOV     $reg0, $reg0, LSL #$fixed_skew
+        ORR     $reg0, $reg0, $carry, LSR #32-$fixed_skew
+        Print   Data, "ReadFirstSubWord: skew $fixed_skew -> %08X\n", $reg0
+      [ flags :AND: FLAG_PROCESS_PARALLEL = 0
+        MOV     $tmp, $tmp, LSL #src_bpp_shift
+        MOV     $reg0, $reg0, ROR $tmp
+      ]
+01
+  ENDIF
+ ]
+        MEND
+
+        MACRO
+$lab    ReadLastSubWord $base, $data, $carry, $pixels, $fixed_skew, $skew, $tmp
+$lab
+ [ src_bpp > 0 :LAND: src_bpp < 32
+        LCLS    reg0
+reg0    LookupWk $data
+     IF (flags :AND: FLAG_NO_EXPAND_SKEW) > 0
+        CMP     $skew, #0
+        BHI     %FT01
+        TST     $pixels, #32/src_bpp - 1
+        PrintNE Data, "ReadLastSubWord: next@%p", $base
+        LDRNE   $reg0, [$base], #4
+        PrintNE Data, " %08X\n", $reg0
+        B       %FT02
+01
+        Print   Data, "ReadLastSubWord: left %08X\n", $carry
+        MOV     $reg0, $carry, LSL $skew
+        AND     $tmp, $pixels, #32/src_bpp - 1
+        RSB     $tmp, $tmp, #32/src_bpp
+        CMP     $tmp, $skew, LSR #src_bpp_shift
+        BHS     %FT02
+        Print   Data, "ReadLastSubWord: right@%p", $base
+        LDR     $carry, [$base], #4

@@ Diff output truncated at 50000 characters. @@


More information about the Vm-dev mailing list