[Vm-dev] [commit][2767] use version with fast bitblt support

commits at squeakvm.org commits at squeakvm.org
Thu Aug 15 14:21:39 UTC 2013


Revision: 2767
Author:   piumarta
Date:     2013-08-15 07:21:39 -0700 (Thu, 15 Aug 2013)
Log Message:
-----------
use version with fast bitblt support

Modified Paths:
--------------
    trunk/platforms/unix/src/vm/intplugins/BitBltPlugin/BitBltPlugin.c

Modified: trunk/platforms/unix/src/vm/intplugins/BitBltPlugin/BitBltPlugin.c
===================================================================
--- trunk/platforms/unix/src/vm/intplugins/BitBltPlugin/BitBltPlugin.c	2013-08-15 14:17:30 UTC (rev 2766)
+++ trunk/platforms/unix/src/vm/intplugins/BitBltPlugin/BitBltPlugin.c	2013-08-15 14:21:39 UTC (rev 2767)
@@ -1,7 +1,14 @@
-/* Automatically generated from Squeak on 30 July 2012 4:52:36 pm 
-   by VMMaker 4.9.8
+/* Smalltalk from Squeak4.4 with VMMaker 4.12.5 translated as C source on 25 June 2013 12:57:59 pm */
+/* Automatically generated by
+	SmartSyntaxPluginCodeGenerator VMMaker-dtl.322 uuid: 8c6464c8-9bdc-40bc-90fe-4bf2b642ff2d
+   from
+	BitBltSimulation VMMaker-dtl.322 uuid: 8c6464c8-9bdc-40bc-90fe-4bf2b642ff2d
  */
+static char __buildInfo[] = "BitBltSimulation VMMaker-dtl.322 uuid: 8c6464c8-9bdc-40bc-90fe-4bf2b642ff2d " __DATE__ ;
 
+
+
+
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -27,6 +34,12 @@
 // was #undef EXPORT(returnType) but screws NorCroft cc
 #define EXPORT(returnType) static returnType
 #endif
+#ifdef ENABLE_FAST_BLT
+#include "BitBltDispatch.h"
+#else
+// to handle the unavoidable decl in the spec of copyBitsFallback();
+#define operation_t void
+#endif
 
 #include "sqMemoryAccess.h"
 
@@ -93,8 +106,14 @@
 static sqInt clipRange(void);
 #pragma export on
 EXPORT(sqInt) copyBits(void);
+#pragma export off
+void copyBitsFallback(operation_t *op, unsigned int flags);
+static sqInt copyBitsFastPathSpecialised(void);
+#pragma export on
 EXPORT(sqInt) copyBitsFromtoat(sqInt startX, sqInt stopX, sqInt yValue);
 #pragma export off
+static sqInt copyBitsLockedAndClipped(void);
+static sqInt copyBitsRule41Test(void);
 static sqInt copyLoop(void);
 static sqInt copyLoopNoSource(void);
 static sqInt copyLoopPixMap(void);
@@ -137,6 +156,7 @@
 EXPORT(sqInt) primitiveCopyBits(void);
 EXPORT(sqInt) primitiveDisplayString(void);
 EXPORT(sqInt) primitiveDrawLoop(void);
+EXPORT(sqInt) primitivePixelValueAt(void);
 EXPORT(sqInt) primitiveWarpBits(void);
 #pragma export off
 static sqInt queryDestSurface(sqInt handle);
@@ -237,9 +257,9 @@
 };
 static const char *moduleName =
 #ifdef SQUEAK_BUILTIN_PLUGIN
-	"BitBltPlugin 30 July 2012 (i)"
+	"BitBltPlugin 25 June 2013 (i)"
 #else
-	"BitBltPlugin 30 July 2012 (e)"
+	"BitBltPlugin 25 June 2013 (e)"
 #endif
 ;
 static sqInt nWords;
@@ -286,8 +306,8 @@
 	values obtained from the left and right fringes. */
 
 static sqInt OLDrgbDiffwith(sqInt sourceWord, sqInt destinationWord) {
-    sqInt diff;
-    sqInt pixMask;
+	sqInt diff;
+	sqInt pixMask;
 
 	if (destDepth < 16) {
 
@@ -325,22 +345,22 @@
 	values obtained from the left and right fringes. */
 
 static sqInt OLDtallyIntoMapwith(sqInt sourceWord, sqInt destinationWord) {
-    sqInt i;
-    sqInt mapIndex;
-    sqInt pixMask;
-    sqInt shiftWord;
-    sqInt d;
-    sqInt destPix;
-    sqInt mask;
-    sqInt srcPix;
-    sqInt d1;
-    sqInt destPix1;
-    sqInt mask3;
-    sqInt srcPix1;
-    sqInt d2;
-    sqInt destPix2;
-    sqInt mask4;
-    sqInt srcPix2;
+	sqInt pixMask;
+	sqInt mapIndex;
+	sqInt i;
+	sqInt shiftWord;
+	sqInt d;
+	sqInt destPix;
+	sqInt srcPix;
+	sqInt mask;
+	sqInt d1;
+	sqInt destPix1;
+	sqInt srcPix1;
+	sqInt mask3;
+	sqInt d2;
+	sqInt destPix2;
+	sqInt srcPix2;
+	sqInt mask4;
 
 	if (!((cmFlags & (ColorMapPresent | ColorMapIndexedPart)) == (ColorMapPresent | ColorMapIndexedPart))) {
 		return destinationWord;
@@ -359,6 +379,9 @@
 		return destinationWord;
 	}
 	if (destDepth == 16) {
+
+		/* Two pixels  Tally the right half... */
+
 		/* begin rgbMap:from:to: */
 		if (((d = cmBitsPerColor - 5)) > 0) {
 			mask = (1 << 5) - 1;
@@ -446,6 +469,9 @@
 	l2:	/* end rgbMap:from:to: */;
 		cmLookupTable[mapIndex & cmMask] = ((cmLookupTable[mapIndex & cmMask]) + 1);
 	} else {
+
+		/* Just one pixel. */
+
 		/* begin rgbMap:from:to: */
 		if (((d2 = cmBitsPerColor - 8)) > 0) {
 			mask4 = (1 << 8) - 1;
@@ -506,12 +532,12 @@
 	component.  The high byte of the result will be 0. */
 
 static sqInt alphaBlendwith(sqInt sourceWord, sqInt destinationWord) {
-    sqInt alpha;
-    sqInt blend;
-    sqInt colorMask;
-    sqInt result;
-    sqInt shift;
-    sqInt unAlpha;
+	sqInt unAlpha;
+	sqInt colorMask;
+	sqInt blend;
+	sqInt result;
+	sqInt shift;
+	sqInt alpha;
 
 
 	/* High 8 bits of source pixel */
@@ -572,21 +598,21 @@
 	colormaps, as is the case with WarpBlt. */
 
 static sqInt alphaBlendConstwithpaintMode(sqInt sourceWord, sqInt destinationWord, sqInt paintMode) {
-    sqInt bitsPerColor;
-    sqInt blend;
-    sqInt destPixVal;
-    sqInt destShifted;
-    sqInt i;
-    sqInt j;
-    sqInt maskShifted;
-    sqInt pixBlend;
-    sqInt pixMask;
-    sqInt result;
-    sqInt rgbMask;
-    sqInt shift;
-    sqInt sourcePixVal;
-    sqInt sourceShifted;
-    sqInt unAlpha;
+	sqInt sourcePixVal;
+	sqInt j;
+	sqInt bitsPerColor;
+	sqInt unAlpha;
+	sqInt sourceShifted;
+	sqInt pixMask;
+	sqInt blend;
+	sqInt rgbMask;
+	sqInt result;
+	sqInt shift;
+	sqInt destShifted;
+	sqInt maskShifted;
+	sqInt i;
+	sqInt destPixVal;
+	sqInt pixBlend;
 
 	if (destDepth < 16) {
 		return destinationWord;
@@ -604,9 +630,11 @@
 	sourceShifted = sourceWord;
 	result = destinationWord;
 	if (destPPW == 1) {
+
+		/* 32bpp blends include alpha */
+
 		if (!(paintMode && (sourceWord == 0))) {
 
-			/* 32bpp blends include alpha */
 			/* painting a transparent pixel */
 
 			result = 0;
@@ -651,16 +679,18 @@
 	e.g., it is assumed that the source color is already scaled. */
 
 static sqInt alphaBlendScaledwith(sqInt sourceWord, sqInt destinationWord) {
-    sqInt a;
-    sqInt b;
-    sqInt dstMask;
-    sqInt g;
-    sqInt r;
-    sqInt srcMask;
-    sqInt unAlpha;
+	sqInt unAlpha;
+	sqInt g;
+	sqInt srcMask;
+	sqInt a;
+	sqInt dstMask;
+	sqInt r;
+	sqInt b;
 
 
 	/* Do NOT inline this into optimized loops */
+
+
 	/* High 8 bits of source pixel */
 
 	unAlpha = 255 - (((usqInt) sourceWord) >> 24);
@@ -707,27 +737,29 @@
 	 */
 
 static sqInt alphaSourceBlendBits16(void) {
-    sqInt deltaX;
-    sqInt deltaY;
-    sqInt destWord;
-    sqInt ditherBase;
-    sqInt ditherIndex;
-    sqInt ditherThreshold;
-    sqInt dstIndex;
-    sqInt dstMask;
-    sqInt dstY;
-    sqInt sourceWord;
-    sqInt srcAlpha;
-    sqInt srcIndex;
-    sqInt srcShift;
-    sqInt srcY;
-    sqInt addThreshold;
-    sqInt addThreshold1;
-    sqInt dstValue;
-    sqInt dstValue1;
+	sqInt ditherBase;
+	sqInt ditherThreshold;
+	sqInt srcShift;
+	sqInt sourceWord;
+	sqInt srcIndex;
+	sqInt deltaX;
+	sqInt dstIndex;
+	sqInt srcAlpha;
+	sqInt dstMask;
+	sqInt deltaY;
+	sqInt srcY;
+	sqInt destWord;
+	sqInt dstY;
+	sqInt ditherIndex;
+	sqInt addThreshold;
+	sqInt addThreshold1;
+	sqInt dstValue;
+	sqInt dstValue1;
 
 
 	/* This particular method should be optimized in itself */
+
+
 	/* So we can pre-decrement */
 
 	deltaY = bbH + 1;
@@ -764,6 +796,9 @@
 			sourceWord = long32At(srcIndex);
 			srcAlpha = ((usqInt) sourceWord) >> 24;
 			if (srcAlpha == 255) {
+
+				/* Dither from 32 to 16 bit */
+
 				/* begin dither32To16:threshold: */
 				addThreshold = ((usqInt) ditherThreshold << 8);
 				sourceWord = ((((usqInt) (dither8Lookup[addThreshold + ((((usqInt) sourceWord >> 16)) & 255)]) << 10)) + (((usqInt) (dither8Lookup[addThreshold + ((((usqInt) sourceWord >> 8)) & 255)]) << 5))) + (dither8Lookup[addThreshold + (sourceWord & 255)]);
@@ -778,9 +813,11 @@
 				dstValue = dstValue | sourceWord;
 				long32Atput(dstIndex, dstValue);
 			} else {
+
+				/* srcAlpha ~= 255 */
+
 				if (!(srcAlpha == 0)) {
 
-					/* srcAlpha ~= 255 */
 					/* 0 < srcAlpha < 255 */
 					/* If we have to mix colors then just copy a single word */
 
@@ -844,15 +881,15 @@
 	 */
 
 static sqInt alphaSourceBlendBits32(void) {
-    sqInt deltaX;
-    sqInt deltaY;
-    sqInt destWord;
-    sqInt dstIndex;
-    sqInt dstY;
-    sqInt sourceWord;
-    sqInt srcAlpha;
-    sqInt srcIndex;
-    sqInt srcY;
+	sqInt sourceWord;
+	sqInt srcIndex;
+	sqInt deltaX;
+	sqInt dstIndex;
+	sqInt srcAlpha;
+	sqInt deltaY;
+	sqInt srcY;
+	sqInt destWord;
+	sqInt dstY;
 
 
 	/* This particular method should be optimized in itself */
@@ -862,6 +899,8 @@
 	(good to know on an Intel architecture) but then the increments
 	would be different between ST code and C code so must hope the
 	compiler notices what happens (MS Visual C does) */
+
+
 	/* So we can pre-decrement */
 
 	deltaY = bbH + 1;
@@ -895,10 +934,10 @@
 				}
 				deltaX += 1;
 			} else {
-				if (srcAlpha == 0) {
 
-					/* srcAlpha ~= 255 */
+				/* srcAlpha ~= 255 */
 
+				if (srcAlpha == 0) {
 					srcIndex += 4;
 
 					/* Now skip as many words as possible, */
@@ -937,23 +976,23 @@
 	 */
 
 static sqInt alphaSourceBlendBits8(void) {
-    sqInt adjust;
-    sqInt deltaX;
-    sqInt deltaY;
-    sqInt destWord;
-    sqInt dstIndex;
-    sqInt dstMask;
-    sqInt dstY;
-    sqInt mapperFlags;
-    unsigned int *mappingTable;
-    sqInt sourceWord;
-    sqInt srcAlpha;
-    sqInt srcIndex;
-    sqInt srcShift;
-    sqInt srcY;
-    sqInt pv;
-    sqInt val;
-    sqInt dstValue;
+	sqInt srcShift;
+	sqInt sourceWord;
+	sqInt srcIndex;
+	sqInt deltaX;
+	unsigned int *mappingTable;
+	sqInt dstIndex;
+	sqInt adjust;
+	sqInt mapperFlags;
+	sqInt srcAlpha;
+	sqInt dstMask;
+	sqInt deltaY;
+	sqInt srcY;
+	sqInt destWord;
+	sqInt dstY;
+	sqInt pv;
+	sqInt val;
+	sqInt dstValue;
 
 	mappingTable = default8To32Table();
 	mapperFlags = cmFlags & (~ColorMapNewStyle);
@@ -993,9 +1032,11 @@
 			sourceWord = ((long32At(srcIndex)) & (~adjust)) + adjust;
 			srcAlpha = ((usqInt) sourceWord) >> 24;
 			if (srcAlpha > 31) {
+
+				/* Everything below 31 is transparent */
+
 				if (srcAlpha < 224) {
 
-					/* Everything below 31 is transparent */
 					/* Everything above 224 is opaque */
 
 					destWord = long32At(dstIndex);
@@ -1165,18 +1206,6 @@
 /*	This function is exported for the Balloon engine */
 
 EXPORT(sqInt) copyBits(void) {
-    sqInt done;
-    sqInt gammaLookupTableOop;
-    sqInt ungammaLookupTableOop;
-    sqInt t;
-    sqInt endBits;
-    sqInt pixPerM1;
-    sqInt startBits;
-    sqInt dWid;
-    sqInt dxLowBits;
-    sqInt pixPerM11;
-    sqInt sxLowBits;
-
 	clipRange();
 	if ((bbW <= 0) || (bbH <= 0)) {
 
@@ -1188,62 +1217,372 @@
 	if (!(lockSurfaces())) {
 		return interpreterProxy->primitiveFail();
 	}
-	/* begin copyBitsLockedAndClipped */
+	
+# ifdef ENABLE_FAST_BLT  // // you really, really mustn't call this unless you have the rest of the code to link to
+	copyBitsFastPathSpecialised();
+# else
+	copyBitsLockedAndClipped();
+# endif  // ENABLE_FAST_BLT
+	
+	unlockSurfaces();
+}
+
+
+/*	Recover from the fast path specialised code saying Help-I-cant-cope */
+
+void copyBitsFallback(operation_t *op, unsigned int flags) {
+	sqInt done;
+	sqInt t;
+	sqInt endBits;
+	sqInt startBits;
+	sqInt pixPerM1;
+	sqInt dxLowBits;
+	sqInt sxLowBits;
+	sqInt dWid;
+	sqInt pixPerM11;
+
+	
+# ifdef ENABLE_FAST_BLT  // only for fast blt platform specific code
+
+	/* recover values from the operation struct used by the fast ARM code */
+
+	
+	combinationRule = op->combinationRule;
+	noSource = op->noSource;
+	sourceBits = (sqInt) op->src.bits;
+	sourcePitch = op->src.pitch;
+	sourceDepth = op->src.depth;
+	sourceMSB = op->src.msb;
+	sx = op->src.x;
+	sy = op->src.y;
+	destBits = (sqInt) op->dest.bits;
+	destPitch = op->dest.pitch;
+	destDepth = op->dest.depth;
+	destMSB = op->dest.msb;
+	dx = op->dest.x;
+	dy = op->dest.y;
+	bbW = op->width;
+	bbH = op->height;
+	cmFlags = op->cmFlags;
+	cmShiftTable = (void *) op->cmShiftTable;
+	cmMaskTable = (void *) op->cmMaskTable;
+	cmMask = op->cmMask;
+	cmLookupTable = (void *) op->cmLookupTable;
+	noHalftone = op->noHalftone;
+	halftoneHeight = op->halftoneHeight;
+	halftoneBase = (sqInt) op->halftoneBase;
+	if (combinationRule == 30 || combinationRule == 31) {
+		sourceAlpha = op->opt.sourceAlpha;
+	}
 	if (combinationRule == 41) {
-		componentAlphaModeAlpha = 255;
-		componentAlphaModeColor = 16777215;
-		gammaLookupTable = null;
-		ungammaLookupTable = null;
-		if ((interpreterProxy->methodArgumentCount()) >= 2) {
-			componentAlphaModeAlpha = interpreterProxy->stackIntegerValue((interpreterProxy->methodArgumentCount()) - 2);
-			if (!(!(interpreterProxy->failed()))) {
-				interpreterProxy->primitiveFail();
-				goto l1;
-			}
-			componentAlphaModeColor = interpreterProxy->stackIntegerValue((interpreterProxy->methodArgumentCount()) - 1);
-			if (!(!(interpreterProxy->failed()))) {
-				interpreterProxy->primitiveFail();
-				goto l1;
-			}
-			if ((interpreterProxy->methodArgumentCount()) == 4) {
-				gammaLookupTableOop = interpreterProxy->stackObjectValue(1);
-				if (interpreterProxy->isBytes(gammaLookupTableOop)) {
-					gammaLookupTable = interpreterProxy->firstIndexableField(gammaLookupTableOop);
+		componentAlphaModeColor = op->opt.componentAlpha.componentAlphaModeColor;
+		componentAlphaModeAlpha = op->opt.componentAlpha.componentAlphaModeAlpha;
+		gammaLookupTable = (void *) op->opt.componentAlpha.gammaLookupTable;
+		ungammaLookupTable = (void *) op->opt.componentAlpha.ungammaLookupTable;
+	}
+	destPPW = 32 / destDepth;
+	cmBitsPerColor = 0;
+	if (cmMask == 511) {
+		cmBitsPerColor = 3;
+	}
+	if (cmMask == 4095) {
+		cmBitsPerColor = 4;
+	}
+	if (cmMask == 16383) {
+		cmBitsPerColor = 5;
+	}
+	/* begin tryCopyingBitsQuickly */
+	if (noSource) {
+		done = 0;
+		goto l1;
+	}
+	if (!((combinationRule == 34) || (combinationRule == 41))) {
+		done = 0;
+		goto l1;
+	}
+	if (!(sourceDepth == 32)) {
+		done = 0;
+		goto l1;
+	}
+	if (sourceForm == destForm) {
+		done = 0;
+		goto l1;
+	}
+	if (combinationRule == 41) {
+		if (destDepth == 32) {
+			rgbComponentAlpha32();
+			affectedL = dx;
+			affectedR = dx + bbW;
+			affectedT = dy;
+			affectedB = dy + bbH;
+			done = 1;
+			goto l1;
+		}
+		if (destDepth == 16) {
+			rgbComponentAlpha16();
+			affectedL = dx;
+			affectedR = dx + bbW;
+			affectedT = dy;
+			affectedB = dy + bbH;
+			done = 1;
+			goto l1;
+		}
+		if (destDepth == 8) {
+			rgbComponentAlpha8();
+			affectedL = dx;
+			affectedR = dx + bbW;
+			affectedT = dy;
+			affectedB = dy + bbH;
+			done = 1;
+			goto l1;
+		}
+		done = 0;
+		goto l1;
+	}
+	if (destDepth < 8) {
+		done = 0;
+		goto l1;
+	}
+	if ((destDepth == 8) && ((cmFlags & ColorMapPresent) == 0)) {
+		done = 0;
+		goto l1;
+	}
+	if (destDepth == 32) {
+		alphaSourceBlendBits32();
+	}
+	if (destDepth == 16) {
+		alphaSourceBlendBits16();
+	}
+	if (destDepth == 8) {
+		alphaSourceBlendBits8();
+	}
+	affectedL = dx;
+	affectedR = dx + bbW;
+	affectedT = dy;
+	affectedB = dy + bbH;
+	done = 1;
+l1:	/* end tryCopyingBitsQuickly */;
+	if (done) {
+		return;
+	}
+
+	/* Choose and perform the actual copy loop. */
+
+	bitCount = 0;
+	/* begin performCopyLoop */
+	/* begin destMaskAndPointerInit */
+	pixPerM1 = destPPW - 1;
+	startBits = destPPW - (dx & pixPerM1);
+	if (destMSB) {
+		mask1 = ((usqInt) AllOnes) >> (32 - (startBits * destDepth));
+	} else {
+		mask1 = AllOnes << (32 - (startBits * destDepth));
+	}
+	endBits = (((dx + bbW) - 1) & pixPerM1) + 1;
+	if (destMSB) {
+		mask2 = AllOnes << (32 - (endBits * destDepth));
+	} else {
+		mask2 = ((usqInt) AllOnes) >> (32 - (endBits * destDepth));
+	}
+	if (bbW < startBits) {
+		mask1 = mask1 & mask2;
+		mask2 = 0;
+		nWords = 1;
+	} else {
+		nWords = (((bbW - startBits) + pixPerM1) / destPPW) + 1;
+	}
+	hDir = (vDir = 1);
+	destIndex = (destBits + (dy * destPitch)) + ((dx / destPPW) * 4);
+	destDelta = (destPitch * vDir) - (4 * (nWords * hDir));
+	if (noSource) {
+		copyLoopNoSource();
+	} else {
+		/* begin checkSourceOverlap */
+		if ((sourceForm == destForm) && (dy >= sy)) {
+			if (dy > sy) {
+				vDir = -1;
+				sy = (sy + bbH) - 1;
+				dy = (dy + bbH) - 1;
+			} else {
+				if ((dy == sy) && (dx > sx)) {
+					hDir = -1;
+					sx = (sx + bbW) - 1;
+					dx = (dx + bbW) - 1;
+					if (nWords > 1) {
+						t = mask1;
+						mask1 = mask2;
+						mask2 = t;
+					}
 				}
-				ungammaLookupTableOop = interpreterProxy->stackObjectValue(0);
-				if (interpreterProxy->isBytes(ungammaLookupTableOop)) {
-					ungammaLookupTable = interpreterProxy->firstIndexableField(ungammaLookupTableOop);
-				}
 			}
+			destIndex = (destBits + (dy * destPitch)) + ((dx / destPPW) * 4);
+			destDelta = (destPitch * vDir) - (4 * (nWords * hDir));
+		}
+		if ((sourceDepth != destDepth) || ((cmFlags != 0) || (sourceMSB != destMSB))) {
+			copyLoopPixMap();
 		} else {
-			if ((interpreterProxy->methodArgumentCount()) == 1) {
-				componentAlphaModeColor = interpreterProxy->stackIntegerValue(0);
-				if (!(!(interpreterProxy->failed()))) {
-					interpreterProxy->primitiveFail();
-					goto l1;
-				}
+			/* begin sourceSkewAndPointerInit */
+			pixPerM11 = destPPW - 1;
+			sxLowBits = sx & pixPerM11;
+			dxLowBits = dx & pixPerM11;
+			if (hDir > 0) {
+				dWid = ((bbW < (destPPW - dxLowBits)) ? bbW : (destPPW - dxLowBits));
+				preload = (sxLowBits + dWid) > pixPerM11;
 			} else {
-				interpreterProxy->primitiveFail();
-				goto l1;
+				dWid = ((bbW < (dxLowBits + 1)) ? bbW : (dxLowBits + 1));
+				preload = ((sxLowBits - dWid) + 1) < 0;
 			}
+			if (sourceMSB) {
+				skew = (sxLowBits - dxLowBits) * destDepth;
+			} else {
+				skew = (dxLowBits - sxLowBits) * destDepth;
+			}
+			if (preload) {
+				if (skew < 0) {
+					skew += 32;
+				} else {
+					skew -= 32;
+				}
+			}
+			sourceIndex = (sourceBits + (sy * sourcePitch)) + ((sx / (32 / sourceDepth)) * 4);
+			sourceDelta = (sourcePitch * vDir) - (4 * (nWords * hDir));
+			if (preload) {
+				sourceDelta -= 4 * hDir;
+			}
+			copyLoop();
 		}
 	}
+# endif  // ENABLE_FAST_BLT
+	
+}
+
+
+/*	Perform the actual copyBits operation using the fast path specialised code; fail some cases by falling back to normal code.
+	Assume: Surfaces have been locked and clipping was performed. */
+
+static sqInt copyBitsFastPathSpecialised(void) {
+	
+# ifdef ENABLE_FAST_BLT  // only for ARM
+
+	/* set the affected area to 0 first */
+
+	affectedL = (affectedR = (affectedT = (affectedB = 0)));
+	copyBitsRule41Test();
+	if (!(!(interpreterProxy->failed()))) {
+		return interpreterProxy->primitiveFail();
+	}
+	if ((combinationRule == 30) || (combinationRule == 31)) {
+
+		/* Check and fetch source alpha parameter for alpha blend */
+
+		if ((interpreterProxy->methodArgumentCount()) == 1) {
+			sourceAlpha = interpreterProxy->stackIntegerValue(0);
+			if (!((!(interpreterProxy->failed())) && ((sourceAlpha >= 0) && (sourceAlpha <= 255)))) {
+				return interpreterProxy->primitiveFail();
+			}
+		} else {
+			return interpreterProxy->primitiveFail();
+		}
+	}
+	if ((combinationRule != 22) && (combinationRule != 32)) {
+
+		/* rules 22 and 32 keep the zeroed area set above; every other rule reports the full destination rectangle */
+
+		affectedL = dx;
+		affectedR = dx + bbW;
+		affectedT = dy;
+		affectedB = dy + bbH;
+	}
+	
+	// fill the operation structure
+	operation_t op;
+	op.combinationRule = combinationRule;
+	op.noSource = noSource;
+	op.src.bits = (void *) sourceBits;
+	op.src.pitch = sourcePitch;
+	op.src.depth = sourceDepth;
+	op.src.msb = sourceMSB;
+	op.src.x = sx;
+	op.src.y = sy;
+	op.dest.bits = (void *) destBits;
+	op.dest.pitch = destPitch;
+	op.dest.depth = destDepth;
+	op.dest.msb = destMSB;
+	op.dest.x = dx;
+	op.dest.y = dy;
+	op.width = bbW;
+	op.height = bbH;
+	op.cmFlags = cmFlags;
+	op.cmShiftTable = (void *) cmShiftTable;
+	op.cmMaskTable = (void *) cmMaskTable;
+	op.cmMask = cmMask;
+	op.cmLookupTable = (void *) cmLookupTable;
+	op.noHalftone = noHalftone;
+	op.halftoneHeight = halftoneHeight;
+	op.halftoneBase = (void *) halftoneBase;
+	if (combinationRule == 30 || combinationRule == 31) {
+		op.opt.sourceAlpha = sourceAlpha;
+	}
+	if (combinationRule == 41) {
+		op.opt.componentAlpha.componentAlphaModeColor = componentAlphaModeColor;
+		op.opt.componentAlpha.componentAlphaModeAlpha = componentAlphaModeAlpha;
+		op.opt.componentAlpha.gammaLookupTable = (void *) gammaLookupTable;
+		op.opt.componentAlpha.ungammaLookupTable = (void *) ungammaLookupTable;
+	}
+	// call the sneaky code
+	copyBitsDispatch(&op);
+# endif  // ENABLE_FAST_BLT
+	
+}
+
+
+/*	Support for the balloon engine. */
+
+EXPORT(sqInt) copyBitsFromtoat(sqInt startX, sqInt stopX, sqInt yValue) {
+	destX = startX;
+	destY = yValue;
+	sourceX = startX;
+	width = stopX - startX;
+	copyBits();
+	/* begin showDisplayBits */
+	interpreterProxy->showDisplayBitsLeftTopRightBottom(destForm, affectedL, affectedT, affectedR, affectedB);
+}
+
+
+/*	Perform the actual copyBits operation.
+	Assume: Surfaces have been locked and clipping was performed. */
+
+static sqInt copyBitsLockedAndClipped(void) {
+	sqInt done;
+	sqInt t;
+	sqInt endBits;
+	sqInt startBits;
+	sqInt pixPerM1;
+	sqInt dxLowBits;
+	sqInt sxLowBits;
+	sqInt dWid;
+	sqInt pixPerM11;
+
+	copyBitsRule41Test();
+	if (!(!(interpreterProxy->failed()))) {
+		return interpreterProxy->primitiveFail();
+	}
 	/* begin tryCopyingBitsQuickly */
 	if (noSource) {
 		done = 0;
-		goto l2;
+		goto l1;
 	}
 	if (!((combinationRule == 34) || (combinationRule == 41))) {
 		done = 0;
-		goto l2;
+		goto l1;
 	}
 	if (!(sourceDepth == 32)) {
 		done = 0;
-		goto l2;
+		goto l1;
 	}
 	if (sourceForm == destForm) {
 		done = 0;
-		goto l2;
+		goto l1;
 	}
 	if (combinationRule == 41) {
 		if (destDepth == 32) {
@@ -1253,7 +1592,7 @@
 			affectedT = dy;
 			affectedB = dy + bbH;
 			done = 1;
-			goto l2;
+			goto l1;
 		}
 		if (destDepth == 16) {
 			rgbComponentAlpha16();
@@ -1262,7 +1601,7 @@
 			affectedT = dy;
 			affectedB = dy + bbH;
 			done = 1;
-			goto l2;
+			goto l1;
 		}
 		if (destDepth == 8) {
 			rgbComponentAlpha8();
@@ -1271,18 +1610,18 @@
 			affectedT = dy;
 			affectedB = dy + bbH;
 			done = 1;
-			goto l2;
+			goto l1;
 		}
 		done = 0;
-		goto l2;
+		goto l1;
 	}
 	if (destDepth < 8) {
 		done = 0;
-		goto l2;
+		goto l1;
 	}
 	if ((destDepth == 8) && ((cmFlags & ColorMapPresent) == 0)) {
 		done = 0;
-		goto l2;
+		goto l1;
 	}
 	if (destDepth == 32) {
 		alphaSourceBlendBits32();
@@ -1298,22 +1637,26 @@
 	affectedT = dy;
 	affectedB = dy + bbH;
 	done = 1;
-l2:	/* end tryCopyingBitsQuickly */;
+l1:	/* end tryCopyingBitsQuickly */;
 	if (done) {
-		goto l1;
+		return null;
 	}
 	if ((combinationRule == 30) || (combinationRule == 31)) {
+
+		/* Check and fetch source alpha parameter for alpha blend */
+
 		if ((interpreterProxy->methodArgumentCount()) == 1) {
 			sourceAlpha = interpreterProxy->stackIntegerValue(0);
 			if (!((!(interpreterProxy->failed())) && ((sourceAlpha >= 0) && (sourceAlpha <= 255)))) {
-				interpreterProxy->primitiveFail();
-				goto l1;
+				return interpreterProxy->primitiveFail();
 			}
 		} else {
-			interpreterProxy->primitiveFail();
-			goto l1;
+			return interpreterProxy->primitiveFail();
 		}
 	}
+
+	/* Choose and perform the actual copy loop. */
+
 	bitCount = 0;
 	/* begin performCopyLoop */
 	/* begin destMaskAndPointerInit */
@@ -1399,6 +1742,9 @@
 		}
 	}
 	if ((combinationRule == 22) || (combinationRule == 32)) {
+
+		/* zero width and height; return the count */
+
 		affectedL = (affectedR = (affectedT = (affectedB = 0)));
 	}
 	if (hDir > 0) {
@@ -1415,56 +1761,88 @@
 		affectedT = (dy - bbH) + 1;
 		affectedB = dy + 1;
 	}
-l1:	/* end copyBitsLockedAndClipped */;
-	unlockSurfaces();
 }
 
 
-/*	Support for the balloon engine. */
+/*	Test possible use of rule 41, rgbComponentAlpha:with: Sets up the component alpha variables; bad arguments are signalled via primitiveFail(), which callers detect with failed() */
 
-EXPORT(sqInt) copyBitsFromtoat(sqInt startX, sqInt stopX, sqInt yValue) {
-	destX = startX;
-	destY = yValue;
-	sourceX = startX;
-	width = stopX - startX;
-	copyBits();
-	/* begin showDisplayBits */
-	interpreterProxy->showDisplayBitsLeftTopRightBottom(destForm, affectedL, affectedT, affectedR, affectedB);
+static sqInt copyBitsRule41Test(void) {
+	sqInt ungammaLookupTableOop;
+	sqInt gammaLookupTableOop;
+
+	if (combinationRule == 41) {
+
+		/* fetch the forecolor into componentAlphaModeColor. */
+
+		componentAlphaModeAlpha = 255;
+		componentAlphaModeColor = 16777215;
+		gammaLookupTable = null;
+		ungammaLookupTable = null;
+		if ((interpreterProxy->methodArgumentCount()) >= 2) {
+			componentAlphaModeAlpha = interpreterProxy->stackIntegerValue((interpreterProxy->methodArgumentCount()) - 2);
+			if (!(!(interpreterProxy->failed()))) {
+				return interpreterProxy->primitiveFail();
+			}
+			componentAlphaModeColor = interpreterProxy->stackIntegerValue((interpreterProxy->methodArgumentCount()) - 1);
+			if (!(!(interpreterProxy->failed()))) {
+				return interpreterProxy->primitiveFail();
+			}
+			if ((interpreterProxy->methodArgumentCount()) == 4) {
+				gammaLookupTableOop = interpreterProxy->stackObjectValue(1);
+				if (interpreterProxy->isBytes(gammaLookupTableOop)) {
+					gammaLookupTable = interpreterProxy->firstIndexableField(gammaLookupTableOop);
+				}
+				ungammaLookupTableOop = interpreterProxy->stackObjectValue(0);
+				if (interpreterProxy->isBytes(ungammaLookupTableOop)) {
+					ungammaLookupTable = interpreterProxy->firstIndexableField(ungammaLookupTableOop);
+				}
+			}
+		} else {
+			if ((interpreterProxy->methodArgumentCount()) == 1) {
+				componentAlphaModeColor = interpreterProxy->stackIntegerValue(0);
+				if (!(!(interpreterProxy->failed()))) {
+					return interpreterProxy->primitiveFail();
+				}
+			} else {
+				return interpreterProxy->primitiveFail();
+			}
+		}
+	}
 }
 
 
 /*	This version of the inner loop assumes noSource = false. */
 
 static sqInt copyLoop(void) {
-    sqInt destWord;
-    sqInt hInc;
-    sqInt halftoneWord;
-    sqInt i;
-    sqInt (*mergeFnwith)(sqInt, sqInt);
-    sqInt mergeWord;
-    sqInt notSkewMask;
-    sqInt prevWord;
-    sqInt skewMask;
-    sqInt skewWord;
-    sqInt thisWord;
-    sqInt unskew;
-    sqInt word;
-    sqInt y;
-    sqInt idx;
-    sqInt idx1;
-    sqInt idx2;
-    sqInt idx3;
-    sqInt idx4;
-    sqInt idx5;
-    sqInt idx6;
-    sqInt idx7;
-    sqInt idx8;
-    sqInt idx9;
-    sqInt idx10;
-    sqInt idx11;
-    sqInt idx12;
-    sqInt idx13;
-    sqInt idx14;
+	sqInt mergeWord;
+	sqInt skewWord;
+	sqInt skewMask;
+	sqInt halftoneWord;
+	sqInt unskew;
+	sqInt (*mergeFnwith)(sqInt, sqInt);
+	sqInt hInc;
+	sqInt destWord;
+	sqInt word;
+	sqInt prevWord;
+	sqInt y;
+	sqInt i;
+	sqInt thisWord;
+	sqInt notSkewMask;
+	sqInt idx;
+	sqInt idx1;
+	sqInt idx2;
+	sqInt idx3;
+	sqInt idx4;
+	sqInt idx5;
+	sqInt idx6;
+	sqInt idx7;
+	sqInt idx8;
+	sqInt idx9;
+	sqInt idx10;
+	sqInt idx11;
+	sqInt idx12;
+	sqInt idx13;
+	sqInt idx14;
 
 	mergeFnwith = ((sqInt (*)(sqInt, sqInt)) (opTable[combinationRule + 1]));
 	mergeFnwith;
@@ -1498,15 +1876,20 @@
 	}
 	y = dy;
 	for (i = 1; i <= bbH; i += 1) {
+
+		/* here is the vertical loop */
+
 		if (halftoneHeight > 1) {
 
-			/* here is the vertical loop */
 			/* Otherwise, its always the same */
 
 			halftoneWord = long32At(halftoneBase + ((y % halftoneHeight) * 4));
 			y += vDir;
 		}
 		if (preload) {
+
+			/* load the 64-bit shifter */
+
 			/* begin srcLongAt: */
 			idx = sourceIndex;
 			prevWord = long32At(idx);
@@ -1539,7 +1922,13 @@
 		destMask = AllOnes;
 		if (combinationRule == 3) {
 			if ((skew == 0) && (halftoneWord == AllOnes)) {
+
+				/* Very special inner loop for STORE mode with no skew -- just move words */
+
 				if (hDir == -1) {
+
+					/* Woeful patch: revert to older code for hDir = -1 */
+
 					for (word = 2; word <= (nWords - 1); word += 1) {
 						/* begin srcLongAt: */
 						idx1 = sourceIndex;
@@ -1552,6 +1941,9 @@
 					}
 				} else {
 					for (word = 2; word <= (nWords - 1); word += 1) {
+
+						/* Note loop starts with prevWord loaded (due to preload) */
+
 						/* begin dstLongAt:put: */
 						idx3 = destIndex;
 						long32Atput(idx3, prevWord);
@@ -1563,6 +1955,9 @@
 					}
 				}
 			} else {
+
+				/* Special inner loop for STORE mode -- no need to call merge */
+
 				for (word = 2; word <= (nWords - 1); word += 1) {
 					/* begin srcLongAt: */
 					idx5 = sourceIndex;
@@ -1581,6 +1976,9 @@
 			}
 		} else {
 			for (word = 2; word <= (nWords - 1); word += 1) {
+
+				/* Normal inner loop does merge: */
+
 				/* begin srcLongAt: */
 				idx7 = sourceIndex;
 				thisWord = long32At(idx7);
@@ -1627,28 +2025,28 @@
 	positive, and perload and skew are unused */
 
 static sqInt copyLoopNoSource(void) {
-    sqInt destWord;
-    sqInt halftoneWord;
-    sqInt i;
-    sqInt (*mergeFnwith)(sqInt, sqInt);
-    sqInt mergeWord;
-    sqInt word;
-    sqInt idx;
-    sqInt idx1;
-    sqInt idx2;
-    sqInt idx3;
-    sqInt idx4;
-    sqInt idx5;
-    sqInt idx6;
-    sqInt idx7;
+	sqInt mergeWord;
+	sqInt halftoneWord;
+	sqInt (*mergeFnwith)(sqInt, sqInt);
+	sqInt destWord;
+	sqInt word;
+	sqInt i;
+	sqInt idx;
+	sqInt idx1;
+	sqInt idx2;
+	sqInt idx3;
+	sqInt idx4;
+	sqInt idx5;
+	sqInt idx6;
+	sqInt idx7;
 
 	mergeFnwith = ((sqInt (*)(sqInt, sqInt)) (opTable[combinationRule + 1]));
 	mergeFnwith;
 	for (i = 1; i <= bbH; i += 1) {
-		if (noHalftone) {
 
-			/* here is the vertical loop */
+		/* here is the vertical loop */
 
+		if (noHalftone) {
 			halftoneWord = AllOnes;
 		} else {
 			/* begin halftoneAt: */
@@ -1681,7 +2079,13 @@
 				destIndex += 4;
 			}
 		} else {
+
+			/* Normal inner loop does merge */
+
 			for (word = 2; word <= (nWords - 1); word += 1) {
+
+				/* Normal inner loop does merge */
+
 				/* begin dstLongAt: */
 				idx2 = destIndex;
 				destWord = long32At(idx2);
@@ -1722,43 +2126,43 @@
 	to inline pickSourcePixels we could optimize the loop instead. */
 
 static sqInt copyLoopPixMap(void) {
-    sqInt destPixMask;
-    sqInt destWord;
-    sqInt dstShift;
-    sqInt dstShiftInc;
-    sqInt dstShiftLeft;
-    sqInt endBits;
-    sqInt halftoneWord;
-    sqInt i;
-    sqInt mapperFlags;
-    sqInt (*mergeFnwith)(sqInt, sqInt);
-    sqInt mergeWord;
-    sqInt nPix;
-    sqInt nSourceIncs;
-    sqInt scrStartBits;
-    sqInt skewWord;
-    sqInt sourcePixMask;
-    sqInt srcShift;
-    sqInt srcShiftInc;
-    sqInt startBits;
-    sqInt words;
-    sqInt idx;
-    sqInt idx1;
-    sqInt value;
-    sqInt idx2;
-    sqInt idx3;
-    sqInt destPix;
-    sqInt destWord1;
-    sqInt dstShift1;
-    sqInt nPix1;
-    sqInt sourcePix;
-    sqInt sourceWord;
-    sqInt srcShift1;
-    sqInt pv;
-    sqInt idx4;
-    sqInt val;
-    sqInt idx11;
-    sqInt idx21;
+	sqInt mapperFlags;
+	sqInt srcShiftInc;
+	sqInt dstShiftLeft;
+	sqInt sourcePixMask;
+	sqInt nSourceIncs;
+	sqInt skewWord;
+	sqInt words;
+	sqInt destWord;
+	sqInt startBits;
+	sqInt (*mergeFnwith)(sqInt, sqInt);
+	sqInt dstShift;
+	sqInt i;
+	sqInt halftoneWord;
+	sqInt mergeWord;
+	sqInt destPixMask;
+	sqInt dstShiftInc;
+	sqInt srcShift;
+	sqInt endBits;
+	sqInt nPix;
+	sqInt scrStartBits;
+	sqInt idx;
+	sqInt idx1;
+	sqInt value;
+	sqInt idx2;
+	sqInt idx3;
+	sqInt sourcePix;
+	sqInt srcShift1;
+	sqInt sourceWord;
+	sqInt dstShift1;
+	sqInt destPix;
+	sqInt nPix1;
+	sqInt destWord1;
+	sqInt pv;
+	sqInt idx4;
+	sqInt val;
+	sqInt idx11;
+	sqInt idx21;
 
 	mergeFnwith = ((sqInt (*)(sqInt, sqInt)) (opTable[combinationRule + 1]));
 	mergeFnwith;
@@ -1797,11 +2201,11 @@
 		dstShiftLeft = 32 - destDepth;
 	}
 	for (i = 1; i <= bbH; i += 1) {
-		if (noHalftone) {
 
-			/* here is the vertical loop */
-			/* *** is it possible at all that noHalftone == false? *** */
+		/* here is the vertical loop */
+		/* *** is it possible at all that noHalftone == false? *** */
 
+		if (noHalftone) {
 			halftoneWord = AllOnes;
 		} else {
 			/* begin halftoneAt: */
@@ -1817,6 +2221,9 @@
 		nPix = startBits;
 		words = nWords;
 		do {
+
+			/* pick up the word */
+
 			/* begin pickSourcePixels:flags:srcMask:destMask:srcShiftInc:dstShiftInc: */
 			/* begin srcLongAt: */
 			idx21 = sourceIndex;
@@ -1890,6 +2297,9 @@
 				value = destMask & mergeWord;
 				long32Atput(idx1, value);
 			} else {
+
+				/* General version using dest masking */
+
 				/* begin dstLongAt: */
 				idx2 = destIndex;
 				destWord = long32At(idx2);
@@ -1938,7 +2348,7 @@
 	]. */
 
 static unsigned int * default8To32Table(void) {
-    static unsigned int theTable[256] = { 
+	static unsigned int theTable[256] = { 
 0x0, 0xFF000001, 0xFFFFFFFF, 0xFF808080, 0xFFFF0000, 0xFF00FF00, 0xFF0000FF, 0xFF00FFFF, 
 0xFFFFFF00, 0xFFFF00FF, 0xFF202020, 0xFF404040, 0xFF606060, 0xFF9F9F9F, 0xFFBFBFBF, 0xFFDFDFDF, 
 0xFF080808, 0xFF101010, 0xFF181818, 0xFF282828, 0xFF303030, 0xFF383838, 0xFF484848, 0xFF505050, 
@@ -1987,8 +2397,8 @@
 /*	Return the integer value of the given field of the given object. If the field contains a Float, truncate it and return its integral part. Fail if the given field does not contain a small integer or Float, or if the truncated Float is out of the range of small integers. */
 
 static sqInt fetchIntOrFloatofObject(sqInt fieldIndex, sqInt objectPointer) {
-    sqInt fieldOop;
-    double  floatValue;
+	double  floatValue;
+	sqInt fieldOop;
 
 	fieldOop = interpreterProxy->fetchPointerofObject(fieldIndex, objectPointer);
 	if ((fieldOop & 1)) {
@@ -2006,8 +2416,8 @@
 /*	Return the integer value of the given field of the given object. If the field contains a Float, truncate it and return its integral part. Fail if the given field does not contain a small integer or Float, or if the truncated Float is out of the range of small integers. */
 
 static sqInt fetchIntOrFloatofObjectifNil(sqInt fieldIndex, sqInt objectPointer, sqInt defaultValue) {
-    sqInt fieldOop;
-    double  floatValue;
+	double  floatValue;
+	sqInt fieldOop;
 
 	fieldOop = interpreterProxy->fetchPointerofObject(fieldIndex, objectPointer);
 	if ((fieldOop & 1)) {
@@ -2100,13 +2510,13 @@
 }
 
 static sqInt initDither8Lookup(void) {
-    sqInt b;
-    sqInt t;
-    sqInt value;
-    sqInt out;
-    sqInt pv;
-    sqInt threshold;
-    sqInt value1;
+	sqInt t;
+	sqInt b;
+	sqInt value;
+	sqInt pv;
+	sqInt threshold;
+	sqInt value1;
+	sqInt out;
 
 	for (b = 0; b <= 255; b += 1) {
 		for (t = 0; t <= 15; t += 1) {
@@ -2144,6 +2554,11 @@
 EXPORT(sqInt) initialiseModule(void) {
 	initBBOpTable();
 	initDither8Lookup();
+	
+# ifdef ENABLE_FAST_BLT  // init the fastpath lists
+	initialiseCopyBits();
+# endif  // ENABLE_FAST_BLT
+	
 	return 1;
 }
 
@@ -2174,18 +2589,18 @@
 		-- once it works! */
 
 static sqInt loadBitBltFromwarping(sqInt bbObj, sqInt aBool) {
-    sqInt ok;
-    sqInt formPointer;
-    sqInt formPointer1;
-    sqInt destBitsSize;
-    sqInt sourceBitsSize;
-    sqInt cmOop;
-    sqInt cmSize;
-    sqInt oldStyle;
-    sqInt oop;
-    sqInt halftoneBits;
-    sqInt mapOop;
-    sqInt mapOop1;
+	sqInt ok;
+	sqInt formPointer;
+	sqInt formPointer1;
+	sqInt destBitsSize;
+	sqInt sourceBitsSize;
+	sqInt oop;
+	sqInt cmOop;
+	sqInt cmSize;
+	sqInt oldStyle;
+	sqInt halftoneBits;
+	sqInt mapOop;
+	sqInt mapOop1;
 
 	bitBltOop = bbObj;
 	isWarping = aBool;
@@ -2540,16 +2955,19 @@
 	 */
 
 static sqInt lockSurfaces(void) {
-    sqInt b;
-    sqInt destHandle;
-    sqInt (*fn)(sqInt, sqInt*, sqInt, sqInt, sqInt, sqInt);
-    sqInt l;
-    sqInt r;
-    sqInt sourceHandle;
-    sqInt t;
+	sqInt destHandle;
+	sqInt sourceHandle;
+	sqInt t;
+	sqInt (*fn)(sqInt, sqInt*, sqInt, sqInt, sqInt, sqInt);
+	sqInt r;
+	sqInt b;
+	sqInt l;
 
 	hasSurfaceLock = 0;
 	if (destBits == 0) {
+
+		/* Blitting *to* OS surface */
+
 		if (lockSurfaceFn == 0) {
 			if (!(loadSurfacePlugin())) {
 				return null;
@@ -2563,6 +2981,10 @@
 
 			sourceHandle = interpreterProxy->fetchIntegerofObject(FormBitsIndex, sourceForm);
 			if (sourceHandle == destHandle) {
+
+				/* If we have overlapping source/dest we lock the entire area
+				so that there is only one area transmitted */
+
 				if (isWarping) {
 
 					/* Otherwise use overlapping area */
@@ -2573,6 +2995,9 @@
 					b = (((sy < sy) ? sy : sy)) + bbH;
 					sourceBits = fn(sourceHandle, &sourcePitch, l, t, r-l, b-t);
 				} else {
+
+					/* When warping we always need the entire surface for the source */
+
 					sourceBits = fn(sourceHandle, &sourcePitch, 0,0, sourceWidth, sourceHeight);
 				}
 				destBits = sourceBits;
@@ -2614,6 +3039,9 @@
 
 EXPORT(sqInt) moduleUnloaded(char *aModuleName) {
 	if ((strcmp(aModuleName, "SurfacePlugin")) == 0) {
+
+		/* The surface plugin just shut down. How nasty. */
+
 		querySurfaceFn = (lockSurfaceFn = (unlockSurfaceFn = 0));
 	}
 }
@@ -2624,9 +3052,9 @@
 	Used for erasing, eg, brush shapes prior to ORing in a color */
 
 static sqInt partitionedANDtonBitsnPartitions(sqInt word1, sqInt word2, sqInt nBits, sqInt nParts) {
-    sqInt i;
-    sqInt mask;
-    sqInt result;
+	sqInt result;
+	sqInt i;
+	sqInt mask;
 
 
 	/* partition mask starts at the right */
@@ -2653,11 +3081,11 @@
 	words as unsigned int in those cases where comparisions are done (jmv) */
 
 static sqInt partitionedAddtonBitsnPartitions(unsigned int word1, unsigned int word2, sqInt nBits, sqInt nParts) {
-    sqInt i;
-    unsigned int mask;
-    unsigned int maskedWord1;
-    unsigned int result;
-    unsigned int sum;
+	unsigned int result;
+	unsigned int sum;
+	unsigned int maskedWord1;
+	sqInt i;
+	unsigned int mask;
 
 
 	/* partition mask starts at the right */
@@ -2690,9 +3118,9 @@
 	words as unsigned int in those cases where comparisions are done (jmv) */
 
 static sqInt partitionedMaxwithnBitsnPartitions(unsigned int word1, unsigned int word2, sqInt nBits, sqInt nParts) {
-    sqInt i;
-    unsigned int mask;
-    unsigned int result;
+	unsigned int result;
+	sqInt i;
+	unsigned int mask;
 
 
 	/* partition mask starts at the right */
@@ -2716,9 +3144,9 @@
 	words as unsigned int in those cases where comparisions are done (jmv) */
 
 static sqInt partitionedMinwithnBitsnPartitions(unsigned int word1, unsigned int word2, sqInt nBits, sqInt nParts) {
-    sqInt i;
-    unsigned int mask;
-    unsigned int result;
+	unsigned int result;
+	sqInt i;
+	unsigned int mask;
 
 
 	/* partition mask starts at the right */
@@ -2744,10 +3172,10 @@
 	always be zero (jmv) */
 
 static sqInt partitionedMulwithnBitsnPartitions(sqInt word1, sqInt word2, sqInt nBits, sqInt nParts) {
-    sqInt dMask;
-    sqInt product;
-    sqInt result;
-    sqInt sMask;
+	sqInt dMask;
+	sqInt result;
+	sqInt product;
+	sqInt sMask;
 
 
 	/* partition mask starts at the right */
@@ -2784,11 +3212,11 @@
 	words as unsigned int in those cases where comparisions are done (jmv) */
 
 static sqInt partitionedSubfromnBitsnPartitions(unsigned int word1, unsigned int word2, sqInt nBits, sqInt nParts) {
-    sqInt i;
-    unsigned int mask;
-    unsigned int p1;
-    unsigned int p2;
-    unsigned int result;
+	unsigned int p2;
+	unsigned int result;
+	unsigned int p1;
+	sqInt i;
+	unsigned int mask;
 
 
 	/* partition mask starts at the right */
@@ -2818,11 +3246,11 @@
 /*	Clear all pixels in destinationWord for which the pixels of sourceWord have the same values. Used to clear areas of some constant color to zero. */
 
 static sqInt pixClearwith(sqInt sourceWord, sqInt destinationWord) {
-    sqInt i;
-    sqInt mask;
-    sqInt nBits;
-    sqInt pv;
-    sqInt result;
+	sqInt pv;
+	sqInt nBits;
+	sqInt result;
+	sqInt i;
+	sqInt mask;
 
 	if (destDepth == 32) {
 		if (sourceWord == destinationWord) {
@@ -2852,9 +3280,9 @@
 }
 
 static sqInt pixMaskwith(sqInt sourceWord, sqInt destinationWord) {
-    sqInt i;
-    sqInt mask;
-    sqInt result;
+	sqInt result;
+	sqInt i;
+	sqInt mask;
 
 	/* begin partitionedAND:to:nBits:nPartitions: */
 	mask = maskTable[destDepth];
@@ -2879,11 +3307,11 @@
 /*	Swap the pixels in destWord */
 
 static sqInt pixSwapwith(sqInt sourceWord, sqInt destWord) {
-    sqInt highMask;
-    sqInt i;
-    sqInt lowMask;
-    sqInt result;
-    sqInt shift;
+	sqInt result;
+	sqInt shift;
+	sqInt lowMask;
+	sqInt highMask;
+	sqInt i;
 
 	if (destPPW == 1) {
 		return destWord;
@@ -2915,7 +3343,7 @@
 /*	Invoke the copyBits primitive. If the destination is the display, then copy it to the screen. */
 
 EXPORT(sqInt) primitiveCopyBits(void) {
-    sqInt rcvr;
+	sqInt rcvr;
 
 	rcvr = interpreterProxy->stackValue(interpreterProxy->methodArgumentCount());
 	if (!(loadBitBltFromwarping(rcvr, 0))) {
@@ -2938,23 +3366,23 @@
 }
 
 EXPORT(sqInt) primitiveDisplayString(void) {
-    sqInt ascii;
-    sqInt bbObj;
-    sqInt charIndex;
-    sqInt glyphIndex;
-    sqInt glyphMap;
-    sqInt kernDelta;
-    sqInt left;
-    sqInt maxGlyph;
-    sqInt quickBlt;
-    char *sourcePtr;
-    sqInt sourceString;
-    sqInt startIndex;
-    sqInt stopIndex;
-    sqInt xTable;
-    sqInt endBits;
-    sqInt pixPerM1;
-    sqInt startBits;
+	sqInt charIndex;
+	char *sourcePtr;
+	sqInt stopIndex;
+	sqInt bbObj;
+	sqInt xTable;
+	sqInt maxGlyph;
+	sqInt quickBlt;
+	sqInt glyphIndex;
+	sqInt glyphMap;
+	sqInt left;
+	sqInt kernDelta;
+	sqInt startIndex;
+	sqInt ascii;
+	sqInt sourceString;
+	sqInt endBits;
+	sqInt startBits;
+	sqInt pixPerM1;
 
 	if (!((interpreterProxy->methodArgumentCount()) == 6)) {
 		return interpreterProxy->primitiveFail();
@@ -2986,6 +3414,9 @@
 		return interpreterProxy->primitiveFail();
 	}
 	if ((combinationRule == 30) || (combinationRule == 31)) {
+
+		/* needs extra source alpha */
+
 		return interpreterProxy->primitiveFail();
 	}
 	quickBlt = (destBits != 0) && ((sourceBits != 0) && ((noSource == 0) && ((sourceForm != destForm) && ((cmFlags != 0) || ((sourceMSB != destMSB) || (sourceDepth != destDepth))))));
@@ -3053,19 +3484,19 @@
 /*	Invoke the line drawing primitive. */
 
 EXPORT(sqInt) primitiveDrawLoop(void) {
-    sqInt rcvr;
-    sqInt xDelta;
-    sqInt yDelta;
-    sqInt P;
-    sqInt affB;
-    sqInt affL;
-    sqInt affR;
-    sqInt affT;
-    sqInt dx1;
-    sqInt dy1;
-    sqInt i;
-    sqInt px;
-    sqInt py;
+	sqInt yDelta;
+	sqInt rcvr;
+	sqInt xDelta;
+	sqInt P;
+	sqInt affT;
+	sqInt dx1;
+	sqInt px;
+	sqInt affR;
+	sqInt affL;
+	sqInt py;
+	sqInt i;
+	sqInt affB;
+	sqInt dy1;
 
 	rcvr = interpreterProxy->stackValue(2);
 	xDelta = interpreterProxy->stackIntegerValue(1);
@@ -3176,14 +3607,108 @@
 }
 
 
+/*	Primitive: answer the value of the single pixel at (x, y) of the receiver, read from the stack as rcvr (Form), x, y.
+	It does not handle LSB bitmaps right now; a depth outside 1..32 (negative, zero or too large) fails the primitive instead of dividing by zero.
+	If x or y are < 0, answer 0 to indicate transparent (cf BitBlt>bitPeekerFromForm: usage).
+	Likewise if x >= width or y >= height.
+	Fail if the rcvr doesn't seem to be a Form (not pointers, or fewer than 4 slots), or x|y seem wrong. Always returns null; the answer is pushed on the stack. */
+
+EXPORT(sqInt) primitivePixelValueAt(void) {
+	sqInt pixel;
+	sqInt rcvr;
+	sqInt shift;
+	sqInt depth;
+	sqInt bitmap;
+	sqInt ppW;
+	sqInt word;
+	sqInt stride;
+	sqInt mask;
+	sqInt xVal;
+	sqInt yVal;
+	sqInt _return_value;
+
+	xVal = interpreterProxy->stackIntegerValue(1);
+	yVal = interpreterProxy->stackIntegerValue(0);
+	rcvr = interpreterProxy->stackValue(2);
+	if (interpreterProxy->failed()) {
+		return null;
+	}
+	if ((xVal < 0) || (yVal < 0)) {
+		_return_value = ((0 << 1) | 1);	/* the value 0 shifted left with the low tag bit set */
+		if (interpreterProxy->failed()) {
+			return null;
+		}
+		interpreterProxy->popthenPush(3, _return_value);
+		return null;
+	}
+	rcvr = interpreterProxy->stackValue(interpreterProxy->methodArgumentCount());	/* presumably the same oop as stackValue(2) above */
+	if (!((interpreterProxy->isPointers(rcvr)) && ((interpreterProxy->slotSizeOf(rcvr)) >= 4))) {
+		interpreterProxy->primitiveFail();
+		return null;
+	}
+	bitmap = interpreterProxy->fetchPointerofObject(FormBitsIndex, rcvr);
+	width = interpreterProxy->fetchIntegerofObject(FormWidthIndex, rcvr);	/* NOTE(review): width/height are not locals here - presumably file-scope variables that this call overwrites; confirm */
+	height = interpreterProxy->fetchIntegerofObject(FormHeightIndex, rcvr);
+
+	/* if width/height/depth are not integer, fail */
+
+	depth = interpreterProxy->fetchIntegerofObject(FormDepthIndex, rcvr);
+	if (interpreterProxy->failed()) {
+		return null;
+	}
+	if ((xVal >= width) || (yVal >= height)) {
+		_return_value = ((0 << 1) | 1);	/* outside the form: answer 0, as for negative coordinates */
+		if (interpreterProxy->failed()) {
+			return null;
+		}
+		interpreterProxy->popthenPush(3, _return_value);
+		return null;
+	}
+	if ((depth < 1) || (depth > 32)) {	/* was (depth < 0): 0 or > 32 would reach a division by zero below */
+		interpreterProxy->primitiveFail();
+		return null;
+	}
+
+	/* pixels in each word; depth is within 1..32 here, so this cannot divide by zero and ppW >= 1 */
+
+	ppW = 32 / depth;
+
+	/* how many words per row of pixels (width rounded up to a whole number of words) */
+
+	stride = (width + (ppW - 1)) / ppW;
+
+	/* load the word that contains our target; NOTE(review): the index is not checked against the size of bitmap here */
+
+	word = interpreterProxy->fetchLong32ofObject((yVal * stride) + (xVal / ppW), bitmap);
+
+	/* make a mask to isolate the pixel within that word (the low depth bits set) */
+
+	mask = ((usqInt) 4294967295U) >> (32 - depth);
+
+	/* this is the tricky MSB part - we mask the xVal to find how far into the word we need (a true modulo only when ppW is a power of two), then add 1 for the pixel we're looking for, then * depth to get the bit shift */
+
+	shift = 32 - (((xVal & (ppW - 1)) + 1) * depth);
+
+	/* shift the target pixel down to the low bits and mask off its neighbours */
+
+	pixel = (((usqInt) word) >> shift) & mask;
+	_return_value = interpreterProxy->positive32BitIntegerFor(pixel);
+	if (interpreterProxy->failed()) {
+		return null;
+	}
+	interpreterProxy->popthenPush(3, _return_value);
+	return null;
+}
+
+
 /*	Invoke the warpBits primitive. If the destination is the display, then copy it to the screen. */
 
 EXPORT(sqInt) primitiveWarpBits(void) {
-    sqInt rcvr;
-    sqInt ns;
-    sqInt endBits;
-    sqInt pixPerM1;
-    sqInt startBits;
+	sqInt rcvr;
+	sqInt ns;
+	sqInt endBits;
+	sqInt startBits;
+	sqInt pixPerM1;
 
 	rcvr = interpreterProxy->stackValue(interpreterProxy->methodArgumentCount());
 	if (!(loadBitBltFromwarping(rcvr, 1))) {
@@ -3292,11 +3817,20 @@
 
 static sqInt rgbAddwith(sqInt sourceWord, sqInt destinationWord) {
 	if (destDepth < 16) {
+
+		/* Add each pixel separately */
+
 		return partitionedAddtonBitsnPartitions(sourceWord, destinationWord, destDepth, destPPW);
 	}
 	if (destDepth == 16) {
+
+		/* Add RGB components of each pixel separately */
+
 		return (partitionedAddtonBitsnPartitions(sourceWord, destinationWord, 5, 3)) + ((partitionedAddtonBitsnPartitions(((usqInt) sourceWord) >> 16, ((usqInt) destinationWord) >> 16, 5, 3)) << 16);
 	} else {
+
+		/* Add RGBA components of the pixel separately */
+
 		return partitionedAddtonBitsnPartitions(sourceWord, destinationWord, 8, 4);
 	}
 }
@@ -3311,22 +3845,22 @@
 /*	This particular method should be optimized in itself */
 
 static sqInt rgbComponentAlpha16(void) {
-    sqInt deltaX;
-    sqInt deltaY;
-    sqInt destWord;
-    sqInt ditherBase;
-    sqInt ditherIndex;
-    sqInt ditherThreshold;
-    sqInt dstIndex;
-    sqInt dstMask;
-    sqInt dstY;
-    sqInt sourceWord;
-    sqInt srcAlpha;
-    sqInt srcIndex;
-    sqInt srcShift;
-    sqInt srcY;
-    sqInt addThreshold;
-    sqInt dstValue;
+	sqInt ditherBase;
+	sqInt ditherThreshold;
+	sqInt srcShift;
+	sqInt sourceWord;
+	sqInt srcIndex;
+	sqInt deltaX;
+	sqInt dstIndex;
+	sqInt srcAlpha;
+	sqInt dstMask;
+	sqInt deltaY;
+	sqInt srcY;
+	sqInt destWord;
+	sqInt dstY;
+	sqInt ditherIndex;
+	sqInt addThreshold;
+	sqInt dstValue;
 
 
 	/* So we can pre-decrement */
@@ -3428,15 +3962,15 @@
 	 */
 
 static sqInt rgbComponentAlpha32(void) {
-    register int deltaX;
-    sqInt deltaY;
-    sqInt destWord;
-    register int dstIndex;
-    sqInt dstY;
-    register int sourceWord;
-    sqInt srcAlpha;
-    register int srcIndex;
-    sqInt srcY;
+	register int sourceWord;
+	register int srcIndex;
+	register int deltaX;
+	register int dstIndex;
+	sqInt srcAlpha;
+	sqInt deltaY;
+	sqInt srcY;
+	sqInt destWord;
+	sqInt dstY;
 
 
 	/* This particular method should be optimized in itself */
@@ -3446,6 +3980,8 @@
 	(good to know on an Intel architecture) but then the increments
 	would be different between ST code and C code so must hope the
 	compiler notices what happens (MS Visual C does) */
+
+
 	/* So we can pre-decrement */
 
 	deltaY = bbH + 1;
@@ -3511,21 +4047,21 @@
 /*	Do NOT inline this into optimized loops */
 
 static sqInt rgbComponentAlpha32with(sqInt sourceWord, sqInt destinationWord) {
-    sqInt a;
-    sqInt aA;
-    sqInt aB;
-    sqInt aG;
-    sqInt aR;
-    sqInt alpha;
-    sqInt answer;
-    sqInt b;
-    sqInt d;
-    sqInt dstMask;
-    sqInt g;
-    sqInt r;
-    sqInt s;
-    sqInt srcAlpha;
-    sqInt srcColor;
+	sqInt g;
+	sqInt srcColor;
+	sqInt aG;
+	sqInt d;
+	sqInt a;
+	sqInt aA;
+	sqInt aR;
+	sqInt dstMask;
+	sqInt srcAlpha;
+	sqInt r;
+	sqInt b;
+	sqInt aB;
+	sqInt alpha;
+	sqInt answer;
+	sqInt s;
 
 	alpha = sourceWord;
 	if (alpha == 0) {
@@ -3613,23 +4149,23 @@
 	 */
 
 static sqInt rgbComponentAlpha8(void) {
-    sqInt adjust;
-    sqInt deltaX;
-    sqInt deltaY;
-    sqInt destWord;
-    sqInt dstIndex;
-    sqInt dstMask;
-    sqInt dstY;
-    sqInt mapperFlags;
-    unsigned int *mappingTable;
-    sqInt sourceWord;
-    sqInt srcAlpha;
-    sqInt srcIndex;
-    sqInt srcShift;
-    sqInt srcY;
-    sqInt pv;
-    sqInt val;
-    sqInt dstValue;
+	sqInt srcShift;
+	sqInt sourceWord;
+	sqInt srcIndex;
+	sqInt deltaX;
+	unsigned int *mappingTable;
+	sqInt dstIndex;
+	sqInt adjust;
+	sqInt mapperFlags;
+	sqInt srcAlpha;
+	sqInt dstMask;
+	sqInt deltaY;
+	sqInt srcY;
+	sqInt destWord;
+	sqInt dstY;
+	sqInt pv;
+	sqInt val;
+	sqInt dstValue;
 
 
 	/* This particular method should be optimized in itself */
@@ -3676,9 +4212,11 @@
 			srcAlpha = sourceWord & 16777215;
 			srcAlpha = (((((usqInt) srcAlpha) >> 16) + ((((usqInt) srcAlpha) >> 8) & 255)) + (srcAlpha & 255)) / 3;
 			if (srcAlpha > 31) {
+
+				/* Everything below 31 is transparent */
+
 				if (srcAlpha > 224) {
 
-					/* Everything below 31 is transparent */
 					/* treat everything above 224 as opaque */
 
 					sourceWord = 4294967295U;
@@ -3761,19 +4299,19 @@
 /*	Do NOT inline this into optimized loops */
 
 static sqInt rgbComponentAlphawith(sqInt sourceWord, sqInt destinationWord) {
-    sqInt alpha;
-    sqInt nBits;
-    sqInt nParts;
-    sqInt i;
-    sqInt mask;
-    sqInt p1;
-    sqInt p2;
-    sqInt result;
-    sqInt v;
-    sqInt d;
-    sqInt destPix;
-    sqInt mask3;
-    sqInt srcPix;
+	sqInt alpha;
+	sqInt nBits;
+	sqInt nParts;
+	sqInt p2;
+	sqInt result;
+	sqInt p1;
+	sqInt i;
+	sqInt v;
+	sqInt mask;
+	sqInt d;
+	sqInt destPix;
+	sqInt srcPix;
+	sqInt mask3;
 
 	alpha = sourceWord;
 	if (alpha == 0) {
@@ -3853,16 +4391,16 @@
 	For non-rgb, return the number of differing pixels. */
 
 static sqInt rgbDiffwith(sqInt sourceWord, sqInt destinationWord) {
-    sqInt bitsPerColor;
-    sqInt destPixVal;
-    sqInt destShifted;
-    sqInt diff;
-    sqInt i;
-    sqInt maskShifted;
-    sqInt pixMask;
-    sqInt rgbMask;
-    sqInt sourcePixVal;
-    sqInt sourceShifted;
+	sqInt sourcePixVal;
+	sqInt bitsPerColor;
+	sqInt diff;
+	sqInt sourceShifted;
+	sqInt pixMask;
+	sqInt rgbMask;
+	sqInt destShifted;
+	sqInt i;
+	sqInt maskShifted;
+	sqInt destPixVal;
 
 	pixMask = maskTable[destDepth];
 	if (destDepth == 16) {
@@ -3905,14 +4443,16 @@
 /*	Convert the given pixel value with nBitsIn bits for each color component to a pixel value with nBitsOut bits for each color component. Typical values for nBitsIn/nBitsOut are 3, 5, or 8. */
 
 static sqInt rgbMapfromto(sqInt sourcePixel, sqInt nBitsIn, sqInt nBitsOut) {
-    sqInt d;
-    sqInt destPix;
-    sqInt mask;
-    sqInt srcPix;
+	sqInt d;
+	sqInt destPix;
+	sqInt srcPix;
+	sqInt mask;
 
 	if (((d = nBitsOut - nBitsIn)) > 0) {
 
 		/* Expand to more bits by zero-fill */
+
+
 		/* Transfer mask */
 
 		mask = (1 << nBitsIn) - 1;
@@ -3923,11 +4463,22 @@
 		srcPix = srcPix << d;
 		return (destPix + (srcPix & mask)) + ((srcPix << d) & (mask << nBitsOut));
 	} else {
+
+		/* Compress to fewer bits by truncation */
+
 		if (d == 0) {
 			if (nBitsIn == 5) {
+
+				/* Sometimes called with 16 bits, though pixel is 15,
+					but we must never return more than 15. */
+
 				return sourcePixel & 32767;
 			}
 			if (nBitsIn == 8) {
+
+				/* Sometimes called with 32 bits, though pixel is 24,
+					but we must never return more than 24. */
+
 				return sourcePixel & 16777215;
 			}
 			return sourcePixel;
@@ -3954,58 +4505,103 @@
 
 static sqInt rgbMaxwith(sqInt sourceWord, sqInt destinationWord) {
 	if (destDepth < 16) {
+
+		/* Max each pixel separately */
+
 		return partitionedMaxwithnBitsnPartitions(sourceWord, destinationWord, destDepth, destPPW);
 	}
 	if (destDepth == 16) {
+
+		/* Max RGB components of each pixel separately */
+

@@ Diff output truncated at 50000 characters. @@


More information about the Vm-dev mailing list