[squeak-dev] The Trunk: Collections-ar.125.mcz

commits at source.squeak.org commits at source.squeak.org
Fri Sep 4 04:41:20 UTC 2009


Andreas Raab uploaded a new version of Collections to project The Trunk:
http://source.squeak.org/trunk/Collections-ar.125.mcz

==================== Summary ====================

Name: Collections-ar.125
Author: ar
Time: 3 September 2009, 9:40:06 am
UUID: 65d7a3ef-c4cd-d847-a137-8ebe7bcae3be
Ancestors: Collections-jcg.124

http://bugs.squeak.org/view.php?id=7314

Summary  	 0007314: UTF8 conversion speedup
Description 	The conversions between Squeak and UTF8 can be fairly slow. The attached changes speed up the typical conversions dramatically. 

=============== Diff against Collections-jcg.124 ===============

Item was added:
+ ----- Method: ByteString>>squeakToUtf8 (in category 'converting') -----
+ squeakToUtf8
+ 	"Answer the receiver converted to UTF-8, using a fast path for byte strings.
+ 	Runs of bytes that Latin1ToUtf8Map leaves unmarked are copied through unchanged;
+ 	every marked byte is replaced by its entry in Latin1ToUtf8Encodings.
+ 	Answers the receiver itself when no byte is marked."
+ 	| utf8Stream copyStart markedIndex |
+ 	Latin1ToUtf8Map ifNil: [^super squeakToUtf8]. "class tables not set up yet; use the inherited conversion"
+ 	copyStart := 1.
+ 	markedIndex := ByteString findFirstInString: self inSet: Latin1ToUtf8Map startingAt: copyStart.
+ 	markedIndex = 0 ifTrue: [^self]. "nothing to translate"
+ 	utf8Stream := (String new: self size * 2) writeStream.
+ 	"markedIndex is known to be non-zero here, so testing before the body runs it at least once"
+ 	[markedIndex = 0] whileFalse: [
+ 		"copy the untouched run that precedes the marked byte"
+ 		utf8Stream next: markedIndex - copyStart putAll: self startingAt: copyStart.
+ 		"tables are indexed by byte value + 1"
+ 		utf8Stream nextPutAll: (Latin1ToUtf8Encodings at: (self byteAt: markedIndex) + 1).
+ 		copyStart := markedIndex + 1.
+ 		markedIndex := ByteString findFirstInString: self inSet: Latin1ToUtf8Map startingAt: copyStart].
+ 	"copy whatever follows the last marked byte"
+ 	utf8Stream next: self size - copyStart + 1 putAll: self startingAt: copyStart.
+ 	^utf8Stream contents
+ !

Item was added:
+ ----- Method: ByteString>>utf8ToSqueak (in category 'converting') -----
+ utf8ToSqueak
+ 	"Convert the given string from UTF-8 using the fast path if converting to Latin-1.
+ 	Runs of bytes that Latin1ToUtf8Map leaves unmarked are copied through unchanged;
+ 	each marked byte is taken as the lead byte of a two-, three- or four-byte sequence
+ 	and replaced by the decoded character.
+ 	Answers the receiver itself when no byte is marked, or when the bytes are not valid
+ 	UTF-8 (unrecognized lead byte, bad continuation byte, or a sequence cut off by the
+ 	end of the string), in which case the receiver is presumed to be Latin-1 already."
+ 	| outStream lastIndex nextIndex byte1 byte2 byte3 byte4 unicode |
+ 	Latin1ToUtf8Map ifNil:[^super utf8ToSqueak]. "installation guard"
+ 	lastIndex := 1.
+ 	nextIndex := ByteString findFirstInString: self inSet: Latin1ToUtf8Map startingAt: lastIndex.
+ 	nextIndex = 0 ifTrue:[^self].
+ 	outStream := (String new: self size) writeStream.
+ 	[outStream next: nextIndex-lastIndex putAll: self startingAt: lastIndex.
+ 	"Forget the code point of the previous iteration; otherwise an unrecognized
+ 	lead byte would slip past the nil check below and re-emit the old character."
+ 	unicode := nil.
+ 	byte1 := self byteAt: nextIndex.
+ 	(byte1 bitAnd: 16rE0) = 192 ifTrue: [ "two bytes"
+ 		nextIndex + 1 > self size ifTrue:[^self]. "truncated sequence; presume Latin-1"
+ 		byte2 := self byteAt: (nextIndex := nextIndex+1).
+ 		(byte2 bitAnd: 16rC0) = 16r80 ifFalse:[^self]. "invalid UTF-8; presume Latin-1"
+ 		unicode := ((byte1 bitAnd: 31) bitShift: 6) + (byte2 bitAnd: 63)].
+ 	(byte1 bitAnd: 16rF0) = 224 ifTrue: [ "three bytes"
+ 		nextIndex + 2 > self size ifTrue:[^self]. "truncated sequence; presume Latin-1"
+ 		byte2 := self byteAt: (nextIndex := nextIndex+1).
+ 		(byte2 bitAnd: 16rC0) = 16r80 ifFalse:[^self]. "invalid UTF-8; presume Latin-1"
+ 		byte3 := self byteAt: (nextIndex := nextIndex+1).
+ 		(byte3 bitAnd: 16rC0) = 16r80 ifFalse:[^self]. "invalid UTF-8; presume Latin-1"
+ 		unicode := ((byte1 bitAnd: 15) bitShift: 12) + ((byte2 bitAnd: 63) bitShift: 6)
+ 			+ (byte3 bitAnd: 63)].
+ 	(byte1 bitAnd: 16rF8) = 240 ifTrue: [ "four bytes"
+ 		nextIndex + 3 > self size ifTrue:[^self]. "truncated sequence; presume Latin-1"
+ 		byte2 := self byteAt: (nextIndex := nextIndex+1).
+ 		(byte2 bitAnd: 16rC0) = 16r80 ifFalse:[^self]. "invalid UTF-8; presume Latin-1"
+ 		byte3 := self byteAt: (nextIndex := nextIndex+1).
+ 		(byte3 bitAnd: 16rC0) = 16r80 ifFalse:[^self]. "invalid UTF-8; presume Latin-1"
+ 		byte4 := self byteAt: (nextIndex := nextIndex+1).
+ 		(byte4 bitAnd: 16rC0) = 16r80 ifFalse:[^self]. "invalid UTF-8; presume Latin-1"
+ 		unicode := ((byte1 bitAnd: 16r7) bitShift: 18) +
+ 						((byte2 bitAnd: 63) bitShift: 12) + 
+ 						((byte3 bitAnd: 63) bitShift: 6) +
+ 						(byte4 bitAnd: 63)].
+ 	unicode ifNil:[^self]. "invalid UTF-8; presume Latin-1"
+ 	outStream nextPut: (Character value: unicode).
+ 	"nextIndex now sits on the last byte of the sequence just decoded"
+ 	lastIndex := nextIndex + 1.
+ 	nextIndex := ByteString findFirstInString: self inSet: Latin1ToUtf8Map startingAt: lastIndex.
+ 	nextIndex = 0] whileFalse.
+ 	outStream next: self size-lastIndex+1 putAll: self startingAt: lastIndex.
+ 	^outStream contents
+ !

Item was changed:
  String variableByteSubclass: #ByteString
  	instanceVariableNames: ''
+ 	classVariableNames: 'Latin1ToUtf8Encodings Latin1ToUtf8Map'
- 	classVariableNames: ''
  	poolDictionaries: ''
  	category: 'Collections-Strings'!
  
  !ByteString commentStamp: '<historical>' prior: 0!
  This class represents the array of 8 bit wide characters.
  !

Item was added:
+ ----- Method: ByteString class>>initialize (in category 'initialization') -----
+ initialize
+ 	"ByteString initialize"
+ 	"Build the two 256-entry class tables, both indexed by byte value + 1.
+ 	Latin1ToUtf8Map holds 1 for every byte whose UTF-8 encoding differs from the
+ 	byte itself and 0 otherwise; Latin1ToUtf8Encodings holds the encoded string
+ 	for the bytes marked 1 and stays nil for the others."
+ 	| single encoded slot |
+ 	Latin1ToUtf8Map := ByteArray new: 256.
+ 	Latin1ToUtf8Encodings := Array new: 256.
+ 	0 to: 255 do: [:code |
+ 		slot := code + 1.
+ 		single := String with: (Character value: code).
+ 		encoded := single convertToWithConverter: UTF8TextConverter new.
+ 		single = encoded
+ 			ifTrue: [Latin1ToUtf8Map at: slot put: 0 "no translation needed"]
+ 			ifFalse: [
+ 				Latin1ToUtf8Map at: slot put: 1. "translation needed"
+ 				Latin1ToUtf8Encodings at: slot put: encoded]].!




More information about the Squeak-dev mailing list