[squeak-dev] The Trunk: Multilingual-ul.114.mcz

commits at source.squeak.org commits at source.squeak.org
Sat Mar 27 19:44:07 UTC 2010


Levente Uzonyi uploaded a new version of Multilingual to project The Trunk:
http://source.squeak.org/trunk/Multilingual-ul.114.mcz

==================== Summary ====================

Name: Multilingual-ul.114
Author: ul
Time: 27 March 2010, 8:38:42.093 pm
UUID: fb6bc83b-7972-5142-afff-b4de65f2a5a5
Ancestors: Multilingual-bf.110

- removed unused selectors and instance variables from UTF8TextConverter
- added string encoding/decoding capabilities to TextConverter
- copied ByteString's #utf8ToSqueak and #squeakToUtf8 implementation to UTF8TextConverter class without the Latin-1 fallback code (an error is raised if the input is not valid)

=============== Diff against Multilingual-bf.110 ===============

Item was added:
+ ----- Method: UTF8TextConverter class>>decodeByteString: (in category 'conversion') -----
+ decodeByteString: aByteString
+ 	"Convert the given string from UTF-8.
+ 	Fast path: if no byte of aByteString is found in latin1Map, answer aByteString itself, unchanged.
+ 	(latin1Map is defined outside this method; presumably it marks the non-ASCII bytes - confirm.)
+ 	Otherwise copy the runs between such bytes verbatim and decode every 2-, 3- or 4-byte sequence into one character. A byte order mark (16rFEFF) is skipped.
+ 	Answer the result of #errorMalformedInput when a lead byte or a continuation byte is not valid."
+ 
+ 	| outStream lastIndex nextIndex byte1 byte2 byte3 byte4 unicode |
+ 	lastIndex := 1.
+ 	(nextIndex := ByteString findFirstInString: aByteString inSet: latin1Map startingAt: lastIndex) = 0
+ 		ifTrue: [ ^aByteString ].
+ 	outStream := (String new: aByteString size) writeStream.
+ 	[
+ 		outStream next: nextIndex - lastIndex putAll: aByteString startingAt: lastIndex.
+ 		byte1 := aByteString byteAt: nextIndex.
+ 		"Forget the code point of the previous sequence. Without this reset an invalid lead byte
+ 		(one that matches none of the three patterns below) following a valid sequence would pass
+ 		the nil check further down and silently emit the previous character again."
+ 		unicode := nil.
+ 		(byte1 bitAnd: 16rE0) = 192 ifTrue: [ "two bytes"
+ 			byte2 := aByteString byteAt: (nextIndex := nextIndex + 1).
+ 			(byte2 bitAnd: 16rC0) = 16r80 ifFalse:[	^self errorMalformedInput ].
+ 			unicode := ((byte1 bitAnd: 31) bitShift: 6) + (byte2 bitAnd: 63)].
+ 		(byte1 bitAnd: 16rF0) = 224 ifTrue: [ "three bytes"
+ 			byte2 := aByteString byteAt: (nextIndex := nextIndex + 1).
+ 			(byte2 bitAnd: 16rC0) = 16r80 ifFalse:[ ^self errorMalformedInput ].
+ 			byte3 := aByteString byteAt: (nextIndex := nextIndex + 1).
+ 			(byte3 bitAnd: 16rC0) = 16r80 ifFalse:[ ^self errorMalformedInput ].
+ 			unicode := ((byte1 bitAnd: 15) bitShift: 12) + ((byte2 bitAnd: 63) bitShift: 6)
+ 				+ (byte3 bitAnd: 63)].
+ 		(byte1 bitAnd: 16rF8) = 240 ifTrue: [ "four bytes"
+ 			byte2 := aByteString byteAt: (nextIndex := nextIndex + 1).
+ 			(byte2 bitAnd: 16rC0) = 16r80 ifFalse:[ ^self errorMalformedInput ].
+ 			byte3 := aByteString byteAt: (nextIndex := nextIndex + 1).
+ 			(byte3 bitAnd: 16rC0) = 16r80 ifFalse:[ ^self errorMalformedInput ].
+ 			byte4 := aByteString byteAt: (nextIndex := nextIndex + 1).
+ 			(byte4 bitAnd: 16rC0) = 16r80 ifFalse:[ ^self errorMalformedInput ].
+ 			unicode := ((byte1 bitAnd: 16r7) bitShift: 18) +
+ 							((byte2 bitAnd: 63) bitShift: 12) + 
+ 							((byte3 bitAnd: 63) bitShift: 6) +
+ 							(byte4 bitAnd: 63)].
+ 		"Still nil here means byte1 was not a valid lead byte of a multi-byte sequence."
+ 		unicode ifNil: [ ^self errorMalformedInput ].
+ 		unicode = 16rFEFF ifFalse: [ "Skip byte order mark"
+ 			outStream nextPut: (Unicode value: unicode) ].
+ 		lastIndex := nextIndex + 1.
+ 		(nextIndex := ByteString findFirstInString: aByteString inSet: latin1Map startingAt: lastIndex) = 0 ] whileFalse.
+ 	"Copy whatever follows the last multi-byte sequence."
+ 	^outStream 
+ 		next: aByteString size - lastIndex + 1 putAll: aByteString startingAt: lastIndex;
+ 		contents
+ !

Item was changed:
  ----- Method: UTF8TextConverter>>errorMalformedInput (in category 'conversion') -----
  errorMalformedInput
+ 	
+ 	^self class errorMalformedInput!
- 	^self error: 'Invalid utf8 input detected'!

Item was added:
+ ----- Method: TextConverter>>encodeString: (in category 'conversion') -----
+ encodeString: aString
+ 	"Answer a new String holding whatever the receiver writes when aString is passed through #nextPutAll:toStream:. The result buffer is pre-sized to the size of aString."
+ 
+ 	^String
+ 		new: aString size
+ 		streamContents: [ :encoded | self nextPutAll: aString toStream: encoded ]!

Item was added:
+ ----- Method: UTF8TextConverter>>encodeString: (in category 'conversion') -----
+ encodeString: aString
+ 	"Encode aString. Byte strings are handed to the class-side #encodeByteString:; every other kind of string goes through the inherited implementation."
+ 
+ 	^aString isByteString
+ 		ifTrue: [ self class encodeByteString: aString ]
+ 		ifFalse: [ super encodeString: aString ]!

Item was added:
+ ----- Method: UTF8TextConverter class>>encodeByteString: (in category 'conversion') -----
+ encodeByteString: aByteString
+ 	"Convert the given byte string to UTF-8.
+ 	Fast path: if no byte of aByteString is found in latin1Map, answer aByteString itself, unchanged.
+ 	Otherwise copy the runs between such bytes verbatim and replace each such byte by the entry of latin1Encodings at (byte value + 1).
+ 	(latin1Map and latin1Encodings are defined outside this method; presumably they mark the non-ASCII bytes and hold their UTF-8 encodings - confirm.)"
+ 
+ 	| outStream lastIndex nextIndex |
+ 	lastIndex := 1.
+ 	(nextIndex := ByteString findFirstInString: aByteString inSet: latin1Map startingAt: lastIndex) = 0
+ 		ifTrue: [ ^aByteString ].
+ 	outStream := (String new: aByteString size + 1) writeStream.
+ 	[
+ 		outStream 
+ 			next: nextIndex - lastIndex putAll: aByteString startingAt: lastIndex;
+ 			nextPutAll: (latin1Encodings at: (aByteString byteAt: nextIndex) + 1).
+ 		lastIndex := nextIndex + 1.
+ 		(nextIndex := ByteString findFirstInString: aByteString inSet: latin1Map startingAt: lastIndex) = 0 ] whileFalse.
+ 	^outStream 
+ 		next: aByteString size - lastIndex + 1 putAll: aByteString startingAt: lastIndex;
+ 		contents!

Item was changed:
  ----- Method: UTF8TextConverter>>nextFromStream: (in category 'conversion') -----
  nextFromStream: aStream
  
  	| character1 value1 character2 value2 unicode character3 value3 character4 value4 |
  	aStream isBinary ifTrue: [^ aStream basicNext].
  	character1 := aStream basicNext.
  	character1 isNil ifTrue: [^ nil].
  	value1 := character1 asciiValue.
  	value1 <= 127 ifTrue: [
  		"1-byte character"
- 		currentCharSize := 1.
  		^ character1
  	].
  
  	"at least 2-byte character"
  	character2 := aStream basicNext.
  	character2 = nil ifTrue: [^self errorMalformedInput].
  	value2 := character2 asciiValue.
  
  	(value1 bitAnd: 16rE0) = 192 ifTrue: [
- 		currentCharSize := 2.
  		^ Unicode value: ((value1 bitAnd: 31) bitShift: 6) + (value2 bitAnd: 63).
  	].
  
  	"at least 3-byte character"
  	character3 := aStream basicNext.
  	character3 = nil ifTrue: [^self errorMalformedInput].
  	value3 := character3 asciiValue.
  	(value1 bitAnd: 16rF0) = 224 ifTrue: [
  		unicode := ((value1 bitAnd: 15) bitShift: 12) + ((value2 bitAnd: 63) bitShift: 6)
  				+ (value3 bitAnd: 63).
- 		currentCharSize := 3.
  	].
  
  	(value1 bitAnd: 16rF8) = 240 ifTrue: [
  		"4-byte character"
  		character4 := aStream basicNext.
  		character4 = nil ifTrue: [^self errorMalformedInput].
  		value4 := character4 asciiValue.
- 		currentCharSize := 4.
  		unicode := ((value1 bitAnd: 16r7) bitShift: 18) +
  					((value2 bitAnd: 63) bitShift: 12) + 
  					((value3 bitAnd: 63) bitShift: 6) +
  					(value4 bitAnd: 63).
  	].
  
  	unicode isNil ifTrue: [^self errorMalformedInput].
  	unicode > 16r10FFFD ifTrue: [^self errorMalformedInput].
  	
  	unicode = 16rFEFF ifTrue: [^ self nextFromStream: aStream].
  	^ Unicode value: unicode.
  !

Item was changed:
  TextConverter subclass: #UTF8TextConverter
+ 	instanceVariableNames: ''
- 	instanceVariableNames: 'currentCharSize forceToEncodingTag'
  	classVariableNames: ''
  	poolDictionaries: ''
  	category: 'Multilingual-TextConversion'!
  
  !UTF8TextConverter commentStamp: '<historical>' prior: 0!
  Text converter for UTF-8.  Since the BOM is used to distinguish the MacRoman code and UTF-8 code, BOM is written for UTF-8 by #writeBOMOn: which is called by client.!

Item was added:
+ ----- Method: UTF8TextConverter class>>errorMalformedInput (in category 'utilities') -----
+ errorMalformedInput
+ 	"Signal an error reporting that invalid UTF-8 input was detected, and answer the result of that #error: send."
+ 
+ 	^self error: 'Invalid utf8 input detected'!

Item was added:
+ ----- Method: TextConverter>>decodeString: (in category 'conversion') -----
+ decodeString: aString
+ 	"Answer a new String made of the characters the receiver yields from #nextFromStream: when reading aString, stopping as soon as nil is answered."
+ 
+ 	| source |
+ 	source := aString readStream.
+ 	^String new: aString size streamContents: [ :decoded |
+ 		| next |
+ 		[ (next := self nextFromStream: source) == nil ]
+ 			whileFalse: [ decoded nextPut: next ] ]
+ !

Item was added:
+ ----- Method: CompoundTextConverter>>encodeString: (in category 'conversion') -----
+ encodeString: aString
+ 	"Answer a new String holding the receiver's encoding of aString. After the characters have been written, Latin1 is asked to emit the sequence that resets the receiver's state, if one is needed."
+ 
+ 	^String new: aString size streamContents: [ :encoded |
+ 		self nextPutAll: aString toStream: encoded.
+ 		Latin1 emitSequenceToResetStateIfNeededOn: encoded forState: state ]!

Item was added:
+ ----- Method: UTF8TextConverter>>decodeString: (in category 'conversion') -----
+ decodeString: aString
+ 	"Decode aString. Byte strings are handed to the class-side #decodeByteString:; every other kind of string goes through the inherited implementation."
+ 
+ 	^aString isByteString
+ 		ifTrue: [ self class decodeByteString: aString ]
+ 		ifFalse: [ super decodeString: aString ]!

Item was removed:
- ----- Method: UTF8TextConverter>>currentCharSize (in category 'friend') -----
- currentCharSize
- 
- 	^ currentCharSize.
- !

Item was removed:
- ----- Method: UTF8TextConverter>>forceToEncodingTag: (in category 'accessing') -----
- forceToEncodingTag: encodingTagOrNil
- 
- 	forceToEncodingTag := encodingTagOrNil.
- !

Item was removed:
- ----- Method: UTF8TextConverter>>forceToEncodingTag (in category 'accessing') -----
- forceToEncodingTag
- 
- 	^ forceToEncodingTag.
- !




More information about the Squeak-dev mailing list