[squeak-dev] The Inbox: Regex-Core-ct.63.mcz

commits at source.squeak.org commits at source.squeak.org
Mon Aug 23 16:35:36 UTC 2021


A new version of Regex-Core was added to project The Inbox:
http://source.squeak.org/inbox/Regex-Core-ct.63.mcz

==================== Summary ====================

Name: Regex-Core-ct.63
Author: ct
Time: 23 August 2021, 6:35:34.901768 pm
UUID: d8bde553-3c08-514b-af2e-6817361b0737
Ancestors: Regex-Core-mt.61

Adds support for non-capturing groups. Also fixes a bug while parsing lookaround-like regexes (see #testLookaroundParser, Regex-Tests-Core-ct.18).

=============== Diff against Regex-Core-mt.61 ===============

Item was changed:
  ----- Method: RxMatcher>>syntaxRegex: (in category 'double dispatch') -----
  syntaxRegex: regexNode
+ 	"Double dispatch from the syntax tree.
+ 	Regex node is a chain of branches to be tried. Should compile this into a bundle of parallel branches, between two marker nodes.
+ 	A non-capturing regex node is compiled into the same bundle of parallel branches, but joined on a plain link instead of being enclosed in marker nodes, so no markers are allocated for it."
+ 
+ 	| startNode endNode alternatives |
+ 	regexNode isCapturing ifFalse: [
+ 		"Compile the whole chain of alternatives, not only the first branch; otherwise (?:a|b) would silently lose b.
+ 		NOTE(review): assumes RxmLink new is a pass-through node that only forwards to its successor -- confirm."
+ 		^ self hookBranchOf: regexNode onto: RxmLink new].
- 	"Double dispatch from the syntax tree. 
- 	Regex node is a chain of branches to be tried. Should compile this 
- 	into a bundle of parallel branches, between two marker nodes." 
  	
+ 	startNode := RxmMarker new index: self allocateMarker.
+ 	endNode := RxmMarker new index: self allocateMarker.
- 	| startIndex endIndex endNode alternatives |
- 	startIndex := self allocateMarker.
- 	endIndex := self allocateMarker.
- 	endNode := RxmMarker new index: endIndex.
  	alternatives := self hookBranchOf: regexNode onto: endNode.
+ 	^ startNode
- 	^(RxmMarker new index: startIndex)
  		pointTailTo: alternatives;
  		yourself!

Item was changed:
  ----- Method: RxParser>>atom (in category 'recursive descent') -----
  atom
  	"An atom is one of a lot of possibilities, see below."
  
  	| atom |
  	(lookahead == nil 
  	or: [ lookahead == $| 
  	or: [ lookahead == $)
  	or: [ lookahead == $*
  	or: [ lookahead == $+ 
  	or: [ lookahead == $? ]]]]])
  		ifTrue: [ ^RxsEpsilon new ].
  		
  	lookahead == $( 
  		ifTrue: [
+ 			"<atom> ::= <group>
+ 			#group consumes both parentheses itself."
+ 			^ self group ].
- 			"<atom> ::= '(' <regex> ')' "
- 			self match: $(.
- 			atom := self regex.
- 			self match: $).
- 			^atom ].
  	
  	lookahead == $[
  		ifTrue: [
  			"<atom> ::= '[' <characterSet> ']' "
  			self match: $[.
  			atom := self characterSet.
  			self match: $].
  			^atom ].
  	
  	lookahead == $: 
  		ifTrue: [
  			"<atom> ::= ':' <messagePredicate> ':' "
  			self match: $:.
  			atom := self messagePredicate.
  			self match: $:.
  			^atom ].
  	
  	lookahead == $. 
  		ifTrue: [
  			"any non-whitespace character"
  			self next.
  			^RxsContextCondition new beAny].
  	
  	lookahead == $^ 
  		ifTrue: [
  			"beginning of line condition"
  			self next.
  			^RxsContextCondition new beBeginningOfLine].
  	
  	lookahead == $$ 
  		ifTrue: [
  			"end of line condition"
  			self next.
  			^RxsContextCondition new beEndOfLine].
  		
  	lookahead == $\ 
  		ifTrue: [
  			"<atom> ::= '\' <character>"
  			self next ifNil: [ self signalParseError: 'bad quotation' ].
  			(BackslashConstants includesKey: lookahead) ifTrue: [
  				atom := RxsCharacter with: (BackslashConstants at: lookahead).
  				self next.
+ 				^ atom].
+ 			self
+ 				ifSpecial: lookahead
+ 				then: [:node | self next. ^ node]].
- 				^atom].
- 			self ifSpecial: lookahead
- 				then: [:node | self next. ^node]].
  		
  	"If passed through the above, the following is a regular character."
  	atom := RxsCharacter with: lookahead.
  	self next.
  	^atom!

Item was added:
+ ----- Method: RxParser>>group (in category 'recursive descent') -----
+ group
+ 	"Parse a parenthesized group, starting at its opening parenthesis.
+ 	<group> ::= '(' <regex> ')' | '(?:' <regex> ')' | '(?' <lookaround> ')'
+ 	Answer the syntax node of the group. Signal a parse error if '(?' is followed by anything other than a non-capturing or lookaround prefix."
+ 
+ 	self match: $(.
+ 	lookahead == $?
+ 		ifFalse: [
+ 			| group |
+ 			"<group> ::= '(' <regex> ')' "
+ 			group := self regex.
+ 			self match: $).
+ 			^ group].
+ 	
+ 	"Skip the question mark; the next character selects the kind of special group."
+ 	self next.
+ 	
+ 	lookahead == $:
+ 		ifTrue: [
+ 			"non-capturing group"
+ 			"<group> ::= '(?:' <regex> ')' "
+ 			| group |
+ 			self next.
+ 			group := self regex.
+ 			group beNonCapturing.
+ 			self match: $).
+ 			^ group].
+ 	
+ 	('<=!!' includes: lookahead)
+ 		ifTrue: [
+ 			| lookaround |
+ 			lookaround := self lookAround.
+ 			self match: $).
+ 			^ lookaround ].
+ 	
+ 	"Name the offending prefix in the error, in the same way as #lookAround does."
+ 	^ self signalParseError: 'Invalid group expression (?', lookahead asString!

Item was changed:
  ----- Method: RxParser>>lookAround (in category 'recursive descent') -----
  lookAround
  	"Parse a lookaround expression after: (?<lookaround>) 
+ 	<lookaround> ::= ['<'] ('!!' | '=') <regex>
+ 	An optional leading < selects lookbehind instead of lookahead.
+ 	Positive lookahead: ?=
+ 	Negative lookahead: ?!!
+ 	Positive lookbehind: ?<=
+ 	Negative lookbehind: ?<!!"
+ 
- 	<lookaround> ::= !!<regex> | =<regex>"
  	| lookbehind positive |
+ 	('<!!=' includes: lookahead) ifFalse: [
- 	('!!=<' includes: lookahead) ifFalse: [
  		^ self signalParseError: 'Invalid lookaround expression ?', lookahead asString].
+ 	
  	lookbehind := lookahead == $<
  		ifTrue: [self next];
  		yourself.
+ 	
+ 	('!!=' includes: lookahead) ifFalse: [
+ 		^ self signalParseError: 'Invalid lookaround expression'].
+ 	
  	positive := lookahead == $=.
  	self next.
  	^ RxsLookaround
  		with: self regex
  		forward: lookbehind not
  		positive: positive!

Item was changed:
  ----- Method: RxParser>>piece (in category 'recursive descent') -----
  piece
  	"<piece> ::= <atom> | <atom>* | <atom>+ | <atom>? | <atom>{<number>,<number>}"
  
  	| atom |
  	atom := self atom.
  	
  	lookahead == $*
  		ifTrue: [ 
  			self next.
  			atom isNullable
  				ifTrue: [ self signalNullableClosureParserError ].
  			^ RxsPiece new initializeStarAtom: atom ].
  
  	lookahead == $+
  		ifTrue: [ 
  			self next.
  			atom isNullable
  				ifTrue: [ self signalNullableClosureParserError ].
  			^ RxsPiece new initializePlusAtom: atom ].
  
  	lookahead == $?
  		ifTrue: [ 
  			self next.
+ 			"A nullable atom followed by ? is rejected here, just like for * and + above; lookarounds are no longer parsed in this method."
  			atom isNullable
+ 				ifTrue: [ self signalNullableClosureParserError ].
- 				ifTrue: [ 
- 					^ self lookAround ].
  			^ RxsPiece new initializeOptionalAtom: atom ].
  	
  	lookahead == ${
  		ifTrue: [
  			^ self quantifiedAtom: atom ].
  		
  	^ RxsPiece new initializeAtom: atom!

Item was changed:
  RxsNode subclass: #RxsRegex
+ 	instanceVariableNames: 'branch regex isCapturing'
- 	instanceVariableNames: 'branch regex'
  	classVariableNames: ''
  	poolDictionaries: ''
  	category: 'Regex-Core'!
  
  !RxsRegex commentStamp: 'Tbn 11/12/2010 23:15' prior: 0!
  -- Regular Expression Matcher v 1.1 (C) 1996, 1999 Vassili Bykov
  --
  The body of a parenthesized thing, or a top-level expression, also an atom.  
  
  Instance variables:
  	branch		<RxsBranch>
+ 	isCapturing	<Boolean>	set to true by #initialize, set to false by #beNonCapturing
  	regex		<RxsRegex | RxsEpsilon>!

Item was added:
+ ----- Method: RxsRegex>>beNonCapturing (in category 'accessing') -----
+ beNonCapturing
+ 	"Mark the receiver as a non-capturing group by clearing its isCapturing flag. The parser sends this for groups written as (?:...)."
+ 
+ 	isCapturing := false.!

Item was added:
+ ----- Method: RxsRegex>>initialize (in category 'initialize-release') -----
+ initialize
+ 	"Regex nodes are capturing by default; see #beNonCapturing for the opposite."
+ 
+ 	super initialize.
+ 	isCapturing := true.!

Item was added:
+ ----- Method: RxsRegex>>isCapturing (in category 'accessing') -----
+ isCapturing
+ 	"Answer whether the receiver is a capturing group.
+ 	Answer true if the flag was never set, so that instances which did not run #initialize (e.g. ones that existed before this instance variable was added) keep the capturing behavior instead of answering nil."
+ 
+ 	^ isCapturing ifNil: [true]!



More information about the Squeak-dev mailing list