# Script parameters.
#   PsxFile - NOTE(review): presumably the path of the PSX source file to process.
#             It is not referenced anywhere in the visible part of this script;
#             confirm its meaning against the code that consumes it.
param (
$PsxFile
)
# Kinds of token produced by Lexer.NextToken(). The trailing comment on each
# member shows the source text that yields that token.
# NOTE(review): the TAG_NAME / ATTR_NAME comments list '.' as a legal character,
# but the identifier pattern used by the lexer is [a-zA-Z0-9_-] (no '.');
# confirm which of the two is intended.
enum TokenType {
OPEN_ELEMENT_START # <
CLOSE_ELEMENT # >
VOID_ELEMENT_CLOSE # />
EQUAL # =
OPEN_ELEMENT_END # </
TAG_NAME # [a-zA-Z0-9_-\.]+
ATTR_NAME # [a-zA-Z0-9_-\.]+
ATTR_VALUE # plain identifier, or the text inside a "..." / '...' quoted value
ATTR_VALUE_SCRIPTBLOCK # { ... } (the literal is the text between the braces)
ATTR_SPLAT # @VariableName (the literal is the name without the @)
EOF # end of input; the literal is the empty string
ILLEGAL # unrecognised character; the literal is the remainder of the input
}
# State of the element currently being lexed. Every state owns a distinct bit
# so that a set of states can be written as one composite value and tested
# with HasFlag (see VALID_ATTRIBUTE_NAME_START).
#
# NOTE(review): AFTER_ATTRIBUTE_NAME, AFTER_ATTRIBUTE_EQUALS,
# AFTER_ATTRIBUTE_VALUE, CLOSED, END_AFTER_TAG and END_CLOSED are used by the
# lexer but were missing from this declaration (a stray ',' stood in their
# place). They are restored here on the unused powers of two. 8 and 32 must be
# the attribute-name / attribute-value states because they appear in
# VALID_ATTRIBUTE_NAME_START; the remaining ordering is inferred - confirm.
[Flags()] enum ElementState {
    UNOPENED               = 1     # before <
    OPENED                 = 2     # after <
    AFTER_TAG              = 4     # after the start tag name
    AFTER_ATTRIBUTE_NAME   = 8     # after an attribute name
    AFTER_ATTRIBUTE_EQUALS = 16    # after the = that follows an attribute name
    AFTER_ATTRIBUTE_VALUE  = 32    # after an attribute value
    CLOSED                 = 64    # after the > that ends the start tag
    CLOSED_VOID            = 128   # after />
    END_OPENED             = 256   # after </
    END_AFTER_TAG          = 512   # after the end tag name
    END_CLOSED             = 1024  # after the > that ends the end tag
    ILLEGAL                = 2048  # an unrecognised character was seen
    # States in which an attribute name, '>' or '/>' may start:
    # AFTER_TAG + AFTER_ATTRIBUTE_NAME + AFTER_ATTRIBUTE_VALUE.
    # NOTE(review): the mask also carries the ILLEGAL bit (2048); kept exactly
    # as in the original expression.
    VALID_ATTRIBUTE_NAME_START = (4 + 8 + 32 + 2048)
}
# A single lexical token: its kind plus the source text it was built from.
class Token {
    # Kind of token.
    [TokenType] $Type
    # Source text associated with the token (may be empty, e.g. for EOF).
    [String] $Literal

    # Creates a token of the given type carrying the given literal text.
    Token($Type, $Literal) {
        $this.Type = $Type
        $this.Literal = $Literal
    }

    # Returns "<Type> <Literal>".
    [String] ToString() {
        return "{0} {1}" -f $this.Type, $this.Literal
    }

    # Two tokens are equal when both Type and Literal match. The literal is
    # compared with -eq, i.e. case-insensitively. Anything that is not a Token
    # (including $null) is never equal.
    [Boolean] Equals($that) {
        if ($that -isnot [Token]) {
            return $false
        }
        return (($this.Type -eq $that.Type) -and ($this.Literal -eq $that.Literal))
    }

    # Hash code consistent with Equals, so tokens behave correctly as hashtable
    # keys and in hash sets. The literal is hashed case-insensitively because
    # Equals compares it case-insensitively.
    [Int] GetHashCode() {
        $LiteralHash = [StringComparer]::InvariantCultureIgnoreCase.GetHashCode($this.Literal)
        return ([Int]$this.Type * 397) -bxor $LiteralHash
    }
}
# Hand-written lexer for a single PSX element: a start tag with optional
# attributes, closed either as a void element (/>) or by a matching end tag.
# Call NextToken() repeatedly; it returns an EOF token once the input is
# exhausted and throws on input that is not valid in the current State.
class Lexer {
    # Complete text being lexed.
    [String] $LexInput
    # Index of the current character within LexInput.
    [Int] $Pos
    # Index of the character that PopChar() will move to next.
    [Int] $NextPos
    # Current character as a one-character string; '' at end of input.
    [String] $Char
    # Name of the start tag, remembered so the end tag can be checked against it.
    [String] $CurrentTag = ''
    # Where in the element the lexer currently is.
    [ElementState] $State = [ElementState]::UNOPENED

    # Starts lexing at the first character of $LexInput.
    Lexer($LexInput) {
        $this.LexInput = $LexInput
        $this.Pos = 0
        $this.NextPos = 1
        # Read from the [String] property rather than the raw argument: the
        # argument may be a string array, in which case [0] would be a whole
        # line instead of a single character.
        if ($this.LexInput.Length -gt 0) {
            $this.Char = [String]$this.LexInput[0]
        } else {
            $this.Char = ''
        }
    }

    # Advances to the next character and returns it ('' at end of input).
    hidden [String] PopChar() {
        if ($this.NextPos -ge $this.LexInput.Length) {
            $this.Char = ''
        } else {
            $this.Char = $this.LexInput[$this.NextPos]
        }
        $this.Pos = $this.NextPos
        $this.NextPos++
        return $this.Char
    }

    # Returns the next character without advancing ('' at end of input).
    hidden [String] PeekChar() {
        if ($this.NextPos -ge $this.LexInput.Length) {
            return ''
        } else {
            return $this.LexInput[$this.NextPos]
        }
    }

    # True when the current character may appear in a tag or attribute name.
    hidden [Boolean] CharIsIdentifier() {
        return $this.Char -match '[a-zA-Z0-9_-]'
    }

    # Consumes the run of identifier characters starting at the current
    # position and returns it. The lexer is left on the first character after
    # the identifier.
    hidden [String] PopIdentifier() {
        $Start = $this.Pos
        while ($this.CharIsIdentifier()) {
            $this.PopChar()
        }
        # js string.substring is (indexStart, indexEnd)
        # dotnet String.SubString is (startIndex, length)
        return $this.LexInput.SubString($Start, $this.Pos - $Start)
    }

    # Returns the identifier starting at the current position without
    # consuming it. Scans the input directly, so no lexer state is touched.
    hidden [String] PeekIdentifier() {
        $End = $this.Pos
        while (($End -lt $this.LexInput.Length) -and (([String]$this.LexInput[$End]) -match '[a-zA-Z0-9_-]')) {
            $End++
        }
        return $this.LexInput.SubString($this.Pos, $End - $this.Pos)
    }

    # Consumes a value delimited by "...", '...' or {...} and returns the text
    # between the delimiters (possibly empty). The lexer is left on the
    # character after the closing delimiter.
    # Throws when the current character is not an opening delimiter or when
    # the input ends before the closing delimiter.
    # TODO: nested braces inside a script block are not tracked; the value
    # ends at the first '}'.
    hidden [String] PopQuotedAttrValue() {
        if ($this.Char -notin '"', "'", '{') {
            $this.Die($this.Char)
        }
        $Opener = $this.Char
        $OpenerPos = $this.Pos
        if ($Opener -eq '{') {
            $ExpectingQuote = '}'
        } else {
            $ExpectingQuote = $Opener
        }
        $this.PopChar() # opening delimiter
        $Start = $this.Pos
        # Test the current character BEFORE advancing so that an empty value
        # ("" / '' / {}) is closed by the character right after the opener.
        while ($this.Char -ne $ExpectingQuote) {
            if ($this.Pos -ge $this.LexInput.Length) {
                # PopChar() keeps returning '' at end of input; without this
                # check an unterminated value would loop forever.
                throw "Unterminated attribute value opened by $Opener (char $OpenerPos)"
            }
            $this.PopChar()
        }
        $End = $this.Pos
        $this.PopChar() # closing delimiter
        return $this.LexInput.SubString($Start, $End - $Start)
    }

    # Skips over any whitespace at the current position.
    hidden [Void] ConsumeWhitespace() {
        while ($this.Char -match '\s') {
            $this.PopChar()
        }
    }

    # Throws an "unexpected token" error naming $Char, the current position
    # and the current state.
    hidden [Void] Die([String] $Char) {
        throw "Unexpected token $char (char $($this.Pos)) at state $($this.State)"
    }

    # Returns the next token and advances past it.
    # Returns an EOF token (repeatedly) once the input is exhausted.
    # Throws when the current character is not allowed in the current State,
    # when the end tag name differs from the start tag name, or when called
    # again after an ILLEGAL token has been produced.
    [Token] NextToken() {
        $Token = $null
        if ($this.Pos -ge $this.LexInput.Length) {
            return [Token]::new( [TokenType]::EOF, '' )
        }
        if ($this.State.HasFlag([ElementState]::ILLEGAL)) {
            throw "Lexer found in illegal state $($this.State)"
        }
        switch -Regex ($this.Char) {
            '<' {
                switch ($this.State) {
                    {$_ -eq [ElementState]::UNOPENED} {
                        $Token = [Token]::new( [TokenType]::OPEN_ELEMENT_START, $this.Char )
                        $this.State = [ElementState]::OPENED
                    }
                    {$_ -eq [ElementState]::CLOSED} {
                        # Only an end tag may follow a closed start tag; fail
                        # loudly instead of returning a $null token.
                        if ($this.PeekChar() -ne '/') {
                            $this.Die('<')
                        }
                        $Token = [Token]::new( [TokenType]::OPEN_ELEMENT_END, $this.Char + $this.PopChar() )
                        $this.State = [ElementState]::END_OPENED
                    }
                    default {
                        $this.Die('<')
                    }
                }
            }
            '/' {
                switch ($this.State) {
                    {[ElementState]::VALID_ATTRIBUTE_NAME_START.HasFlag($_)} {
                        # A '/' inside a start tag is only valid as part of '/>'.
                        if ($this.PeekChar() -ne '>') {
                            $this.Die('/')
                        }
                        $Token = [Token]::new( [TokenType]::VOID_ELEMENT_CLOSE, $this.Char + $this.PopChar() )
                        $this.State = [ElementState]::CLOSED_VOID
                    }
                    default {
                        $this.Die('/')
                    }
                }
            }
            '>' {
                # check for nested psx here!!!
                switch ($this.State) {
                    {[ElementState]::VALID_ATTRIBUTE_NAME_START.HasFlag($_)} {
                        $Token = [Token]::new( [TokenType]::CLOSE_ELEMENT, $this.Char )
                        $this.State = [ElementState]::CLOSED
                    }
                    {$_ -eq [ElementState]::END_AFTER_TAG} {
                        $Token = [Token]::new( [TokenType]::CLOSE_ELEMENT, $this.Char )
                        $this.State = [ElementState]::END_CLOSED
                    }
                    default {
                        $this.Die('>')
                    }
                }
            }
            {$_ -in '"', "'"} {
                # TODO: check for nested PowerShell
                if ($this.State -ne [ElementState]::AFTER_ATTRIBUTE_EQUALS) {
                    $this.Die($_)
                }
                $this.State = [ElementState]::AFTER_ATTRIBUTE_VALUE
                return [Token]::new( [TokenType]::ATTR_VALUE , $this.PopQuotedAttrValue() )
            }
            '{' {
                if ($this.State -ne [ElementState]::AFTER_ATTRIBUTE_EQUALS) {
                    $this.Die($_)
                }
                $this.State = [ElementState]::AFTER_ATTRIBUTE_VALUE
                return [Token]::new( [TokenType]::ATTR_VALUE_SCRIPTBLOCK , $this.PopQuotedAttrValue() )
            }
            '@' {
                # Validate the state before consuming '@' so the reported
                # position points at the '@' itself.
                $SplatStates = @(
                    [ElementState]::AFTER_TAG,
                    [ElementState]::AFTER_ATTRIBUTE_NAME,
                    [ElementState]::AFTER_ATTRIBUTE_VALUE
                )
                if ($this.State -notin $SplatStates) {
                    $this.Die('@')
                }
                $this.PopChar() # drop @
                $SplatName = $this.PopIdentifier()
                if ($SplatName -eq '') {
                    # '@' must be followed by a variable name.
                    $this.Die($this.Char)
                }
                # NOTE(review): State is left unchanged after a splat, as in
                # the original; confirm whether it should become
                # AFTER_ATTRIBUTE_VALUE so that '=' cannot follow a splat.
                return [Token]::new( [TokenType]::ATTR_SPLAT, $SplatName )
            }
            '[a-zA-Z0-9_-]' {
                $TokType = $null
                switch ($this.State) {
                    {$_ -eq [ElementState]::OPENED} {
                        $TokType = [TokenType]::TAG_NAME
                        $this.CurrentTag = $this.PeekIdentifier()
                        $this.State = [ElementState]::AFTER_TAG
                    }
                    {[ElementState]::VALID_ATTRIBUTE_NAME_START.HasFlag($_)} {
                        $TokType = [TokenType]::ATTR_NAME
                        $this.State = [ElementState]::AFTER_ATTRIBUTE_NAME
                    }
                    {$_ -eq [ElementState]::AFTER_ATTRIBUTE_EQUALS} {
                        $TokType = [TokenType]::ATTR_VALUE
                        $this.State = [ElementState]::AFTER_ATTRIBUTE_VALUE
                    }
                    {$_ -eq [ElementState]::END_OPENED} {
                        $Cur = $this.CurrentTag
                        $Peek = $this.PeekIdentifier()
                        if ($Cur -ne $Peek) {
                            throw "Start tag name ($Cur) and end tag name ($Peek) doesn't match"
                        }
                        $TokType = [TokenType]::TAG_NAME
                        $this.State = [ElementState]::END_AFTER_TAG
                    }
                    default {
                        $this.Die( $this.Char )
                    }
                }
                return [Token]::new($TokType, $this.PopIdentifier())
            }
            '=' {
                if ($this.State -ne [ElementState]::AFTER_ATTRIBUTE_NAME) {
                    $this.Die('=')
                }
                $Token = [Token]::new([TokenType]::EQUAL, $this.Char)
                $this.State = [ElementState]::AFTER_ATTRIBUTE_EQUALS
            }
            '\s' {
                $this.ConsumeWhitespace()
                return $this.NextToken()
            }
            default {
                $Token = [Token]::new([TokenType]::ILLEGAL, $this.LexInput.Substring($this.Pos))
                $this.State = [ElementState]::ILLEGAL
            }
        }
        # Branches that did not return above produced a token that ends on the
        # current character; step past it.
        $this.PopChar()
        return $Token
    }
}