Let's build a transpiler! Part 13
This is the thirteenth post in a series of building a transpiler.You can find the previous ones here.
Last time I said we would polish some things and fix some omissions.
VB6 has a feature that allows us to use named arguments. Example: Set SecondNumber = ReadNumber(FirstDigit:=Ch)
We need to scan that ":=". We'll treat it as a binary operator.
These are the changes to GetToken to cope with that:
Case ":"
Set Token = NewToken(Ch, tkSoftLineBreak)
If Not AtEnd Then
Ch = GetChar
If Ch = "=" Then
Token.Kind = tkOperator
Token.Text = ":="
Else
UngetChar Ch
End If
End If
When we added compound operators, we let it slip some that make no sense, like Not= or Is=. We'll fix this now.
ElseIf IsOperator(Token) Then
If Not AtEnd Then
Ch = GetChar
If Ch = "=" Then
Select Case Token.Text
Case "And", "Eqv", "Imp", "Mod", "Or", "Xor"
Token.Text = Token.Text & Ch
Case Else
UngetChar Ch
End Select
Else
UngetChar Ch
End If
End If
We are going to normalize keywords, identifiers, and operators. It means that it doesn't matter if the code has a token written in different forms, like, say, AND, And, AnD, or and. We'll use a canonical form for every identifier.
Changes to IsKeyword:
Private Function IsKeyword(ByVal Token As Token) As Boolean
Dim Keyword As Variant
(...)
If Token.Suffix <> vbNullChar Then Exit Function
If StrComp(Token.Text, Keyword, vbTextCompare) = 0 Then
Token.Text = CStr(Keyword)
Token.Kind = tkKeyword
IsKeyword = True
Exit For
End If
Next
End Function
Changes to IsOperator:
Private Function IsOperator(ByVal Token As Token) As Boolean
Dim Keyword As Variant
If Token.Suffix <> vbNullChar Then Exit Function
For Each Keyword In Array("AddressOf", "And", "AndAlso", "Eqv", "Imp", "Is", "IsNot", "Mod", "New", "Not", _
"Or", "OrElse", "TypeOf", "Xor")
If StrComp(Token.Text, Keyword, vbTextCompare) = 0 Then
Token.Text = CStr(Keyword)
Token.Kind = tkOperator
IsOperator = True
Exit For
End If
Next
End Function
Introducing Normalize:
Private Ids_ As New Dictionary
Private Sub Normalize(ByVal Token As Token)
Dim Contextual As Variant
Dim Key As String
For Each Contextual In Array("Access", "Alias", "Append", "Base", "Binary", "Compare", "Error", "Explicit", "Lib", _
"Line", "Name", "Output", "PtrSafe", "Random", "Read", "Reset", "Spc", "Step", "Tab", "Text", "Width")
If StrComp(Token.Text, Contextual, vbTextCompare) = 0 Then
Token.Text = CStr(Contextual)
Exit Sub
End If
Next
Key = UCase$(Token.Text)
If Not Ids_.Exists(Key) Then Ids_.Add Key, Token.Text
Token.Text = Ids_(Key)
End Sub
We need to change how these functions are called, too. Instead of passing Token.Text we'll pass Token itself.
And we'll also update our syntax-highlighter script.
To distinguish a binary operator dot or bang from a unary one, we added a tilde to them. Our script will need to get rid of it when pretty-printing the code.
We will also create a Parser class to hold the TokenFrom function we created in my last post.
(Click here if you want to skip the code.)
Option Explicit
Private Const SPAN_STRING = "<span style='color:brown;'>"
Private Const SPAN_KEYWORD = "<span style='color:blue;'>"
Private Const SPAN_COMMENT = "<span style='color: green;'>"
Const tkWhiteSpace = 0
Const tkComment = 1
Const tkIdentifier = 2
Const tkEscapedIdentifier = 3
Const tkKeyword = 4
Const tkIntegerNumber = 5
Const tkFloatNumber = 6
Const tkSciNumber = 7
Const tkBinaryNumber = 8
Const tkOctalNumber = 9
Const tkHexaNumber = 10
Const tkFileHandle = 11
Const tkString = 12
Const tkDateTime = 13
Const tkOperator = 14
Const tkLeftParenthesis = 15
Const tkRightParenthesis = 16
Const tkHardLineBreak = 17
Const tkSoftLineBreak = 18
Const tkLineContinuation = 19
Const tkListSeparator = 20
Const tkPrintSeparator = 21
Const tkDirective = 22
Const tkEndOfStream = 23
Const NoContext = 0
Const OptionContext = 1
Const OptionCompareContext = 2
Const OnContext = 3
Const DeclareContext = 4
Const DeclareLibContext = 5
Const DeclareAliasContext = 6
Const ForNextContext = 7
Const ForToContext = 8
Const [Next Keyword Is For] = 9
Const [Next Keyword Is Input | Next Identifier Is Append, Binary, Output, or Random] = 10
Const [Next Keyword Is As or Shared | Next Identifier Is Access] = 11
Const [Next Keyword Is Access/Write | Next Identifier Is Access/Read] = 12
Const [Next Keyword Is Access/Write, Lock, As, or Shared] = 13
Const [Next Keyword Is Lock, As, or Shared] = 14
Const [Next Keyword Is Lock/Write | Next Identifier Is Lock/Read] = 15
Const [Next Keyword Is Lock/Write or As] = 16
Const [Next Keyword Is As] = 17
Const [Next Token Is Filehandle] = 18
Const [Next Identifier Is Len] = 19
Const ForReading = 1
Const ForWriting = 2
Dim vbBack : vbBack = Chr(8)
Private Function EncodeHtml(ByVal Text)
Text = Replace(Text, "&", "&")
Text = Replace(Text, "<", "<")
Text = Replace(Text, ">", ">")
EncodeHtml = Text
End Function
Call PrettyPrint()
Public Sub PrettyPrint()
Dim Nbsp, HtmlFile, Index, Text, FilePath, Token, Scanner, Parser
Rem File path for the source code is passed as a command-line argument.
FilePath = WScript.Arguments(0)
Set Scanner = New Scanner
Scanner.Open FilePath
Set Parser = New Parser
With CreateObject("Scripting.FileSystemObject")
Rem Output file will have the same name as the input file, but with an .HTML extension.
FilePath = .BuildPath(.GetParentFolderName(FilePath), .GetBaseName(FilePath) & ".html")
Set HtmlFile = .OpenTextFile(FilePath, ForWriting, True, True)
End With
Nbsp = True
Do
Set Token = Parser.TokenFrom(Scanner)
Select Case Token.Kind
Case tkWhiteSpace
If Nbsp Then
HtmlFile.Write " "
Else
HtmlFile.Write " "
End If
Case tkComment
HtmlFile.WriteLine SPAN_COMMENT & EncodeHtml(Token.Text) & "</span><br>"
Nbsp = True
Case tkIdentifier, tkIntegerNumber, tkFloatNumber, tkSciNumber
HtmlFile.Write Token.Text
Nbsp = False
Case tkEscapedIdentifier
HtmlFile.Write "[" & Token.Text & "]"
Nbsp = False
Case tkKeyword
HtmlFile.Write SPAN_KEYWORD & Token.Text & "</span>"
Nbsp = False
Case tkOctalNumber
HtmlFile.Write "&O" & Token.Text
Case tkHexaNumber
HtmlFile.Write "&H" & UCase(Token.Text)
Case tkFileHandle
HtmlFile.Write "#" & Token.Text
Case tkString
Text = Token.Text
Text = Replace(Token.Text, """", """""")
Text = EncodeHtml(Text)
HtmlFile.Write SPAN_STRING & """" & Text & """</span>"
Case tkDateTime
HtmlFile.Write "#" & Token.Text & "#"
Case tkOperator
If Scanner.IsLetter(AscW(Token.Text)) Then
HtmlFile.Write SPAN_KEYWORD & Token.Text & "</span>"
ElseIf Left(Token.Text, 1) = "~" Then
HtmlFile.Write Mid(Token.Text, 2)
Else
HtmlFile.Write EncodeHtml(Token.Text)
End If
Case tkLeftParenthesis, tkRightParenthesis, tkSoftLineBreak, tkListSeparator, tkPrintSeparator
HtmlFile.Write Token.Text
Nbsp = False
Case tkLineContinuation
HtmlFile.WriteLine " _<br>"
Nbsp = True
Case tkHardLineBreak
HtmlFile.WriteLine "<br>"
Nbsp = True
Case tkDirective
HtmlFile.Write "#" & Token.Text
Nbsp = False
Case tkEndOfStream
Exit Do
End Select
If Token.Suffix <> vbNullChar Then HtmlFile.Write Token.Suffix
Loop
HtmlFile.Close
End Sub
Class Parser
Private Downgrade_
Private WasAs_
Private LastToken_
Private State_
Private NextToken_
Private Sub Class_Initialize()
Set NextToken_ = Nothing
End Sub
Public Function TokenFrom(ByVal Scanner)
Dim Upgrade, Revoke, Token
If NextToken_ Is Nothing Then
Set Token = Scanner.GetToken
Else
Set Token = NextToken_
Set NextToken_ = Nothing
End If
If IsEndOfContext(Token) Then
State_ = NoContext
Else
Select Case Token.Kind
Case tkOperator
WasAs_ = False
Downgrade_ = Token.Text = "." Or Token.Text = "!"
If LastToken_.Kind = tkWhiteSpace Then
If Token.Text = "." Then Token.Text = "~."
ElseIf Token.Text = "!" Then Token.Text = "~!"
End If
Case tkKeyword
If Downgrade_ Then
Downgrade_ = False
Token.Kind = tkIdentifier
Else
Select Case Token.Text
Case "As"
WasAs_ = True
Select Case State_
Case [Next Keyword Is As or Shared | Next Identifier Is Access], _
[Next Keyword Is Access/Write, Lock, As, or Shared], _
[Next Keyword Is Lock, As, or Shared], _
[Next Keyword Is Lock/Write or As], _
[Next Keyword Is As]
State_ = [Next Token Is Filehandle]
End Select
Case "Date", "String"
If Not WasAs_ Then Token.Kind = tkIdentifier
Case "Declare"
If State_ = NoContext Then State_ = DeclareContext
Case "For"
If State_ = NoContext Then
State_ = ForNextContext
ElseIf State_ = [Next Keyword Is For] Then
State_ = [Next Keyword Is Input | Next Identifier Is Append, Binary, Output, or Random]
End If
Case "Input"
If State_ = [Next Keyword Is Input | Next Identifier Is Append, Binary, Output, or Random] Then
State_ = [Next Keyword Is As or Shared | Next Identifier Is Access]
End If
Case "Lock"
Select Case State_
Case [Next Keyword Is Access/Write, Lock, As, or Shared], _
[Next Keyword Is Lock, As, or Shared]
State_ = [Next Keyword Is Lock/Write | Next Identifier Is Lock/Read]
End Select
Case "Open"
If State_ = NoContext Then State_ = [Next Keyword Is For]
Case "Option"
If State_ = NoContext Then State_ = OptionContext
Case "On"
If State_ = NoContext Then State_ = OnContext
Case "Shared"
Select Case State_
Case [Next Keyword Is As or Shared | Next Identifier Is Access], _
[Next Keyword Is Access/Write | Next Identifier Is Access/Read], _
[Next Keyword Is Lock, As, or Shared]
State_ = [Next Keyword Is As]
End Select
Case "To"
If State_ = ForNextContext Then State_ = ForToContext
Case "Write"
Select Case State_
Case [Next Keyword Is Access/Write | Next Identifier Is Access/Read], _
[Next Keyword Is Access/Write, Lock, As, or Shared]
State_ = [Next Keyword Is Lock, As, or Shared]
Case [Next Keyword Is Lock/Write | Next Identifier Is Lock/Read], _
[Next Keyword Is Lock/Write or As]
State_ = [Next Keyword Is As]
End Select
End Select
End If
Case tkIdentifier
Downgrade_ = False
WasAs_ = False
Select Case State_
Case NoContext
Select Case Token.Text
Case "Line"
Set NextToken_ = Scanner.GetToken
Upgrade = NextToken_.Kind = tkKeyword And NextToken_.Text = "Input"
Case "Name"
Set NextToken_ = Scanner.GetToken
Upgrade = Right(NextToken_.Text, 1) <> "="
Case "Reset"
Set NextToken_ = Scanner.GetToken
Upgrade = IsEndOfContext(NextToken_)
Case "Width"
Set NextToken_ = Scanner.GetToken
Upgrade = NextToken_.Kind = tkFileHandle
End Select
Case OptionContext
Upgrade = Token.Text = "Base"
If Not Upgrade Then Upgrade = Token.Text = "Explicit"
If Not Upgrade Then
Upgrade = Token.Text = "Compare"
If Upgrade Then State_ = OptionCompareContext
End If
Case OptionCompareContext
Upgrade = Token.Text = "Binary"
If Not Upgrade Then Upgrade = Token.Text = "Text"
Case DeclareContext
Upgrade = Token.Text = "PtrSafe"
If Upgrade Then
State_ = DeclareLibContext
Else
Upgrade = Token.Text = "Lib"
If Upgrade Then State_ = DeclareAliasContext
End If
Case DeclareLibContext
Upgrade = Token.Text = "Lib"
If Upgrade Then State_ = DeclareAliasContext
Case DeclareAliasContext
Upgrade = Token.Text = "Alias"
Revoke = True
Case ForToContext
Upgrade = Token.Text = "Step"
Revoke = True
Case OnContext
Upgrade = Token.Text = "Error"
Revoke = True
Case [Next Keyword Is Input | Next Identifier Is Append, Binary, Output, or Random]
Upgrade = Token.Text = "Append"
If Not Upgrade Then Upgrade = Token.Text = "Binary"
If Not Upgrade Then Upgrade = Token.Text = "Output"
If Not Upgrade Then Upgrade = Token.Text = "Random"
State_ = [Next Keyword Is As or Shared | Next Identifier Is Access]
Case [Next Keyword Is As or Shared | Next Identifier Is Access]
Upgrade = Token.Text = "Access"
If Upgrade Then
State_ = [Next Keyword Is Access/Write | Next Identifier Is Access/Read]
Else
Upgrade = Token.Text = "Shared"
If Upgrade Then State_ = [Next Keyword Is As]
End If
Case [Next Keyword Is Access/Write, Lock, As, or Shared], _
[Next Keyword Is Lock, As, or Shared]
Upgrade = Token.Text = "Shared"
If Upgrade Then State_ = [Next Keyword Is As]
Case [Next Keyword Is Access/Write | Next Identifier Is Access/Read]
Upgrade = Token.Text = "Read"
If Upgrade Then State_ = [Next Keyword Is Access/Write, Lock, As, or Shared]
Case [Next Keyword Is Lock/Write | Next Identifier Is Lock/Read]
Upgrade = Token.Text = "Read"
If Upgrade Then State_ = [Next Keyword Is Lock/Write or As]
Case [Next Identifier Is Len]
Upgrade = Token.Text = "Len"
Revoke = True
End Select
Case tkFileHandle
If State_ = [Next Token Is Filehandle] Then State_ = [Next Identifier Is Len]
Case tkSoftLineBreak, tkHardLineBreak
WasAs_ = False
End Select
If Upgrade Then
Token.Kind = tkKeyword
If Revoke Then State_ = NoContext
End If
End If
Set LastToken_ = Token
Set TokenFrom = Token
End Function
Private Function IsEndOfContext(ByVal Token)
Dim Result : Result = Token.Kind = tkSoftLineBreak
If Not Result Then Result = Token.Kind = tkHardLineBreak
If Not Result Then Result = Token.Kind = tkRightParenthesis
If Not Result Then Result = Token.Kind = tkListSeparator
If Not Result Then Result = Token.Kind = tkPrintSeparator
If Not Result Then
If Token.Kind = tkKeyword Then
Result = Token.Text = "Then"
If Not Result Then Result = Token.Text = "Else"
End If
End If
IsEndOfContext = Result
End Function
End Class
Class Scanner
Private File_
Private RunningLine_
Private RunningColumn_
Private FrozenColumn_
Private PreviousColumn_
Private UnChars_
Private Ids_
Private Sub Class_Initialize()
Set Ids_ = CreateObject("Scripting.Dictionary")
Set File_ = Nothing
RunningLine_ = 0
RunningColumn_ = 1
End Sub
Private Sub Class_Terminate()
If Not File_ Is Nothing Then File_.Close
End Sub
Private Function AtEnd()
AtEnd = File_.AtEndOfStream
End Function
Private Property Let Middle(ByRef Text, ByVal Start, ByVal Length, ByVal Value)
Text = Left(Text, Start - 1) & Value & Mid(Text, Start + Length)
End Property
Public Sub Open(ByVal FilePath)
With CreateObject("Scripting.FileSystemObject")
Set File_ = .OpenTextFile(FilePath, ForReading, False, True)
End With
Dim Cp : Cp = GetCodePoint
If Cp <> &HFEFF Then UngetChar ChrW(Cp)
End Sub
Public Function GetToken()
Dim Done, Cp, Ch, Token
If AtEnd Then
Set GetToken = NewToken("", tkEndOfStream, vbNullChar)
Exit Function
End If
Do
Done = True
FrozenColumn_ = RunningColumn_
Cp = GetCodePoint
Ch = ChrW(Cp)
Select Case Ch
Case "["
Set Token = ReadEscapedIdentifier
Case """"
Set Token = ReadString
Case "&"
Set Token = ReadAmpersand
Case "#"
Set Token = ReadHash
Case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"
Set Token = ReadNumber(Ch)
Case "+", "-", "*", "/", "\", "^"
Set Token = NewToken(Ch, tkOperator, vbNullChar)
If Not AtEnd Then
Ch = GetChar
If Ch = "=" Then
Token.Text = Token.Text & "="
Else
UngetChar Ch
End If
End If
Case "=", ".", "!"
Set Token = NewToken(Ch, tkOperator, vbNullChar)
Case "<"
Set Token = NewToken(Ch, tkOperator, vbNullChar)
If Not AtEnd Then
Ch = GetChar
If Ch = ">" Or Ch = "=" Or Ch = "<" Then
Token.Text = Token.Text & Ch
Else
UngetChar Ch
End If
End If
Case ">"
Set Token = NewToken(Ch, tkOperator, vbNullChar)
If Not AtEnd Then
Ch = GetChar
Select Case Ch
Case "="
Token.Text = Token.Text & Ch
Case ">"
Token.Text = Token.Text & Ch
If Not AtEnd Then
Ch = GetChar
If Ch = ">" Then
Token.Text = Token.Text & Ch
Else
UngetChar Ch
End If
End If
Case Else
UngetChar Ch
End Select
End If
Case ":"
Set Token = NewToken(Ch, tkSoftLineBreak, vbNullChar)
If Not AtEnd Then
Ch = GetChar
If Ch = "=" Then
Token.Kind = tkOperator
Token.Text = ":="
Else
UngetChar Ch
End If
End If
Case vbLf
Set Token = NewToken(Ch, tkHardLineBreak, vbNullChar)
Case "'"
Set Token = ReadComment(Ch)
Case ","
Set Token = NewToken(Ch, tkListSeparator, vbNullChar)
Case ";"
Set Token = NewToken(Ch, tkPrintSeparator, vbNullChar)
Case "("
Set Token = NewToken(Ch, tkLeftParenthesis, vbNullChar)
Case ")"
Set Token = NewToken(Ch, tkRightParenthesis, vbNullChar)
Case " "
Set Token = NewToken(Ch, tkWhiteSpace, vbNullChar)
Case vbBack
Set Token = NewToken(Ch, tkLineContinuation, vbNullChar)
Case "`"
Done = False
DiscardComment
Case Else
If Not IsLetter(Cp) Then Fail "Invalid token: '" & Ch & "'"
Set Token = ReadIdentifier(Cp)
If IsKeyword(Token) Then
If StrComp(Token.Text, "Rem", vbTextCompare) = 0 Then Set Token = ReadComment(Token.Text)
ElseIf IsOperator(Token) Then
If Not AtEnd Then
Ch = GetChar
If Ch = "=" Then
Select Case Token.Text
Case "And", "Eqv", "Imp", "Mod", "Or", "Xor"
Token.Text = Token.Text & Ch
Case Else
UngetChar Ch
End Select
Else
UngetChar Ch
End If
End If
Else
Normalize Token
End If
End Select
Loop While Not Done
Set GetToken = Token
End Function
Private Sub Normalize(ByVal Token)
Dim Key
Dim Contextual
For Each Contextual In Array("Access", "Alias", "Append", "Base", "Binary", "Compare", "Error", "Explicit", "Lib", _
"Line", "Name", "Output", "PtrSafe", "Random", "Read", "Reset", "Spc","Step","Tab", "Text", "Width")
If StrComp(Token.Text, Contextual, vbTextCompare) = 0 Then
Token.Text = CStr(Contextual)
Exit Sub
End If
Next
Key = UCase(Token.Text)
If Not Ids_.Exists(Key) Then Ids_.Add Key, Token.Text
Token.Text = Ids_(Key)
End Sub
Private Function NextChar()
Dim Result
If UnChars_ = "" Then
If File_.AtEndOfStream Then Fail "Unexpected end of file"
Result = File_.Read(1)
Else
Result = Left(UnChars_, 1)
UnChars_ = Mid(UnChars_, 2)
End If
RunningColumn_ = RunningColumn_ + 1
NextChar = Result
End Function
Private Function GetCodePoint()
Dim CheckLF, Ch1, Ch2, Ch3
Ch1 = NextChar
If IsSpace(AscW(Ch1)) Then Ch1 = " "
Select Case Ch1
Case " "
Ch2 = NextChar
If Ch2 = "_" Then
Ch3 = NextChar
Select Case Ch3
Case vbCr
CheckLF = True
AdvanceLine
Ch1 = vbBack
Case vbLf
AdvanceLine
Ch1 = vbBack
Case Else
UngetChar Ch3
UngetChar Ch2
End Select
Else
UngetChar Ch2
End If
Case vbCr
CheckLF = True
Ch1 = vbLf
End Select
If CheckLF Then
Ch2 = NextChar
If Ch2 <> vbLf Then UngetChar Ch2
End If
If Ch1 = vbLf Then AdvanceLine
GetCodePoint = AscW(Ch1)
End Function
Private Function NextCodePoint()
Dim Result : Result = AscW(File_.Read(1))
RunningColumn_ = RunningColumn_ + 1
NextCodePoint = Result
End Function
Private Function GetChar()
Dim Cp : Cp = GetCodePoint
GetChar = ChrW(Cp)
End Function
Private Sub AdvanceLine()
RunningLine_ = RunningLine_ + 1
PreviousColumn_ = RunningColumn_
RunningColumn_ = 1
End Sub
Private Sub UngetChar(ByVal Character)
If VarType(Character) <> vbString Then Character = ChrW(Character)
If Character = vbLf Or Character = vbBack Then
RunningLine_ = RunningLine_ - 1
RunningColumn_ = PreviousColumn_
End If
RunningColumn_ = RunningColumn_ - 1
If Character = vbBack Then
RunningColumn_ = RunningColumn_ - 1
Character = " _" & vbNewLine
End If
UnChars_ = Character & UnChars_
End Sub
Private Sub Fail(ByVal Msg)
Err.Raise vbObjectError + 13, "Scanner", "(" & RunningLine_ & ", " & FrozenColumn_ & ") " & Msg
End Sub
Private Function ReadIdentifier(ByVal CodePoint)
Const MAX_LENGTH = 255
Dim IsOK, Cp, Count, Ch, Suffix, Buffer
Suffix = vbNullChar
Buffer = String(MAX_LENGTH, vbNullChar)
Count = 1
Middle(Buffer, Count, 1) = ChrW(CodePoint)
Do While Not AtEnd
Cp = GetCodePoint
Ch = ChrW(Cp)
IsOK = Ch = "_"
If Not IsOK Then IsOK = Ch >= "0" And Ch <= "9"
If Not IsOK Then IsOK = IsLetter(Cp)
If Not IsOK Then IsOK = IsSurrogate(Cp)
If Not IsOK Then Exit Do
Count = Count + 1
If Count > MAX_LENGTH Then Fail "Identifier too long"
Middle(Buffer, Count, 1) = Ch
Loop
Select Case Ch
Case "%", "&", "^", "@", "!", "#", "$"
Suffix = Ch
Case Else
UngetChar Ch
End Select
Set ReadIdentifier = NewToken(Left(Buffer, Count), tkIdentifier, Suffix)
End Function
Public Function IsLetter(ByVal CodePoint)
If CodePoint >= -32768 And CodePoint <= -24645 Or _
CodePoint >= -24576 And CodePoint <= -23412 Or _
CodePoint >= -22761 And CodePoint <= -22758 Or _
CodePoint >= -22528 And CodePoint <= -22527 Or _
CodePoint >= -22525 And CodePoint <= -22523 Or _
CodePoint >= -22521 And CodePoint <= -22518 Or _
CodePoint >= -22516 And CodePoint <= -22494 Or _
CodePoint >= -22464 And CodePoint <= -22413 Or _
CodePoint >= -21504 And CodePoint <= -10333 Or _
CodePoint >= -1792 And CodePoint <= -1491 Or _
CodePoint >= -1488 And CodePoint <= -1430 Or _
CodePoint >= -1424 And CodePoint <= -1319 Or _
CodePoint >= -1280 And CodePoint <= -1274 Or _
CodePoint >= -1261 And CodePoint <= -1257 Or _
CodePoint = -1251 Or _
CodePoint >= -1249 And CodePoint <= -1240 Or _
CodePoint >= -1238 And CodePoint <= -1226 Or _
CodePoint >= -1224 And CodePoint <= -1220 Or _
CodePoint = -1218 Or _
CodePoint = -1216 Or _
CodePoint = -1215 Or _
CodePoint = -1213 Or _
CodePoint = -1212 Or _
CodePoint >= -1210 And CodePoint <= -1103 Or _
CodePoint = -1069 Or _
CodePoint >= -1068 And CodePoint <= -707 Or _
CodePoint >= -688 And CodePoint <= -625 Or _
CodePoint >= -622 And CodePoint <= -569 Or _
CodePoint >= -528 And CodePoint <= -517 Or _
CodePoint >= -400 And CodePoint <= -396 Or _
CodePoint >= -394 And CodePoint <= -260 Or _
CodePoint >= -223 And CodePoint <= -198 Or _
CodePoint >= -191 And CodePoint <= -166 Or _
CodePoint >= -154 And CodePoint <= -66 Or _
CodePoint >= -62 And CodePoint <= -57 Or _
CodePoint >= -54 And CodePoint <= -49 Or _
CodePoint >= -46 And CodePoint <= -41 Or _
CodePoint >= -38 And CodePoint <= -36 Or _
CodePoint >= 65 And CodePoint <= 90 Or _
CodePoint >= 97 And CodePoint <= 122 Or _
CodePoint = 170 Or _
CodePoint = 181 Or _
CodePoint = 186 Or _
CodePoint >= 192 And CodePoint <= 214 Or _
CodePoint >= 216 And CodePoint <= 246 Or _
CodePoint >= 248 And CodePoint <= 705 Or _
CodePoint >= 710 And CodePoint <= 721 Or _
CodePoint >= 736 And CodePoint <= 740 Or _
CodePoint = 750 Or _
CodePoint >= 890 And CodePoint <= 893 Or _
CodePoint = 902 Or _
CodePoint >= 904 And CodePoint <= 906 Or _
CodePoint = 908 Or _
CodePoint >= 910 And CodePoint <= 929 Or _
CodePoint >= 931 And CodePoint <= 974 Or _
CodePoint >= 976 And CodePoint <= 1013 Or _
CodePoint >= 1015 And CodePoint <= 1153 Or _
CodePoint >= 1162 And CodePoint <= 1299 Or _
CodePoint >= 1329 And CodePoint <= 1366 Or _
CodePoint = 1369 Or _
CodePoint >= 1377 And CodePoint <= 1415 Or _
CodePoint >= 1488 And CodePoint <= 1514 Or _
CodePoint >= 1520 And CodePoint <= 1522 Or _
CodePoint >= 1569 And CodePoint <= 1594 Or _
CodePoint >= 1600 And CodePoint <= 1610 Or _
CodePoint = 1646 Or _
CodePoint = 1647 Or _
CodePoint >= 1649 And CodePoint <= 1747 Or _
CodePoint = 1749 Or _
CodePoint = 1765 Or _
CodePoint = 1766 Or _
CodePoint = 1774 Or _
CodePoint = 1775 Or _
CodePoint >= 1786 And CodePoint <= 1788 Or _
CodePoint = 1791 Or _
CodePoint = 1808 Or _
CodePoint >= 1810 And CodePoint <= 1839 Or _
CodePoint >= 1869 And CodePoint <= 1901 Or _
CodePoint >= 1920 And CodePoint <= 1957 Or _
CodePoint = 1969 Or _
CodePoint >= 1994 And CodePoint <= 2026 Or _
CodePoint = 2036 Or _
CodePoint = 2037 Or _
CodePoint = 2042 Then
IsLetter = True
End If
End Function
Private Function IsKeyword(ByVal Token)
Dim Keyword
If Token.Suffix <> vbNullChar Then Exit Function
For Each Keyword In Array("Any", "As", "Attribute", "Boolean", "ByRef", "ByVal", "Byte", "Call", "Case", _
"CDecl", "Circle", "Class", "Close", "Const", "Continue", "Currency", "Date", "Debug", "Declare", "Decimal", _
"Default", "DefBool", "DefByte", "DefCur", "DefDate", "DefDbl", "DefDec", "DefInt", "DefLng", "DefLngLng", _
"DefLngPtr", "DefObj", "DefSng", "DefStr", "DefVar", "Dim", "Do", "Double", "Each", "ElseIf", "Else", _
"Empty", "EndIf", "End", "EndIf", "Enum", "Event", "Exit", "False", "For", "Friend", "Function", "Get", _
"Global", "GoSub", "GoTo", "If", "Implements", "In", "Input", "Integer", "Let", "Like", "Local", "Long", _
"LongPtr", "LongLong", "Loop", "LSet", "Len", "Me", "Module", "Next", "Nothing", "Null", "On", "Open", _
"Option", "Optional", "ParamArray", "PSet", "Preserve", "Print", "Private", "Public", "Put", _
"RaiseEvent", "ReDim", "Rem", "Resume", "Return", "RSet", "Seek", "Select", "Set", "Scale", "Single", _
"Static", "Stop", "String", "Sub", "Then", "To", "True", "Type", "Unlock", "Until", "Variant", "Wend", _
"While", "With", "WithEvents", "Write")
If StrComp(Token.Text, Keyword, vbTextCompare) = 0 Then
Token.Text = CStr(Keyword)
Token.Kind = tkKeyword
IsKeyword = True
Exit For
End If
Next
End Function
Private Function IsOperator(ByVal Token)
Dim Keyword
If Token.Suffix <> vbNullChar Then Exit Function
For Each Keyword In Array("AddressOf", "And", "AndAlso", "Eqv", "Imp", "Is", "IsNot", "Mod", "New", "Not", _
"Or", "OrElse", "TypeOf", "Xor")
If StrComp(Token.Text, Keyword, vbTextCompare) = 0 Then
Token.Text = CStr(Keyword)
Token.Kind = tkOperator
IsOperator = True
Exit For
End If
Next
End Function
Private Function ReadEscapedIdentifier()
Const MAX_LENGTH = 255
Dim Cp, Count, Suffix, Buffer
Suffix = vbNullChar
Buffer = String(MAX_LENGTH, vbNullChar)
Do While Not AtEnd
Cp = GetCodePoint
If Cp = AscW("]") Then Exit Do
If Cp = 10 Then Fail "Invalid identifier"
Count = Count + 1
If Count > MAX_LENGTH Then Fail "Identifier too long"
Middle(Buffer, Count, 1) = ChrW(Cp)
Loop
If Not AtEnd Then
Suffix = GetChar
Select Case Suffix
Case "%", "&", "^", "@", "!", "#", "$"
Rem OK
Case Else
UngetChar Suffix
Suffix = vbNullChar
End Select
End If
Set ReadEscapedIdentifier = NewToken(Left(Buffer, Count), tkEscapedIdentifier, Suffix)
End Function
Private Function ReadString()
Const MAX_LENGTH = 1013
Dim Count, Ch, Buffer
Buffer = String(MAX_LENGTH, vbNullChar)
Do
If Count = MAX_LENGTH Then Fail "String too long"
If AtEnd Then
Ch = vbLf
Else
Ch = GetChar
End If
Select Case Ch
Case """"
If AtEnd Then Exit Do
Ch = GetChar
If Ch = """" Then
Count = Append(Count, Buffer, Ch)
Else
Rem We read too much. Let's put it "back".
UngetChar Ch
Exit Do
End If
Case vbLf
Fail "Unclosed string"
Case Else
Count = Append(Count, Buffer, Ch)
End Select
Loop
Set ReadString = NewToken(Left(Buffer, Count), tkString, vbNullChar)
End Function
Private Function Append(ByVal Count, ByRef Buffer, ByVal Ch)
Count = Count + 1
Middle(Buffer, Count, 1) = Ch
Append = Count
End Function
Private Function ReadInteger(ByVal FirstDigit)
Const MAX_LENGTH = 29
Dim Cp, Count, Ch, Suffix, Buffer
Suffix = vbNullChar
Buffer = String(MAX_LENGTH, vbNullChar)
If FirstDigit >= "0" And FirstDigit <= "9" Then
Count = 1
Middle(Buffer, Count, 1) = FirstDigit
End If
Do Until AtEnd
If Count = MAX_LENGTH Then Fail "Literal too long"
Cp = GetCodePoint
Ch = ChrW(Cp)
Select Case Ch
Case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"
Count = Count + 1
Middle(Buffer, Count, 1) = Ch
Case "%", "&", "^", "@", "!", "#"
Suffix = Ch
Exit Do
Case "_"
Rem We'll ignore it
Case Else
UngetChar Ch
Exit Do
End Select
Loop
Set ReadInteger = NewToken(Left(Buffer, Count), tkIntegerNumber, Suffix)
End Function
Private Function ReadFloat(ByVal FirstDigit)
Dim Ch, Result, FracPart
Set Result = ReadInteger(FirstDigit)
If Result.Suffix = vbNullChar Then
If Not AtEnd Then
Ch = GetChar
If Ch = "." Then
Set FracPart = ReadInteger("")
If FracPart.Text = "" Then Fail "Invalid literal"
Result.Text = Result.Text & "." & FracPart.Text
Result.Kind = tkFloatNumber
Result.Suffix = FracPart.Suffix
Else
UngetChar Ch
End If
End If
End If
Set ReadFloat = Result
End Function
Private Function ReadNumber(ByVal FirstDigit)
Dim Ch, Sg, Result, ExpPart
Set Result = ReadFloat(FirstDigit)
If Result.Suffix = vbNullChar Then
If Not AtEnd Then
Ch = GetChar
Select Case Ch
Case "e", "E"
If AtEnd Then
UngetChar Ch
Else
Sg = GetChar
If Sg = "-" Or Sg = "+" Then
Ch = ""
Else
Ch = Sg
Sg = "+"
End If
Set ExpPart = ReadInteger(Ch)
If ExpPart.Text = "" Or ExpPart.Suffix <> vbNullChar Then Fail "Invalid literal"
Result.Text = Result.Text & "E" & Sg & ExpPart.Text
Result.Kind = tkSciNumber
End If
Case Else
UngetChar Ch
End Select
End If
End If
Set ReadNumber = Result
End Function
Private Function ReadAmpersand()
Dim Ch : Ch = GetChar
Select Case Ch
Case "b", "B"
Set ReadAmpersand = ReadBin
Case "o", "O"
Set ReadAmpersand = ReadOctal
Case "h", "H"
Set ReadAmpersand = ReadHexa
Case "="
Set ReadAmpersand = NewToken("&=", tkOperator, vbNullChar)
Case Else
UngetChar Ch
Set ReadAmpersand = NewToken("&", tkOperator, vbNullChar)
End Select
End Function
Private Function ReadBin()
Const MAX_LENGTH = 96
Dim Count, Ch, Suffix, Buffer
Suffix = vbNullChar
Buffer = String(MAX_LENGTH, vbNullChar)
Do While Not AtEnd
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar
Select Case Ch
Case "0", "1"
Count = Count + 1
Middle(Buffer, Count, 1) = Ch
Case "%", "&", "^", "@", "!", "#"
Suffix = Ch
Exit Do
Case "_"
Rem We'll ignore it
Case Else
Exit Do
End Select
Loop
If Count = 0 Then Fail "Invalid literal"
Set ReadBin = NewToken(Left(Buffer, Count), tkBinaryNumber, Suffix)
End Function
Private Function ReadOctal()
Const MAX_LENGTH = 32
Dim Count, Ch, Suffix, Buffer
Suffix = vbNullChar
Buffer = String(MAX_LENGTH, vbNullChar)
Do While Not AtEnd
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar
Select Case Ch
Case "0", "1", "2", "3", "4", "5", "6", "7"
Count = Count + 1
Middle(Buffer, Count, 1) = Ch
Case "%", "&", "^", "@", "!", "#"
Suffix = Ch
Exit Do
Case "_"
Rem We'll ignore it
Case Else
Exit Do
End Select
Loop
If Count = 0 Then Fail "Invalid literal"
Set ReadOctal = NewToken(Left(Buffer, Count), tkOctalNumber, Suffix)
End Function
Private Function ReadHexa()
Const MAX_LENGTH = 24
Dim Count, Ch, Suffix, Buffer
Suffix = vbNullChar
Buffer = String(MAX_LENGTH, vbNullChar)
Do While Not AtEnd
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar
Select Case Ch
Case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", _
"a", "b", "c", "d", "e", "f", _
"A", "B", "C", "D", "E", "F"
Count = Count + 1
Middle(Buffer, Count, 1) = Ch
Case "%", "&", "^", "@", "!", "#"
Suffix = Ch
Exit Do
Case "_"
Rem We'll ignore it
Case Else
UngetChar Ch
Exit Do
End Select
Loop
If Count = 0 Then Fail "Invalid literal"
Set ReadHexa = NewToken(Left(Buffer, Count), tkHexaNumber, Suffix)
End Function
Private Function ReadHash()
Const Msg = "Invalid literal"
Dim Cp, Number, Name, Ch, Token
Rem Let's get the first number.
Set Token = ReadInteger("")
If Token.Text = "" Then
Rem Maybe we have a month name?
Name = ReadMonthName
Select Case UCase(Name)
Case "IF", "ELSEIF", "ELSE", "END", "CONST"
Rem Not a month name, we have a compiler directive instead.
Set ReadHash = NewToken(Name, tkDirective, vbNullChar)
Exit Function
Case ""
Fail Msg
Case Else
Number = ConvertNameToNumber(Name)
If Number = 0 Then
Rem Not a month name, we have a variable filehandle instead.
Rem Hopefully this variable is not named Feb, for instance, otherwise we'll get tricked...
Set ReadHash = NewToken(Name, tkFileHandle, vbNullChar)
Exit Function
End If
Token.Text = CStr(Number)
End Select
End If
Rem Let's get the first separator.
Cp = GetCodePoint
Ch = ChrW(Cp)
If IsLetter(Cp) Or Ch = "," Then
Rem We have a numeric filehandle
UngetChar Ch
Token.Kind = tkFileHandle
Set ReadHash = Token
Exit Function
End If
If Ch = ":" Then
Rem We are reading a time literal.
Name = ReadTime(Token.Text)
Rem Date literal must end with a '#'.
Ch = GetChar
If Ch <> "#" Then Fail Msg
Name = "1899-12-30 " & Name
Set ReadHash = NewToken(Name, tkDateTime, vbNullChar)
Exit Function
End If
Rem We'll suppose it is a valid separator.
On Error Resume Next
Name = ReadDate(Token.Text, Ch)
If Err.Number Then
Rem It is not a date, but a numeric file handle
Rem TODO: Can ReadDate scan more than one character?
On Error GoTo 0
UngetChar Ch
Token.Kind = tkFileHandle
Set ReadHash = Token
Exit Function
End If
On Error GoTo 0
Ch = GetChar
Select Case Ch
Case " "
Rem We may have a date and time literal together.
Set ReadHash = NewToken(ReadTime(""), tkDateTime, vbNullChar)
If ReadHash.Text = "" Then Fail Msg
ReadHash.Text = Name & " " & ReadHash.Text
Ch = GetChar
If Ch <> "#" Then Fail Msg
Case "#"
Rem Literal does not have a time part. Let's add it.
Set ReadHash = NewToken(Name & " 00:00:00", tkDateTime, vbNullChar)
Case Else
Fail Msg
End Select
End Function
Private Function ReadDate(ByVal FirstNumber, ByVal Separator)
Const Msg = "Invalid literal"
Dim YYYY, MM, DD, Result, SecondNumber, ThirdNumber, Ch
Set SecondNumber = ReadInteger("")
If SecondNumber.Text = "" Then Fail Msg
Rem The next separator must match the first one.
Ch = GetChar
If Ch <> Separator Then Fail Msg
Set ThirdNumber = ReadInteger("")
If ThirdNumber.Text = "" Then Fail Msg
If CInt(FirstNumber) >= 100 And Separator = "-" Then
YYYY = CInt(FirstNumber)
MM = CInt(SecondNumber.Text)
DD = CInt(ThirdNumber.Text)
Else
MM = CInt(FirstNumber)
DD = CInt(SecondNumber.Text)
YYYY = CInt(ThirdNumber.Text)
If YYYY < 100 Then
YYYY = YYYY + 1900
If YYYY < 1950 Then YYYY = YYYY + 100
End If
End If
Rem Validate year.
If YYYY > 9999 Then Fail Msg
Rem Validate month.
If MM < 1 Or MM > 12 Then Fail Msg
Rem Validate day.
Select Case MM
Case 4, 6, 9, 11
If DD > 30 Then Fail Msg
Case 2
If YYYY Mod 4 = 0 And YYYY Mod 100 <> 0 Or YYYY Mod 400 = 0 Then
If DD > 29 Then Fail Msg
Else
If DD > 28 Then Fail Msg
End If
Case Else
If DD > 31 Then Fail Msg
End Select
Rem Put it together in YYYY-MM-DD format.
If YYYY < 1000 Then Result = "0"
If YYYY < 100 Then Result = Result & "0"
If YYYY < 10 Then Result = Result & "0"
Result = Result & CStr(YYYY)
Result = Result & "-"
If MM < 10 Then Result = Result & "0"
Result = Result & CStr(MM)
Result = Result & "-"
If DD < 10 Then Result = Result & "0"
Result = Result & CStr(DD)
ReadDate = Result
End Function
Private Function ReadTime(ByVal FirstNumber)
Const Msg = "Invalid literal"
Dim HH, NN, SS, Number, Ch, Ch2, AP
HH = CInt(FirstNumber)
Number = ReadInteger("")
If Number = "" Then Err.Raise 0
NN = CInt(Number)
Ch = GetChar
If Ch = ":" Then
Number = ReadInteger("")
If Number = "" Then Err.Raise 0
SS = CInt(Number)
Else
UngetChar Ch
SS = 0
End If
If Not AtEnd Then
Ch = GetChar
If Ch = " " Then
If Not AtEnd Then
Ch = GetChar
If Ch = "a" Or Ch = "A" Then
Ch2 = GetChar
If Ch2 = "m" Or Ch2 = "M" Then
AP = "A"
Else
UngetChar Ch2
UngetChar Ch
UngetChar " "
End If
ElseIf Ch = "p" Or Ch = "P" Then
Ch2 = GetChar
If Ch2 = "m" Or Ch2 = "M" Then
AP = "P"
Else
UngetChar Ch2
UngetChar Ch
UngetChar " "
End If
Else
UngetChar Ch
UngetChar " "
End If
End If
Else
UngetChar Ch
End If
End If
Rem Validate hour, minute, and second.
If HH < 0 Or HH > 23 Then Err.Raise 0
If NN < 0 Or NN > 59 Then Err.Raise 0
If SS < 0 Or SS > 59 Then Err.Raise 0
If AP = "A" Then
If HH = 12 Then HH = 0
ElseIf AP = "P" Then
If HH <> 12 Then HH = HH + 12
End If
Rem Put it together in HH:NN:SS format.
Number = CStr(SS)
If SS < 10 Then Number = "0" & Number
Number = ":" & Number
Number = CStr(NN) & Number
If NN < 10 Then Number = "0" & Number
Number = ":" & Number
Number = CStr(HH) & Number
If HH < 10 Then Number = "0" & Number
ReadTime = Number
End Function
Private Function ReadMonthName()
Dim Result, Ch, Prv
Do While Not AtEnd
Prv = Ch
Ch = GetChar
Select Case Ch
Case "#", vbLf, ",", ";", ")", " "
UngetChar Ch
Exit Do
Case "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"
Rem We safely can assume we read two characters more than needed.
UngetChar Ch
UngetChar Prv
Result = Left(Result, Len(Result) - 1)
Exit Do
Case Else
Result = Result & Ch
End Select
Loop
ReadMonthName = Result
End Function
Private Function ConvertNameToNumber(ByVal Name)
Dim Count, Result, MonthName, MonthNames
MonthNames = Array("January", "February", "March", "April", "May", "June", _
"July", "August", "September", "October", "November", "December")
For Each MonthName In MonthNames
Count = Count + 1
If StrComp(Name, MonthName, vbTextCompare) = 0 Then Result = Count
If Result = 0 Then: If StrComp(Name, Left(MonthName, 3), vbTextCompare) = 0 Then Result = Count
If Result <> 0 Then Exit For
Next
ConvertNameToNumber = Result
End Function
Private Function IsSpace(ByVal CodePoint)
Const NULL_CHAR = 0
Const VERTICAL_TAB = 9
Const EOM = 25
Const WHITE_SPACE = 32
Const NO_BREAK_SPACE = 160
Const OGHAM_SPACE_MARK = &H1680
Const MONGOLIAN_VOWEL_SEPARATOR = &H180E
Const EN_QUAD = &H2000
Const HAIR_SPACE = &H200A
Const NARROW_NO_BREAK_SPACE = &H202F
Const MEDIUM_MATHEMATICAL_SPACE = &H205F
Const IDEOGRAPHIC_SPACE = &H3000
Select Case CodePoint
Case NULL_CHAR, WHITE_SPACE, VERTICAL_TAB, EOM, NO_BREAK_SPACE, OGHAM_SPACE_MARK, MONGOLIAN_VOWEL_SEPARATOR, _
NARROW_NO_BREAK_SPACE, MEDIUM_MATHEMATICAL_SPACE, IDEOGRAPHIC_SPACE
IsSpace = True
Case Else
IsSpace = CodePoint >= EN_QUAD And CodePoint <= HAIR_SPACE
End Select
End Function
Private Function NewToken(ByVal Text, ByVal Kind, ByVal Suffix)
Set NewToken = New Token
With NewToken
.Text = Text
.Kind = Kind
.Suffix = Suffix
.Line = RunningLine_
.Column = FrozenColumn_
End With
End Function
Private Function ReadComment(ByVal Mark)
Const MAX_LENGTH = 1013
Dim Count, Ch, Buffer
Buffer = String(MAX_LENGTH, vbNullChar)
Count = Len(Mark)
Middle(Buffer, 1, Count) = Mark
Do While Not AtEnd
If Count = MAX_LENGTH Then Fail "Comment too long"
Ch = GetChar
If Ch = vbLf Then Exit Do
Count = Count + 1
Middle(Buffer, Count, 1) = Ch
Loop
Set ReadComment = NewToken(Left(Buffer, Count), tkComment, vbNullChar)
End Function
Private Sub DiscardComment()
Dim Count, Ch
Count = 1
Do While Not AtEnd
Ch = GetChar
Select Case Ch
Case "`"
Count = Count + 1
Case "ยด"
Count = Count - 1
If Count = 0 Then Exit Do
End Select
Loop
End Sub
Rem The first (high) surrogate is a 16-bit code value in the range U+D800 to U+DBFF.
Private Function IsHighSurrogate(ByVal Character)
IsHighSurrogate = Character >= -10240 And Character <= -9217 Or Character >= 55296 And Character <= 56319
End Function
Rem The second (low) surrogate is a 16-bit code value in the range U+DC00 to U+DFFF.
Private Function IsLowSurrogate(ByVal Character)
IsLowSurrogate = Character >= -9216 And Character <= -8193 Or Character >= 56320 And Character <= 57343
End Function
Public Function IsSurrogate(ByVal Character)
IsSurrogate = IsLowSurrogate(Character) Or IsHighSurrogate(Character)
End Function
End Class
Class Token
Public Text
Public Suffix
Public Kind
Public Line
Public Column
Private Sub Class_Initialize()
Text = " "
Suffix = vbNullChar
End Sub
End Class
So far we have been spitting out tokens, but we are not checking if they make sense.
Next week we'll change that.
Andrej Biasic
2020-10-14