Metamorphing Machine I rather be this walking metamorphosis
than having that old formed opinion about everything!

Let's build a transpiler! Part 11

This is the eleventh post in a series of building a transpiler.
You can find the previous ones here.

Last time I said we would update VB6.

There are some features I would like VB6 to have. Compound operators, for instance.
It is boring to write "Count = Count + 1". It is much more ergonomic to do "Count += 1" instead.
Also, VB6 lacks shift operators (<<, >>, and >>>)
Do you remember when we mentioned that nothing prevents us from scanning a binary literal? Now is the time to do it.
Speaking of literals, it would be nice to allow underscores within them to make them more legible.
And we can support Class and Module keywords!

But before we jump to that, you may have noticed that our syntax-coloring script has its flaws.
We made our scanner dumb, so it reads tokens and classifies them without taking into account their context.
As an example, Print is a reserved word, so we are coloring it, but when it is preceded by a dot ("."), it is not VB's Print statement anymore and should not be blued.
I'm holding myself to go after this because it is not Scanner's responsibility to do that. As I said, it is dumb and there's a reason for it.
It will be Parser's responsibility to reclassify it and other tokens in other situations. Parsing is what I have been calling the "next phase."

Another thing is that our script was a kind of a teaser and now we need to go back to a proper typed program.
So, here it is with the changes we did in our script:
(Click here if you want to get past the updated code.)

Public Module Program
Option Explicit
Option Compare Binary

Private Const SPAN_STRING = "<span style='color:brown'>"
Private Const SPAN_KEYWORD = "<span style='color:blue'>"
Private Const SPAN_COMMENT = "<span style='color: green;'>"

Public Enum TokenKind
tkWhiteSpace
tkComment
tkIdentifier
tkEscapedIdentifier
tkKeyword
tkIntegerNumber
tkFloatNumber
tkSciNumber
tkOctalNumber
tkHexaNumber
tkFileHandle
tkString
tkDateTime
tkOperator
tkLeftParenthesis
tkRightParenthesis
tkHardLineBreak
tkSoftLineBreak
tkLineContinuation
tkListSeparator
tkPrintSeparator
tkDirective
tkEndOfStream
End Enum


Public Sub Main()
Dim Nbsp As Boolean
Dim HtmlFile As Integer
Dim Index As Integer
Dim Text As String
Dim FilePath As String
Dim Token As Token
Dim Scanner As Scanner

Rem Ensuring we close the file in case we have an error.
On Error GoTo CloseIt

Rem File path for the source code is passed as a command-line argument.
FilePath = Command$

Set Scanner = New Scanner
Scanner.OpenFile FilePath

Rem Output file will have the same name as the input file, but with an .HTML extension.
Index = InStrRev(FilePath, ".")
If Index <> 0 Then FilePath = Left$(FilePath, Index - 1)

FilePath = FilePath & ".html"
HtmlFile = FreeFile
Open FilePath For Output Access Write As #HtmlFile

Nbsp = True

Do
Set Token = Scanner.GetToken

Select Case Token.Kind
Case tkWhiteSpace
If Nbsp Then
Print #HtmlFile, "&nbsp;&nbsp;";
Else
Print #HtmlFile, " ";
End If

Case tkComment
Print #HtmlFile, SPAN_COMMENT; EncodeHtml(Token.Text); "</span><br>"
Nbsp = True

Case tkIdentifier, tkIntegerNumber, tkFloatNumber, tkSciNumber
Print #HtmlFile, Token.Text;
Nbsp = False

Case tkEscapedIdentifier
Print #HtmlFile, "["; Token.Text; "]";
Nbsp = False

Case tkKeyword
Print #HtmlFile, SPAN_KEYWORD; Token.Text; "</span>";
Nbsp = False

Case tkOctalNumber
Print #HtmlFile, "&amp;O"; Token.Text;

Case tkHexaNumber
Print #HtmlFile, "&amp;H"; UCase$(Token.Text);

Case tkFileHandle
Print #HtmlFile, "#"; Token.Text;

Case tkString
Text = Token.Text
Text = Replace(Token.Text, """", """""")
Text = EncodeHtml(Text)
Print #HtmlFile, SPAN_STRING; """"; Text; """</span>";

Case tkDateTime
Print #HtmlFile, "#"; Token.Text; "#";

Case tkOperator
If Scanner.IsLetter(AscW(Token.Text)) Then
Print #HtmlFile, SPAN_KEYWORD; Token.Text; "</span>";
Else
Print #HtmlFile, EncodeHtml(Token.Text);
End If

Case tkLeftParenthesis, tkRightParenthesis, tkSoftLineBreak, tkListSeparator, tkPrintSeparator
Print #HtmlFile, Token.Text;

Case tkLineContinuation
Print #HtmlFile, "&nbsp;_<br>"
Nbsp = True

Case tkHardLineBreak
Print #HtmlFile, "<br>"
Nbsp = True

Case tkDirective
Print #HtmlFile, "#"; Token.Text;
Nbsp = False

Case tkEndOfStream
Exit Do
End Select

If Token.Suffix <> vbNullChar Then Print #HtmlFile, Token.Suffix;
Loop

CloseIt:
Close #HtmlFile
Rem This is equivalent to a Throw in a Catch.
If Err.Number Then Err.Raise Err.Number
End Sub

Private Function EncodeHtml(ByVal Text As String) As String
Text = Replace(Text, "&", "&amp;")
Text = Replace(Text, "<", "&lt;")
Text = Replace(Text, ">", "&gt;")
EncodeHtml = Text
End Function
End Module

Public Class Token
Public Text As String
Public Suffix As String
Public Kind As TokenKind
Public Line As Long
Public Column As Long

Private Sub Class_Initialize()
Text = " "
Suffix = vbNullChar
End Sub
End Class

Public Class Scanner
Private File_ As Integer
Private RunningLine_ As Long
Private RunningColumn_ As Long
Private FrozenColumn_ As Long
Private PreviousColumn_ As Long

Private Sub Class_Initialize()
RunningLine_ = 0
RunningColumn_ = 1
End Sub

Private Sub Class_Terminate()
If File_ <> 0 Then Close #File_
End Sub

Private Function AtEnd() As Boolean
AtEnd = Seek(File_) > LOF(File_)
End Function

Public Sub OpenFile(ByVal FilePath As String)
Dim Cp As Integer

If Dir(FilePath) = "" Then Err.Raise 53
File_ = FreeFile
Open FilePath For Binary Access Read As #File_
If LOF(File_) = 0 Then Err.Raise 53
Cp = GetCodePoint
If Cp <> &HFEFF Then UngetChar ChrW$(Cp)
End Sub

Public Function GetToken() As Token
Dim Cp As Integer
Dim Ch As String * 1
Dim Token As Token

If AtEnd Then
Set GetToken = NewToken("", tkEndOfStream)
Exit Function
End If

FrozenColumn_ = RunningColumn_
Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "["
Set Token = ReadEscapedIdentifier

Case """"
Set Token = ReadString

Case "&"
Set Token = ReadAmpersand

Case "#"
Set Token = ReadHash

Case "0" To "9"
Set Token = ReadNumber(Ch)

Case "+", "-", "*", "/", "\", "^", "=", ".", "!"
Set Token = NewToken(Ch, tkOperator)

Case "<"
Set Token = NewToken(Ch, tkOperator)
Ch = GetChar

If Ch = ">" Or Ch = "=" Then
Token.Text = Token.Text & Ch
Else
UngetChar Ch
End If

Case ">"
Set Token = NewToken(Ch, tkOperator)
Ch = GetChar

If Ch = "=" Then
Token.Text = Token.Text & "="
Else
UngetChar Ch
End If

Case ":"
Set Token = NewToken(Ch, tkSoftLineBreak)

Case vbLf
Set Token = NewToken(Ch, tkHardLineBreak)

Case "'"
Set Token = ReadComment(Ch)

Case ","
Set Token = NewToken(Ch, tkListSeparator)

Case ";"
Set Token = NewToken(Ch, tkPrintSeparator)

Case "("
Set Token = NewToken(Ch, tkLeftParenthesis)

Case ")"
Set Token = NewToken(Ch, tkRightParenthesis)

Case " "
Set Token = NewToken(Ch, tkComment)

Case vbBack
Set Token = NewToken(Ch, tkLineContinuation)

Case Else
If Not IsLetter(Cp) Then Fail "Invalid token: '" & Ch & "'"

Set Token = ReadIdentifier(Cp)

If IsKeyword(Token.Text) And Token.Suffix = vbNullChar Then
Token.Kind = tkKeyword
If StrComp(Token.Text, "Rem", vbTextCompare) = 0 Then Set Token = ReadComment(Token.Text)

ElseIf IsOperator(Token.Text) And Token.Suffix = vbNullChar Then
Token.Kind = tkOperator
End If
End Select

Set GetToken = Token
End Function

Private Function GetCodePoint() As Integer
Const BS = 8 'Back space. Used for line continuation
Const LF = 10 'Line feed
Const CR = 13 'Carriage return
Const SP = 32 'Space
Const US = 95 'Underscore
Dim CheckLF As Boolean
Dim Cp1 As Integer
Dim Cp2 As Integer
Dim Cp3 As Integer

Cp1 = NextCodePoint
If IsSpace(Cp1) Then Cp1 = SP

Select Case Cp1
Case SP
Cp2 = NextCodePoint

If Cp2 = US Then
Cp3 = NextCodePoint

Select Case Cp3
Case CR
CheckLF = True
AdvanceLine
Cp1 = BS

Case LF
AdvanceLine
Cp1 = BS

Case Else
UngetChar ChrW$(Cp3)
UngetChar ChrW$(Cp2)
End Select
Else
UngetChar ChrW$(Cp2)
End If

Case CR
CheckLF = True
Cp1 = LF
End Select

If CheckLF Then
Cp2 = NextCodePoint
If Cp2 <> LF Then UngetChar ChrW$(Cp2)
End If

If Cp1 = LF Then AdvanceLine
GetCodePoint = Cp1
End Function

Private Function NextCodePoint() As Integer
Dim Result As Integer

Get #File_, , Result
RunningColumn_ = RunningColumn_ + 1
NextCodePoint = Result
End Function

Private Function GetChar() As String
Dim Cp As Integer

Cp = GetCodePoint
GetChar = ToChar(Cp)
End Function

Private Function ToChar(ByVal CodePoint As Integer) As String
Dim Bytes(0 To 1) As Byte

Bytes(0) = CodePoint And &HFF
Bytes(1) = ((CodePoint And &HFF00) \ &H100) And &HFF
ToChar = Bytes
End Function

Private Sub AdvanceLine()
RunningLine_ = RunningLine_ + 1
PreviousColumn_ = RunningColumn_
RunningColumn_ = 1
End Sub

Private Sub UngetChar(ByVal Character As String)
Dim Pos As Long
Dim Length As Long

Length = 2
If Character = vbBack Then Length = (Length + Len(vbNewLine)) * 2

Pos = Seek(File_)
Seek #File_, Pos - Length

Select Case Character
Case vbLf, vbBack
RunningLine_ = RunningLine_ - 1
RunningColumn_ = PreviousColumn_
End Select

RunningColumn_ = RunningColumn_ - IIf(Character = vbBack, 2, 1)
End Sub

Private Sub Fail(ByVal Msg)
Err.Raise vbObjectError + 13, "Scanner", "(" & RunningLine_ & ", " & FrozenColumn_ & ") " & Msg
End Sub

Private Function ReadIdentifier(ByVal CodePoint As Integer) As Token
Const MAX_LENGTH = 255
Dim IsOK As Boolean
Dim Cp As Integer
Dim Count As Integer
Dim Ch As String * 1
Dim Suffix As String * 1
Dim Buffer As String * MAX_LENGTH

Count = 1
Mid$(Buffer, Count, 1) = ChrW(CodePoint)

Do While Not AtEnd
Cp = GetCodePoint
Ch = ToChar(Cp)

IsOK = Ch = "_"
If Not IsOK Then IsOK = Ch >= "0" And Ch <= "9"
If Not IsOK Then IsOK = IsLetter(Cp)
If Not IsOK Then IsOK = IsSurrogate(Cp)
If Not IsOK Then Exit Do

Count = Count + 1
If Count > MAX_LENGTH Then Fail "Identifier too long"
Mid$(Buffer, Count, 1) = Ch
Loop

Select Case Ch
Case "%", "&", "^", "@", "!", "#", "$"
Suffix = Ch

Case Else
UngetChar Ch
End Select

Set ReadIdentifier = NewToken(Left$(Buffer, Count), tkIdentifier, Suffix)
End Function

Public Function IsLetter(ByVal CodePoint As Integer) As Boolean
Select Case CodePoint
Case -32768 To -24645, -24576 To -23412, -22761 To -22758, -22528 To -22527, -22525 To -22523, _
-22521 To -22518, -22516 To -22494, -22464 To -22413, -21504 To -10333, -1792 To -1491, _
-1488 To -1430, -1424 To -1319, -1280 To -1274, -1261 To -1257, -1251, -1249 To -1240, _
-1238 To -1226, -1224 To -1220, -1218, -1216, -1215, -1213, -1212, -1210 To -1103, _
-1069, -1068 To -707, -688 To -625, -622 To -569, -528 To -517, -400 To -396, -394 To -260, _
-223 To -198, -191 To -166, -154 To -66, -62 To -57, -54 To -49, -46 To -41, -38 To -36, _
65 To 90, 97 To 122, 170, 181, 186, 192 To 214, 216 To 246, 248 To 705, 710 To 721, _
736 To 740, 750, 890 To 893, 902, 904 To 906, 908, 910 To 929, 931 To 974, 976 To 1013, _
1015 To 1153, 1162 To 1299, 1329 To 1366, 1369, 1377 To 1415, 1488 To 1514, 1520 To 1522, _
1569 To 1594, 1600 To 1610, 1646, 1647, 1649 To 1747, 1749, 1765, 1766, 1774, 1775, _
1786 To 1788, 1791, 1808, 1810 To 1839, 1869 To 1901, 1920 To 1957, 1969, 1994 To 2026, 2036, _
2037, 2042
IsLetter = True
End Select
End Function

Private Function IsKeyword(ByVal Identifier As String) As Boolean
Dim Keyword As Variant

For Each Keyword In Array("Any", "As", "Attribute", "Boolean", "ByRef", "ByVal", "Byte", "Call", "Case", "CDecl", "Circle", "Close", "Const", _
"Currency", "Date", "Debug", "Decimal", "Declare", "DefBool", "DefByte", "DefCur", "DefDate", "DefDbl", "DefDec", "DefInt", _
"DefLng", "DefObj", "DefSng", "DefStr", "DefVar", "Dim", "Do", "Double", "Each", "ElseIf", "Else", "Empty", "EndIf", _
"End", "EndIf", "Enum", "Erase", "Event", "Exit", "False", "For", "Friend", "Function", "Get", "Global", "GoSub", "GoTo", "If", _
"Implements", "In", "Input", "Integer", "Local", "Lock", "Let", "Like", "Long", "Loop", "LSet", "Len", "Me", "Next", "Nothing", _
"Null", "On", "Open", "Option", "Optional", "ParamArray", "PSet", "Preserve", "Print", "Private", "Public", "Put", _
"RaiseEvent", "ReDim", "Rem", "Resume", "Return", "RSet", "Seek", "Select", "Set", "Scale", "Shared", "Single", _
"Static", "Spc", "Stop", "String", "Sub", "Tab", "Then", "To", "True", "Type", "Unlock", "Until", "Variant", "Wend", "While", "With", _
"WithEvents", "Write")
If StrComp(Identifier, Keyword, vbTextCompare) = 0 Then
IsKeyword = True
Exit For
End If
Next
End Function

Private Function IsOperator(ByVal Identifier As String) As Boolean
Dim Keyword As Variant

For Each Keyword In Array("AddressOf", "And", "Eqv", "Imp", "Is", "Mod", "New", "Not", "Or", "TypeOf", "Xor")
If StrComp(Identifier, Keyword, vbTextCompare) = 0 Then
IsOperator = True
Exit For
End If
Next
End Function

Private Function ReadEscapedIdentifier() As Token
Const MAX_LENGTH = 255
Dim Cp As Integer
Dim Count As Integer
Dim Suffix As String * 1
Dim Buffer As String * MAX_LENGTH

Do While Not AtEnd
Cp = GetCodePoint
If Cp = AscW("]") Then Exit Do
If Cp = 10 Then Fail "Invalid identifier"

Count = Count + 1
If Count > MAX_LENGTH Then Fail "Identifier too long"
Mid$(Buffer, Count, 1) = ToChar(Cp)
Loop

If Not AtEnd Then
Suffix = GetChar

Select Case Suffix
Case "%", "&", "^", "@", "!", "#", "$"
Rem OK

Case Else
UngetChar Suffix
Suffix = vbNullChar
End Select
End If

Set ReadEscapedIdentifier = NewToken(Left$(Buffer, Count), tkEscapedIdentifier, Suffix)
End Function

Private Function ReadString() As Token
Const MAX_LENGTH = 1013
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH

Do
If Count = MAX_LENGTH Then Fail "String too long"

If AtEnd Then
Ch = vbLf
Else
Ch = GetChar
End If

Select Case Ch
Case """"
If AtEnd Then Exit Do
Ch = GetChar

If Ch = """" Then
Count = Append(Count, Buffer, Ch)
Else
Rem We read too much. Let's put it "back".
UngetChar Ch
Exit Do
End If

Case vbLf
Fail "Unclosed string"

Case Else
Count = Append(Count, Buffer, Ch)
End Select
Loop

Set ReadString = NewToken(Left(Buffer, Count), tkString)
End Function

Private Function Append(ByRef Count As Integer, ByRef Buffer As String, ByVal Ch As String) As Integer
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch
Append = Count
End Function

Private Function ReadInteger(Optional ByVal FirstDigit As String) As Token
Const MAX_LENGTH = 29
Dim Cp As Integer
Dim Count As Integer
Dim Ch As String * 1
Dim Suffix As String * 1
Dim Buffer As String * MAX_LENGTH

If FirstDigit >= "0" And FirstDigit <= "9" Then
Count = 1
Mid$(Buffer, Count, 1) = FirstDigit
End If

Do Until AtEnd
If Count = MAX_LENGTH Then Fail "Literal too long"
Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "0" To "9"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case "%", "&", "^", "@", "!", "#"
Suffix = Ch
Exit Do

Case Else
UngetChar Ch
Exit Do
End Select
Loop

Set ReadInteger = NewToken(Left$(Buffer, Count), tkIntegerNumber, Suffix)
End Function

Private Function ReadFloat(ByVal FirstDigit As String) As Token
Dim Ch As String * 1
Dim Result As Token
Dim FracPart As Token

Set Result = ReadInteger(FirstDigit:=FirstDigit)

If Result.Suffix = vbNullChar Then
If Not AtEnd Then
Ch = GetChar

If Ch = "." Then
Set FracPart = ReadInteger
If FracPart.Text = "" Then Fail "Invalid literal"
Result.Text = Result.Text & "." & FracPart.Text
Result.Kind = tkFloatNumber
Result.Suffix = FracPart.Suffix
Else
UngetChar Ch
End If
End If
End If

Set ReadFloat = Result
End Function

Private Function ReadNumber(ByVal FirstDigit As String) As Token
Dim Ch As String * 1
Dim Sg As String * 1
Dim Result As Token
Dim ExpPart As Token

Set Result = ReadFloat(FirstDigit)

If Result.Suffix = vbNullChar Then
If Not AtEnd Then
Ch = GetChar

Select Case Ch
Case "e", "E"
If AtEnd Then
UngetChar Ch
Else
Sg = GetChar

If Sg = "-" Or Sg = "+" Then
Ch = ""
Else
Ch = Sg
Sg = "+"
End If

Set ExpPart = ReadInteger(FirstDigit:=Ch)
If ExpPart.Text = "" Or ExpPart.Suffix <> vbNullChar Then Fail "Invalid literal"
Result.Text = Result.Text & "E" & Sg & ExpPart.Text
Result.Kind = tkSciNumber
End If

Case Else
UngetChar Ch
End Select
End If
End If

Set ReadNumber = Result
End Function

Private Function ReadAmpersand() As Token
Dim Cp As Integer
Dim Ch As String * 1

Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "o", "O"
Set ReadAmpersand = ReadOctal

Case "h", "H"
Set ReadAmpersand = ReadHexa

Case Else
UngetChar Ch
Set ReadAmpersand = NewToken("&", tkOperator)
End Select
End Function

Private Function ReadOctal() As Token
Const MAX_LENGTH = 32
Dim Count As Integer
Dim Ch As String * 1
Dim Suffix As String * 1
Dim Buffer As String * MAX_LENGTH

Do While Not AtEnd
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar

Select Case Ch
Case "0" To "7"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case "%", "&", "^", "@", "!", "#"
Suffix = Ch
Exit Do

Case Else
Exit Do
End Select
Loop

If Count = 0 Then Fail "Invalid literal"
Set ReadOctal = NewToken(Left$(Buffer, Count), tkOctalNumber, Suffix)
End Function

Private Function ReadHexa() As Token
Const MAX_LENGTH = 24
Dim Count As Integer
Dim Ch As String * 1
Dim Suffix As String * 1
Dim Buffer As String * MAX_LENGTH

Do While Not AtEnd
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar

Select Case Ch
Case "0" To "9", "a" To "f", "A" To "F"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case "%", "&", "^", "@", "!", "#"
Suffix = Ch
Exit Do

Case Else
UngetChar Ch
Exit Do
End Select
Loop

If Count = 0 Then Fail "Invalid literal"
Set ReadHexa = NewToken(Left$(Buffer, Count), tkHexaNumber, Suffix)
End Function

Private Function ReadHash() As Token
Const Msg = "Invalid literal"
Dim Cp As Integer
Dim Number As Integer
Dim Name As String
Dim Ch As String * 1
Dim Token As Token

Rem Let's get the first number.
Set Token = ReadInteger

If Token.Text = "" Then
Rem Maybe we have a month name?
Name = ReadMonthName

Select Case UCase$(Name)
Case "IF", "ELSEIF", "ELSE", "END", "CONST"
Rem Not a month name, we have a compiler directive instead.
Set ReadHash = NewToken(Name, tkDirective)
Exit Function

Case ""
Fail Msg

Case Else
Number = ConvertNameToNumber(Name)

If Number = 0 Then
Rem Not a month name, we have a variable file-handle instead.
Rem Hopefully this variable is not named Feb, for instance, otherwise we'll get tricked...
Set ReadHash = NewToken(Name, tkFileHandle)
Exit Function
End If

Token.Text = CStr(Number)
End Select
End If

Rem Let's get the first separator.
Cp = GetCodePoint
Ch = ToChar(Cp)

If IsLetter(Cp) Or Ch = "," Then
Rem We have a numeric file-handle
UngetChar Ch
Token.Kind = tkFileHandle
Set ReadHash = Token
Exit Function
End If

If Ch = ":" Then
Rem We are reading a time literal.
Name = ReadTime(Token.Text)

Rem Date literal must end with a '#'.
Ch = GetChar
If Ch <> "#" Then Fail Msg

Name = "1899-12-30 " & Name
Set ReadHash = NewToken(Name, tkDateTime)
Exit Function
End If

Rem We'll suppose it is a valid separator.
On Error Resume Next
Name = ReadDate(Token.Text, Ch)

If Err.Number Then
Rem It is not a date, but a numeric file handle
On Error GoTo 0
UngetChar Ch
Token.Kind = tkFileHandle
Set ReadHash = Token
Exit Function
End If

On Error GoTo 0
Ch = GetChar

Select Case Ch
Case " "
Rem We may have a date and time literal together.
Set ReadHash = NewToken(ReadTime, tkDateTime)
If ReadHash.Text = "" Then Fail Msg
ReadHash.Text = Name & " " & ReadHash.Text

Ch = GetChar
If Ch <> "#" Then Fail Msg

Case "#"
Rem Literal does not have a time part. Let's add it.
Set ReadHash = NewToken(Name & " 00:00:00", tkDateTime)

Case Else
Fail Msg
End Select
End Function

Private Function ReadDate(ByVal FirstNumber As String, ByVal Separator As String) As String
Const Msg = "Invalid literal"
Dim YYYY As Integer
Dim MM As Integer
Dim DD As Integer
Dim N As Integer
Dim S As String
Dim Result As String
Dim Ch As String * 1
Dim SecondNumber As Token
Dim ThirdNumber As Token

Set SecondNumber = ReadInteger
If SecondNumber.Text = "" Then Fail Msg

Rem The next separator must match the first one.
Ch = GetChar
If Ch <> Separator Then Fail Msg

Set ThirdNumber = ReadInteger
If ThirdNumber.Text = "" Then Fail Msg

If CInt(FirstNumber) >= 100 And Separator = "-" Then
YYYY = CInt(FirstNumber)
MM = CInt(SecondNumber.Text)
DD = CInt(ThirdNumber.Text)
Else
MM = CInt(FirstNumber)
DD = CInt(SecondNumber.Text)
YYYY = CInt(ThirdNumber.Text)

If YYYY < 100 Then
YYYY = YYYY + 1900
If YYYY < 1950 Then YYYY = YYYY + 100
End If
End If

Rem Validate year.
If YYYY > 9999 Then Fail Msg

Rem Validate month.
If MM < 1 Or MM > 12 Then Fail Msg

Rem Validate day.
Select Case MM
Case 4, 6, 9, 11
If DD > 30 Then Fail Msg

Case 2
If YYYY Mod 4 = 0 And YYYY Mod 100 <> 0 Or YYYY Mod 400 = 0 Then
If DD > 29 Then Fail Msg
Else
If DD > 28 Then Fail Msg
End If

Case Else
If DD > 31 Then Fail Msg
End Select

Rem Put it together in YYYY-MM-DD format.
If YYYY < 1000 Then Result = "0"
If YYYY < 100 Then Result = Result & "0"
If YYYY < 10 Then Result = Result & "0"
Result = Result & CStr(YYYY)
Result = Result & "-"

If MM < 10 Then Result = Result & "0"
Result = Result & CStr(MM)
Result = Result & "-"

If DD < 10 Then Result = Result & "0"
Result = Result & CStr(DD)

ReadDate = Result
End Function

Private Function ReadTime(Optional ByVal FirstNumber As String) As String
Dim HH As Integer
Dim NN As Integer
Dim SS As Integer
Dim Number As String
Dim Ch As String * 1
Dim Ch2 As String * 1
Dim AP As String * 1

On Error GoTo GoneWrong
HH = CInt(FirstNumber)
Number = ReadInteger
If Number = "" Then Err.Raise 0
NN = CInt(Number)

Ch = GetChar

If Ch = ":" Then
Number = ReadInteger
If Number = "" Then Err.Raise 0
SS = CInt(Number)
Else
UngetChar Ch
End If

If Not AtEnd Then
Ch = GetChar

If Ch = " " Then
If Not AtEnd Then
Ch = GetChar

If Ch = "a" Or Ch = "A" Then
Ch2 = GetChar

If Ch2 = "m" Or Ch2 = "M" Then
AP = "A"
Else
UngetChar Ch2
UngetChar Ch
UngetChar " "
End If

ElseIf Ch = "p" Or Ch = "P" Then
Ch2 = GetChar

If Ch2 = "m" Or Ch2 = "M" Then
AP = "P"
Else
UngetChar Ch2
UngetChar Ch
UngetChar " "
End If

Else
UngetChar Ch
UngetChar " "
End If
End If
Else
UngetChar Ch
End If
End If

Rem Validate hour, minute, and second.
If HH < 0 Or HH > 23 Then Err.Raise 0
If NN < 0 Or NN > 59 Then Err.Raise 0
If SS < 0 Or SS > 59 Then Err.Raise 0

If AP = "A" Then
If HH = 12 Then HH = 0

ElseIf AP = "P" Then
If HH <> 12 Then HH = HH + 12
End If

Rem Put it together in HH:NN:SS format.
Number = CStr(SS)
If SS < 10 Then Number = "0" & Number
Number = ":" & Number

Number = CStr(NN) & Number
If NN < 10 Then Number = "0" & Number

Number = ":" & Number
Number = CStr(HH) & Number
If HH < 10 Then Number = "0" & Number

ReadTime = Number
Exit Function

GoneWrong:
Fail "Invalid literal"
End Function

Private Function ReadMonthName() As String
Dim Result As String
Dim Ch As String * 1
Dim Prv As String * 1

Do While Not AtEnd
Prv = Ch
Ch = GetChar

Select Case Ch
Case "#", vbLf, ",", ";", ")", " "
UngetChar Ch
Exit Do

Case "0" To "9"
Rem We safely can assume we read two characters more than needed.
UngetChar Ch
UngetChar Prv
Result = Left$(Result, Len(Result) - 1)
Exit Do

Case Else
Result = Result & Ch
End Select
Loop

ReadMonthName = Result
End Function

Private Function ConvertNameToNumber(ByVal Name As String) As Integer
Dim Count As Integer
Dim Result As Integer
Dim MonthName As Variant
Dim MonthNames As Variant

MonthNames = Array("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")

For Each MonthName In MonthNames
Count = Count + 1

If StrComp(Name, MonthName, vbTextCompare) = 0 Then Result = Count
If Result = 0 Then: If StrComp(Name, Left$(MonthName, 3), vbTextCompare) = 0 Then Result = Count
If Result <> 0 Then Exit For
Next

ConvertNameToNumber = Result
End Function

Private Function IsSpace(ByVal CodePoint As Long) As Boolean
Const NULL_CHAR = 0
Const VERTICAL_TAB = 9
Const EOM = 25
Const WHITE_SPACE = 32
Const NO_BREAK_SPACE = 160
Const OGHAM_SPACE_MARK = &H1680
Const MONGOLIAN_VOWEL_SEPARATOR = &H180E
Const EN_QUAD = &H2000
Const HAIR_SPACE = &H200A
Const NARROW_NO_BREAK_SPACE = &H202F
Const MEDIUM_MATHEMATICAL_SPACE = &H205F
Const IDEOGRAPHIC_SPACE = &H3000

Select Case CodePoint
Case NULL_CHAR, WHITE_SPACE, VERTICAL_TAB, EOM, NO_BREAK_SPACE, OGHAM_SPACE_MARK, MONGOLIAN_VOWEL_SEPARATOR, _
NARROW_NO_BREAK_SPACE, MEDIUM_MATHEMATICAL_SPACE, IDEOGRAPHIC_SPACE, EN_QUAD To HAIR_SPACE
IsSpace = True
End Select
End Function

Private Function NewToken(ByVal Text As String, ByVal Kind As TokenKind, Optional ByVal Suffix As String = vbNullChar) As Token
Set NewToken = New Token

With NewToken
.Text = Text
.Kind = Kind
.Suffix = Suffix
.Line = RunningLine_
.Column = FrozenColumn_
End With
End Function

Private Function ReadComment(ByVal Mark As String) As Token
Const MAX_LENGTH = 1013
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH

Count = Len(Mark)
Mid$(Buffer, 1, Count) = Mark

Do While Not AtEnd
If Count = MAX_LENGTH Then Fail "Comment too long"
Ch = GetChar
If Ch = vbLf Then Exit Do

Count = Count + 1
Mid$(Buffer, Count, 1) = Ch
Loop

Set ReadComment = NewToken(Left$(Buffer, Count), tkWhiteSpace, vbNullChar)
End Function

Rem The first (high) surrogate is a 16-bit code value in the range U+D800 to U+DBFF.
Private Function IsHighSurrogate(ByVal Character As Integer) As Boolean
IsHighSurrogate = Character >= -10240 And Character <= -9217 Or Character >= 55296 And Character <= 56319
End Function

Rem The second (low) surrogate is a 16-bit code value in the range U+DC00 to U+DFFF.
Private Function IsLowSurrogate(ByVal Character As Integer) As Boolean
IsLowSurrogate = Character >= -9216 And Character <= -8193 Or Character >= 56320 And Character <= 57343
End Function

Public Function IsSurrogate(ByVal Character As Integer) As Boolean
IsSurrogate = IsLowSurrogate(Character) Or IsHighSurrogate(Character)
End Function
End Class

OK, with that out of our way, let's dig into the fun part!
Add a tkBinaryNumber to TokenKind enum and the function below to Scanner class.
Let's scan those binary number literals!

Public Enum TokenKind
(...)
tkSciNumber
tkBinaryNumber
tkOctalNumber
(...)
End Enum


Private Function ReadBin() As Token
Const MAX_LENGTH = 96
Dim Count As Integer
Dim Ch As String * 1
Dim Suffix As String * 1
Dim Buffer As String * MAX_LENGTH

Do While Not AtEnd
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar

Select Case Ch
Case "0", "1"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case "%", "&", "^", "@", "!", "#"
Suffix = Ch
Exit Do

Case "_"
Rem We'll ignore it

Case Else
Exit Do
End Select
Loop

If Count = 0 Then Fail "Invalid literal"
Set ReadBin = NewToken(Left$(Buffer, Count), tkBinaryNumber, Suffix)
End Function

Copy that line that ignores underscores and its comment and paste them in their proper places within ReadOctal, ReadHexa, and ReadInteger functions.

Now we need to change ReadAmpersand to take into account literals like &B1011.
As we will work on ReadAmpersand, let's make it understands operator "&=", too:
(New or changed code are highlighted below.)

Private Function ReadAmpersand() As Token
Dim Ch As String * 1

Ch = GetChar

Select Case Ch
Case "b", "B"
Set ReadAmpersand = ReadBin

Case "o", "O"
Set ReadAmpersand = ReadOctal

Case "h", "H"
Set ReadAmpersand = ReadHexa

Case "="
Set ReadAmpersand = NewToken("&=", tkOperator)

Case Else
UngetChar Ch
Set ReadAmpersand = NewToken("&", tkOperator)
End Select
End Function

To scan the other compound operators, we'll need to change GetToken:

Public Function GetToken() As Token
Dim Cp As Integer
Dim Ch As String * 1
Dim Token As Token

If AtEnd Then
Set GetToken = NewToken("", tkEndOfStream)
Exit Function
End If

FrozenColumn_ = RunningColumn_
Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "["
Set Token = ReadEscapedIdentifier

Case """"
Set Token = ReadString

Case "&"
Set Token = ReadAmpersand

Case "#"
Set Token = ReadHash

Case "0" To "9"
Set Token = ReadNumber(Ch)

Case "+", "-", "*", "/", "\", "^"
Set Token = NewToken(Ch, tkOperator)

If Not AtEnd Then
Ch = GetChar

If Ch = "=" Then
Token.Text = Token.Text & "="
Else
UngetChar Ch
End If
End If

Case "=", ".", "!"
Set Token = NewToken(Ch, tkOperator)

Case "<"
Set Token = NewToken(Ch, tkOperator)

If Not AtEnd Then
Ch = GetChar

If Ch = ">" Or Ch = "=" Or Ch = "<" Then
Token.Text = Token.Text & Ch
Else
UngetChar Ch
End If
End If

Case ">"
Set Token = NewToken(Ch, tkOperator)

If Not AtEnd Then
Ch = GetChar

Select Case Ch
Case "="
Token.Text = Token.Text & Ch

Case ">"
Token.Text = Token.Text & Ch

If Not AtEnd Then
Ch = GetChar

If Ch = ">" Then
Token.Text = Token.Text & Ch
Else
UngetChar Ch
End If
End If

Case Else
UngetChar Ch
End Select
End If

Case ":"
Set Token = NewToken(Ch, tkSoftLineBreak)

Case vbLf
Set Token = NewToken(Ch, tkHardLineBreak)

Case "'"
Set Token = ReadComment(Ch)

Case ","
Set Token = NewToken(Ch, tkListSeparator)

Case ";"
Set Token = NewToken(Ch, tkPrintSeparator)

Case "("
Set Token = NewToken(Ch, tkLeftParenthesis)

Case ")"
Set Token = NewToken(Ch, tkRightParenthesis)

Case " "
Set Token = NewToken(Ch, tkWhiteSpace)

Case vbBack
Set Token = NewToken(Ch, tkLineContinuation)

Case Else
If Not IsLetter(Cp) Then Fail "Invalid token: '" & Ch & "'"

Set Token = ReadIdentifier(Cp)

If IsKeyword(Token.Text) And Token.Suffix = vbNullChar Then
Token.Kind = tkKeyword
If StrComp(Token.Text, "Rem", vbTextCompare) = 0 Then Set Token = ReadComment(Token.Text)

ElseIf IsOperator(Token.Text) And Token.Suffix = vbNullChar Then
Token.Kind = tkOperator

If Not AtEnd Then
Ch = GetChar

If Ch = "=" Then
Token.Text = Token.Text & Ch
Else
UngetChar Ch
End If
End If
End If
End Select

Set GetToken = Token
End Function

Finally, we can add some more keywords to our list, not only Class and Module, but also Continue, Default, DefLngLng, DefLngPtr, LongLong, LongPtr, and PtrSafe. While we are at it, we could add three more operators, too: AndAlso, OrElse, and IsNot.
Be warned however that, for now, any code using these new constructs will be "make-believe" code, because they are not supported by VB6 and we are far from dealing properly with them yet.

Regarding introducing two new data types (LongLong and LongPtr), this is the change we need to do:

Rem In IsKeyword:
(...)
"LongPtr", "LongLong", "Loop", "LSet", "Len", "Me", "Module", "Next", "Nothing", "Null", "On", "Open", _
(...)

There is another thing I would like to introduce: Even though we have two ways to include comments, I think they are not enough.
I would like to be able to insert comments anywhere, not only in a line of its own, or at the end of one. Like this:

For Each Keyword In Array("AddressOf", "And", /*"AndAlso",*/ "Eqv", "Imp", "Is", /*"IsNot",*/ "Mod", "New", "Not", _
"Or", /*"OrElse",*/ "TypeOf", "Xor")

But don't worry, I won't be using /* or */.
One problem I see with them is that if you need to comment out several lines, including one that is already commented out, they fall short to the task:

Private Function IsOperator(ByVal Identifier As String) As Boolean
/*Dim Keyword As Variant

For Each Keyword In Array("AddressOf", "And", /*"AndAlso"*/
, "Eqv", "Imp", "Is", /*"IsNot",*/ "Mod", "New", "Not", _
"Or", /*"OrElse",*/ "TypeOf", "Xor")
If StrComp(Identifier, Keyword, vbTextCompare) = 0 Then
IsOperator = True
Exit For
End If
Next*/ '<--OOPS!
End Function

We'll be using a "backtick" ("`") to start a comment and a tick ("´") to end it.
First, let's create a Sub to get rid of this kind of comment:

Private Sub DiscardComment()
Dim Count As Long
Dim Ch As String * 1
Count = 1

Do While Not AtEnd
Ch = GetChar

Select Case Ch
Case "`"
Count = Count + 1

Case "´"
Count = Count - 1
If Count = 0 Then Exit Do
End Select
Loop
End Sub

Now we need to adapt GetToken. While we get this kind of comment, we cannot leave the function until we have an actual token to return.
We'll have a kinda bizarre loop. Here are what need to be added to GetToken:

Public Function GetToken() As Token
Dim Done As Boolean
(...)
Do
Done = True
FrozenColumn_ = RunningColumn_
(...)
Case "`"
Done = False
DiscardComment

Case Else
(...)
Loop While Not Done

Set GetToken = Token
End Function

And this is it.
Next week, we'll deal with the contextual keywords.

Andrej Biasic
2020-09-30