Metamorphing Machine I rather be this walking metamorphosis
than having that old formed opinion about everything!

Let's build a transpiler! Part 4

This is the fourth post in a series of building a transpiler. You can find the previous ones here.

Last time we were left to scan operators.
We'll still do that, but first, let's refactor our code that is in much need of it.
I did not mention before, but all the functions we have been creating are supposed to be inside a module.
VB6 does not support a Module keyword, but I will use it to emphasize we're working on one.

Here is what we will do: Click here if you want to skip the code.

Module Scanner
Option Explicit
Option Compare Text

Private FileHandle_ As Integer


Public Sub Main()
Dim Cp As Integer
Dim Ch As String * 1
Dim Token As String

Rem Get an available file number.
FileHandle_ = FreeFile
Rem File path for the source code is passed as a command-line argument.
Open Command$ For Binary As #FileHandle_
Rem Ensuring we close the file in case we have an error.
On Error GoTo CloseIt

Rem While we do not reach the end of file...
While Not EOF(FileHandle_)
Rem ...read a codepoint from it.
Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "["
Token = ReadEscapedIdentifier
Debug.Print "Escaped identifier: " & Token

Case """"
Token = ReadString
Debug.Print "String: " & Token

Case "&"
Token = ReadAmpersand
Debug.Print "Octal/Hexa: " & Token

Case "#"
Token = ReadDateTime
Debug.Print "Date/time: " & Token

Case "0" To "9"
Token = ReadNumber(Ch)
Debug.Print "Number: " & Token

Case Else
If IsLetter(Cp) Then
Token = ReadIdentifier(Ch)

If IsKeyword(Token) Then
Debug.Print "Keyword: " & Token
Else
Debug.Print "Identifier: " & Token
End If
End If
End Select
Wend

CloseIt:
Close #FileHandle_
Rem This is equivalent to a Throw in a Catch.
If Err.Number Then Err.Raise Err.Number
End Sub

Private Function GetCodePoint() As Integer
Dim Result As Integer
If EOF(FileHandle_) Then Fail
Get #FileHandle_, , Result
GetCodePoint = Result
End Function

Private Function GetChar() As String
Dim Cp As Integer
Dim Result As String * 1
Cp = GetCodePoint
Result = ToChar(Cp)
GetChar = Result
End Function

Private Sub UngetChar(Optional ByVal Times As Integer = 1)
Dim Pos As Long
Pos = Seek(FileHandle_)
Seek #FileHandle_, Pos - 2 * Times
End Sub

Private Sub Fail(Optional ByVal Msg As String)
If Msg = "" Then Msg = "Unexpected end of file"
Err.Raise vbObjectError + 13, , Msg
End Sub

Private Function ReadIdentifier(ByVal CodePoint As Integer) As String
Const MAX_LENGTH = 255
Dim Buffer As String * MAX_LENGTH
Dim IsOK As Boolean
Dim Cp As Integer
Dim Count As Integer

Count = 1
Mid$(Buffer, Count, 1) = ToChar(CodePoint)

Do While Not EOF(FileHandle_)
Cp = GetCodePoint

IsOK = Cp = AscW("_")
If Not IsOK Then IsOK = Cp >= AscW("0") And Cp <= AscW("9")
If Not IsOK Then IsOK = IsLetter(Cp)

If Not IsOK Then
UngetChar
Exit Do
End If

Count = Count + 1
Mid$(Buffer, Count, 1) = ToChar(Cp)
If Count > MAX_LENGTH Then Fail "Identifier too long"
Loop

ReadIdentifier = Left$(Buffer, Count)
End Function


Private Function ToChar(ByVal CodePoint As Integer) As String
Dim Bytes(0 To 1) As Byte
Bytes(0) = CodePoint And &HFF
Bytes(1) = ((CodePoint And &HFF00) \ &H100) And &HFF
ToChar = Bytes
End Function

Private Function IsLetter(ByVal CodePoint As Integer) As Boolean
Select Case CodePoint
Case -32768To -24645, -24576To -23412, -22761To -22758, -22528To -22527, -22525To -22523, _
-22521To -22518, -22516To -22494, -22464To -22413, -21504To -10333, -1792To -1491, _
-1488To -1430, -1424To -1319, -1280To -1274, -1261To -1257, -1251, -1249To -1240, _
-1238To -1226, -1224To -1220, -1218, -1216, -1215, -1213, -1212, -1210To -1103, _
-1069, -1068To -707, -688To -625, -622To -569, -528To -517, -400To -396, -394To -260, _
-223To -198, -191To -166, -154To -66, -62To -57, -54To -49, -46To -41, -38To -36, _
65To 90, 97To 122, 170, 181, 186, 192To 214, 216To 246, 248To 705, 710To 721, _
736To 740, 750, 890To 893, 902, 904To 906, 908, 910To 929, 931To 974, 976To 1013, _
1015To 1153, 1162To 1299, 1329To 1366, 1369, 1377To 1415, 1488To 1514, 1520To 1522, _
1569To 1594, 1600To 1610, 1646, 1647, 1649To 1747, 1749, 1765, 1766, 1774, 1775, _
1786To 1788, 1791, 1808, 1810To 1839, 1869To 1901, 1920To 1957, 1969, 1994To 2026, 2036, _
2037, 2042
IsLetter = True
End Select
End Function

Private Function IsKeyword(ByVal Identifier As String) As Boolean
Select Case Identifier
Case "Any", "As", "Attribute", "Boolean", "ByRef", "ByVal", "Byte", "Call", "Case", "CDecl", "Circle", "Close", "Const", "Currency", "Date", "Debug", "Decimal", "Declare", "DefBool", "DefByte", "DefCur", "DefDate", "DefDbl", "DefDec", "DefInt", "DefLng", "DefObj", "DefSng", "DefStr", "DefVar", "Dim", "Do", "Double", "Each", "ElseIf", "Else", "Empty", "EndIf", "End", "EndIf", "Enum", "Erase", "Event", "Exit", "False", "For", "Friend", "Function", "Get", "Global", "GoSub", "GoTo", "If", "Implements", "In", "Input", "Integer", "Local", "Lock", "Let", "Like", "Local", "Long", "Loop", "LSet", "Len", "Me", "Next", "Nothing", "Null", "On", "Open", "Option", "Optional", "ParamArray", "PSet", "Preserve", "Print", "Private", "Public", "Put", "RaiseEvent", "ReDim", "Rem", "Resume", "Return", "RSet", "Seek", "Select", "Set", "Scale", "Shared", "Single", "Static", "Spc", "Stop", "String", "Sub", "Tab", "Then", "To", "True", "Type", "Unlock", "Until", "Variant", "Wend", "While", "With", "WithEvents", "Write"
IsKeyword = True
End Select
End Function

Private Function IsOperator(ByVal Identifier As String) As Boolean
Select Case Identifier
Case "AddressOf", "And", "Eqv", "Imp", "Is", "Mod", "New", "Not", "Or", "TypeOf", "Xor"
IsOperator = True
End Select
End Function

Private Function ReadEscapedIdentifier() As String
Const MAX_LENGTH = 255
Dim Buffer As String * MAX_LENGTH
Dim Cp As Integer
Dim Count As Integer

Do While Not EOF(FileHandle_)
Cp = GetCodePoint
If Cp = AscW("]") Then Exit Do
If Cp = 10 Or Cp = 13 Then Fail "Invalid identifier"
Count = Count + 1
If Count > MAX_LENGTH Then Fail "Identifier too long"
Mid$(Buffer, Count, 1) = ToChar(Cp)
Loop

ReadEscapedIdentifier = Left$(Buffer, Count)
End Function

Private Function ReadString() As String
Const MAX_LENGTH = 1013
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH

Do
If Count = MAX_LENGTH Then Fail "String too long"

If EOF(FileHandle_) Then
Ch = vbCr
Else
Ch = GetChar
End If

Select Case Ch
Case """"
If EOF(FileHandle_) Then Exit Do
Ch = GetChar

If Ch = """" Then
GoSub Append
Else
Rem We read too much. Let's put it "back".
UngetChar
Exit Do
End If

Case vbCr, vbLf
Fail "Unclosed string"

Case Else
GoSub Append
End Select
Loop

ReadString = Left$(Buffer, Count)
Exit Function

Append:
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch
Return
End Function

Private Function ReadInteger(Optional ByVal FirstDigit As String) As String
Const MAX_LENGTH = 29
Dim Cp As Integer
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH

If FirstDigit >= "0" And FirstDigit <= "9" Then
Count = 1
Mid$(Buffer, Count, 1) = FirstDigit
End If

Do Until EOF(FileHandle_)
If Count = MAX_LENGTH Then Fail "Literal too long"
Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "0" To "9"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case Else
UngetChar
Exit Do
End Select
Loop

ReadInteger = Left$(Buffer, Count)
End Function

Private Function ReadFloat(ByVal FirstDigit As String) As String
Dim Ch As String * 1
Dim Result As String
Dim FracPart As String

Result = ReadInteger(FirstDigit:=FirstDigit)

If Not EOF(FileHandle_) Then
Ch = GetChar

If Ch = "." Then
FracPart = ReadInteger
If FracPart = "" Then Fail "Invalid literal"
Result = Result & "." & FracPart
Else
UngetChar
End If
End If

ReadFloat = Result
End Function

Private Function ReadNumber(ByVal FirstDigit As String) As String
Dim Result As String
Dim FracPart As String
Dim Ch As String * 1
Dim Sg As String * 1

Result = ReadFloat(FirstDigit)

If Not EOF(FileHandle_) Then
Ch = GetChar

Select Case Ch
Case "e", "E"
If EOF(FileHandle_) Then
UngetChar
Else
Sg = GetChar

If Sg = "-" Or Sg = "+" Then
Ch = ""
Else
Ch = Sg
Sg = "+"
End If

FracPart = ReadInteger(FirstDigit:=Ch)
If FracPart = "" Then Fail "Invalid literal"
Result = Result & "E" & Sg & FracPart
End If

Case Else
UngetChar
End Select
End If

ReadNumber = Result
End Function

Private Function ReadAmpersand() As String
Const MSG = "Invalid literal"
Dim Ch As String * 1
Dim Cp As Integer
Dim Result As String

Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "o", "O"
Result = ReadOctal

Case "h", "H"
Result = ReadHexa

Case Else
UngetChar
Rem Concatenation operator.
Result = Ch
End Select

ReadAmpersand = Result
End Function

Private Function ReadOctal() As String
Const MAX_LENGTH = 32
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH

Do While Not EOF(FileHandle_)
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar

Select Case Ch
Case "0" To "7"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case Else
UngetChar
Exit Do
End Select
Loop

If Count = 0 Then Fail "Invalid literal"
ReadOctal = Left$(Buffer, Count)
End Function

Private Function ReadHexa() As String
Const MAX_LENGTH = 24
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH

Do While Not EOF(FileHandle_)
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar

Select Case Ch
Case "0" To "9", "a" To "f", "A"To "F"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case Else
UngetChar
Exit Do
End Select
Loop

If Count = 0 Then Fail "Invalid literal"
ReadHexa = Left$(Buffer, Count)
End Function

Private Function ReadDateTime() As String
Const MSG = "Invalid literal"
Dim Cp As Integer
Dim N As Integer
Dim Ch As String * 1
Dim S As String
Dim Result As String
Dim Number As String

Rem Let's get the first number.
Number = ReadInteger

If Number = "" Then
Rem Maybe we have a month name?
S = ReadMonthName
If S = "" Then Fail MSG

N = ConvertNameToNumber(S)
If N = 0 Then Fail MSG

Number = CStr(N)
End If

Rem Let's get the first separator.
Cp = GetCodePoint

Rem It cannot be a letter.
If IsLetter(Cp) Then Fail MSG
Ch = ToChar(Cp)

If Ch = ":" Then
Rem We are reading a time literal.
Result = ReadTime(Number)

Rem Date literal must end with a '#'.
Cp = GetCodePoint
If Ch <> "#" Then Fail MSG

ReadDateTime = "1899-12-30 " & Result
Exit Function
End If

Rem We'll suppose it is a valid separator.
Result = ReadDate(Number, Ch)

Ch = GetChar

Select Case Ch
Case " "
Rem We may have a date and time literal together.
Number = ReadTime
If Number = "" Then Fail MSG
Result = Result & " " & Number

Ch = GetChar
If Ch <> "#" Then Fail MSG

Case "#"
Rem Literal does not have a time part. Let's add it.
Result = Result & " 00:00:00"

Case Else
Fail MSG
End Select

ReadDateTime = Result
End Function

Private Function ReadDate(ByVal FirstNumber As String, ByVal Separator As String) As String
Const MSG = "Invalid literal"
Dim YYYY As Integer
Dim MM As Integer
Dim DD As Integer
Dim N As Integer
Dim S As String
Dim Result As String
Dim SecondNumber As String
Dim ThirdNumber As String
Dim Ch As String * 1

SecondNumber = ReadInteger

If SecondNumber = "" Then Fail MSG

Rem The next separator must match the first one.
Ch = GetChar
If Ch <> Separator Then Fail MSG

ThirdNumber = ReadInteger

If ThirdNumber = "" Then Fail MSG

If CInt(FirstNumber) >= 100 And Separator = "-" Then
YYYY = CInt(FirstNumber)
MM = CInt(SecondNumber)
DD = CInt(ThirdNumber)
Else
MM = CInt(FirstNumber)
DD = CInt(SecondNumber)
YYYY = CInt(ThirdNumber)

If YYYY < 100 Then
YYYY = YYYY + 1900
If YYYY < 1950 Then YYYY = YYYY + 100
End If
End If

Rem Validate year.
If YYYY > 9999 Then Fail MSG

Rem Validate month.
If MM < 1 Or MM > 12 Then Fail MSG

Rem Validate day.
Select Case MM
Case 4, 6, 9, 11
If DD > 30 Then Fail MSG

Case 2
If YYYY Mod 4 = 0 And YYYY Mod 100 <> 0 Or YYYY Mod 400 = 0 Then
If DD > 29 Then Fail MSG
Else
If DD > 28 Then Fail MSG
End If

Case Else
If DD > 31 Then Fail MSG
End Select

Rem Put it together in YYYY-MM-DD format.
If YYYY < 1000 Then Result = "0"
If YYYY < 100 Then Result = Result & "0"
If YYYY < 10 Then Result = Result & "0"
Result = Result & CStr(YYYY)
Result = Result & "-"

If MM < 10 Then Result = Result & "0"
Result = Result & CStr(MM)
Result = Result & "-"

If DD < 10 Then Result = Result & "0"
Result = Result & CStr(DD)

ReadDate = Result
End Function

Private Function ReadTime(Optional ByVal FirstNumber As String) As String
Const MSG = "Invalid literal"
Dim HH As Integer
Dim NN As Integer
Dim SS As Integer
Dim Number As String
Dim Ch As String * 1
Dim AP As String * 1

On Error GoTo GoneWrong
HH = CInt(FirstNumber)
Number = ReadInteger
If Number = "" Then Err.Raise 0
NN = CInt(Number)

Ch = GetChar

If Ch = ":" Then
Number = ReadInteger
If Number = "" Then Err.Raise 0
SS = CInt(Number)
Else
UngetChar
End If

If Not EOF(FileHandle_) Then
Ch = GetChar

If Ch = " " Then
If Not EOF(FileHandle_) Then
Ch = GetChar

If Ch = "a" Or Ch = "A" Then
Ch = GetChar

If Ch = "m" Or Ch = "M" Then
AP = "A"

Else
UngetChar 3
End If

ElseIf Ch = "p" Or Ch = "P" Then
Ch = GetChar

If Ch = "m" Or Ch = "M" Then
AP = "P"
Else
UngetChar 3
End If

Else
UngetChar 2
End If
End If
Else
UngetChar
End If
End If

Rem Validate hour, minute, and second.
If HH < 0 Or HH > 23 Then Err.Raise 0
If NN < 0 Or NN > 59 Then Err.Raise 0
If SS < 0 Or SS > 59 Then Err.Raise 0

If AP = "A" Then
If HH = 12 Then HH = 0

ElseIf AP = "P" Then
If HH <> 12 Then HH = HH + 12
End If

Rem Put it together in HH:NN:SS format.
Number = CStr(SS)
If SS < 10 Then Number = "0" & Number
Number = ":" & Number

Number = CStr(NN) & Number
If NN < 10 Then Number = "0" & Number

Number = ":" & Number
Number = CStr(HH) & Number
If HH < 10 Then Number = "0" & Number

ReadTime = Number
Exit Function

GoneWrong:
Fail MSG
End Function

Private Function ReadMonthName() As String
Dim Ch As String * 1
Dim Result As String

Do While Not EOF(FileHandle_)
Ch = GetChar

Select Case Ch
Case "#", vbCr, vbLf, ",", ";", ")", " "
UngetChar
Exit Do

Case "0" To "9"
Rem We safely can assume we read two characters more than needed.
UngetChar 2
Result = Left$(Result, Len(Result) - 1)
Exit Do

Case Else
Result = Result & Ch
End Select
Loop

ReadMonthName = Result
End Function

Private Function ConvertNameToNumber(ByVal Name As String) As Integer
Dim MonthNames As Variant
Dim MonthName As Variant
Dim Result As Integer
Dim Count As Integer

MonthNames = Array("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")

For Each MonthName In MonthNames
Count = Count + 1

If Name = MonthName Then
Result = Count
Exit For
End If
Next

If Result = 0 Then
Count = 0

For Each MonthName In MonthNames
Count = Count + 1

If Name = Left$(MonthName, 3) Then
Result = Count
Exit For
End If
Next
End If

ConvertNameToNumber = Result
End Function
End Module

You may find it odd to have AddressOf and New as operators, but I assure you it will make life easier later.
There is also another keyword that we'll re-classify as operator later.

Now, for the non-keyword operators. Here are they:

+Unary identity or binary addition
-Unary negation or binary subtraction
*Multiplication
/Division
\Integer division
^Power
&Concatenation
=Equal
<=Lower than or equal
<>Different
>=Greater than or equal
>Greater than
.Member access
!Bang

Having a heavy VB background, it took me a long time to accept that the dot (".") and the bang ("!") are operators.
This is obvious for someone with a different background, like C++ developers, for instance. But if you take a look at VB6's help, they are not even there in the operators' page.
(Nor are AddressOf or New, either.)
However, as I said before for the keyword operators, this will make code simpler later.

The new version of our operator-aware sub Main is:
(New code is highlighted below.)


Public Sub Main()
Dim Cp As Integer
Dim Ch As String * 1
Dim Token As String

Rem Get an available file number.
FileHandle_ = FreeFile
Rem File path for the source code is passed as a command-line argument.
Open Command$ For Binary As #FileHandle_
Rem Ensuring we close the file in case we have an error.
On Error GoTo CloseIt

Rem While we do not reach the end of file...
While Not EOF(FileHandle_)
Rem ...read a codepoint from it.
Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "["
Token = ReadEscapedIdentifier
Debug.Print "Escaped identifier: " & Token

Case """"
Token = ReadString
Debug.Print "String: " & Token

Case "&"
Token = ReadAmpersand
Debug.Print "Octal/Hexa: " & Token

Case "#"
Token = ReadDateTime
Debug.Print "Date/time: " & Token

Case "0" To "9"
Token = ReadNumber(Ch)
Debug.Print "Number: " & Token

Case "+", "-", "*", "/", "\", "^", "=", "!", "."
Debug.Print "Operator: " & Ch

Case "<"
Token = Ch
Ch = GetChar

If Ch = "=" Or Ch = ">" Then
Token = Token & Ch
Else
UngetChar
End If

Debug.Print "Operator: " & Token

Case ">"
Token = Ch

If GetChar = "=" Then
Token = Token & "="
Else
UngetChar
End If

Debug.Print "Operator: " & Token

Case Else
If IsLetter(Cp) Then
Token = ReadIdentifier(Ch)

If IsKeyword(Token) Then
Debug.Print "Keyword: " & Token

ElseIf IsOperator(Token) Then
Debug.Print "Operator: " & Token

Else
Debug.Print "Identifier: " & Token
End If
Else
Rem tokens we did not deal yet
End If
End Select
Wend

CloseIt:
Close #FileHandle_
Rem This is equivalent to a Throw in a Catch.
If Err.Number Then Err.Raise Err.Number
End Sub

Next week: Line breaks. OMG!!! We forgot the line breaks!

Andrej Biasic
2020-08-05
Update:
Added Local keyword.

Andrej Biasic
2020-08-06
Update:
Removed InputB from keywords.

Andrej Biasic
2020-08-08