Metamorphing Machine I rather be this walking metamorphosis
than having that old formed opinion about everything!

Let's build a transpiler! Part 7

This is the seventh post in a series of building a transpiler. You can find the previous ones here.

Last time we said we would fix the code we've been building so far so we can tell a hexadecimal number apart from an octal one, for instance.
For that, we'll create a Token class, having a Text property and a Kind property.

The Text property will store... well, the text for that token. The Kind will store the type of the token.
(We would like to create a Type property instead of a Kind one, but Type is a reserved keyword, so we can't.)

We will need a TokenKind enum with all token types we have so far.
We will also create a NewToken function that will be handy to create Token instances.

Let's go:

Class Token
Public Kind As TokenKind
Public Text As String
Public Suffix As String
End Class


Public Enum TokenKind
tkWhiteSpace
tkComment
tkIdentifier
tkEscapedIdentifier
tkKeyword
tkIntegerNumber
tkFloatNumber
tkSciNumber
tkOctalNumber
tkHexaNumber
tkFileHandle
tkString
tkDateTime
tkOperator
tkLeftParenthesis
tkRightParenthesis
tkHardLineBreak
tkSoftLineBreak
tkListSeparator
tkPrintSeparator
tkDirective
End Enum


Private Function NewToken(ByVal Text As String, ByVal Kind As TokenKind, Optional ByVal Suffix As String = vbNullChar) As Token
Set NewToken = New Token

With NewToken
.Text = Text
.Kind = Kind
.Suffix = Suffix
End With
End Function

You may have noticed three things in the code above: When we see a "#", it can be the beginning of a date/time literal (like #2020-05-31#), a file-handle literal (like Get #FileHandle_ ...), or a compiler directive, like #If and #Const.
Due to that, we'll rename our function ReadDateTime to ReadHash and modify it to take into account file-handlers and compiler directives.

Notable changes are highlighted below.

Private Function ReadHash() As Token
Const Msg = "Invalid literal"
Dim Cp As Integer
Dim Number As Integer
Dim Ch As String * 1
Dim Name As String
Dim Token As Token

Rem Let's get the first number.
Set Token = ReadInteger

If Token.Text = "" Then
Rem Maybe we have a month name?
Name = ReadMonthName

Select Case Name
Case "If", "ElseIf", "Else", "End", "Const"
Rem Not a month name, we have a compiler directive instead.
Set ReadHash = NewToken(Name, tkDirective)
Exit Function

Case ""
Fail Msg

Case Else
Number = ConvertNameToNumber(Name)

If Number = 0 Then
Rem Not a month name, we have a variable file-handle instead.
Rem Hopefully this variable is not named Feb, for instance, otherwise we'll get tricked...
Set ReadHash = NewToken(Name, tkFileHandle)
Exit Function
End If

Token.Text = CStr(Number)
End Select
End If

Rem Let's get the first separator.
Cp = GetCodePoint
Ch = ToChar(Cp)

If IsLetter(Cp) Or Ch = "," Then
Rem We have a numeric file-handle
UngetChar
Token.Kind = tkFileHandle
Set ReadHash = Token
Exit Function
End If

If Ch = ":" Then
Rem We are reading a time literal.
Name = ReadTime(Token.Text)

Rem Date literal must end with a '#'.
Ch = GetChar
If Ch <> "#" Then Fail Msg

Name = "1899-12-30 " & Name
Set ReadHash = NewToken(Name, tkDateTime)
Exit Function
End If

Rem We 'll suppose it is a valid separator.
On Error Resume Next
Name = ReadDate(Token.Text, Ch)

If Err.Number Then
Rem It is not a date, but a numeric file handle
On Error GoTo 0
UngetChar Ch
Token.Kind = tkFileHandle
Set ReadHash = Token
Exit Function
End If

On Error GoTo 0
Ch = GetChar

If IsSpace(Ch) Then
Rem We may have a date and time literal together.
Set ReadHash = NewToken(ReadTime, tkDateTime)
If ReadHash.Text = "" Then Fail Msg
ReadHash.Text = Name & " " & ReadHash.Text

Ch = GetChar
If Ch <> "#" Then Fail Msg

ElseIf Ch = "#" Then
Rem Literal does not have a time part. Let's add it.
Set ReadHash = NewToken(Name & " 00:00:00", tkDateTime)

Else
Fail Msg
End If
End Function

VB6 has a feature that allows one to specify a variable's type using type suffixes. (It can be used to number literals, too.)
Here are they:

Type suffixData type
%Integer
&Long
^LongLong
@Currency
!Single
#Double
$String

(Note that VB6 is 32-bit only. The entry for LongLong, a 64-bit integer, comes from 64-bit Office's VBA.)
Now I'll just dump the remaining functions adapted to return Tokens instead of strings.
Take a look at functions ReadInteger, ReadIdentifier, ReadEscapedIdentifier, ReadFloat, ReadNumber, ReadOctal, and ReadHexa for the suffix stuff.

Next week, we'll put all of this to use.

Andrej Biasic
2020-08-26
Option Explicit
Option Compare Text

Public FileHandle_ As Integer


Public Sub Main()
Dim Suppress As Boolean
Dim Cp As Integer
Dim Ch As String * 1
Dim Token As Token

Rem Get an available file number.
FileHandle_ = FreeFile
Rem File path for the source code is passed as a command-line argument.
Open Command$ For Binary As #FileHandle_
Rem Ensuring we close the file in case we have an error.
On Error GoTo CloseIt

Cp = GetCodePoint
If Cp <> -257 Then UngetChar

Rem While we do not reach the end of file...
While Not EOF(FileHandle_)
Rem ...read a codepoint from it.
Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "["
Set Token = ReadEscapedIdentifier

Case """"
Set Token = ReadString

Case "&"
Set Token = ReadAmpersand

Case "#"
Set Token = ReadHash

Case "0" To "9"
Set Token = ReadNumber(Ch)

Case "+", "-", "*", "/", "\", "^", "=", ".", "!"
Set Token = NewToken(Ch, tkOperator)

Case "<"
Set Token = NewToken(Ch, tkOperator)
Ch = GetChar

If Ch = ">" Or Ch = "=" Then
Token.Text = Token.Text & Ch
Else
UngetChar
End If

Case ">"
Set Token = NewToken(Ch, tkOperator)

If GetChar = "=" Then
Token.Text = Token.Text & "="
Else
UngetChar
End If

Case ":"
Set Token = NewToken(Ch, tkSoftLineBreak)

Case vbLf
Set Token = NewToken(Ch, tkHardLineBreak)
Suppress = False

Case vbCr
If GetCodePoint <> 10 Then UngetChar
Set Token = NewToken(vbLf, tkHardLineBreak)
Suppress = False

Case "'"
Set Token = ReadComment(Ch)

Case ","
Set Token = NewToken(Ch, tkListSeparator)

Case ";"
Set Token = NewToken(Ch, tkPrintSeparator)

Case "("
Set Token = NewToken(Ch, tkLeftParenthesis)

Case ")"
Set Token = NewToken(Ch, tkRightParenthesis)

Case Else
If IsSpace(Cp) Then
If Not EOF(FileHandle_) Then
If GetChar = "_" Then
If EOF(FileHandle_) Then
UngetChar
Else
Cp = GetCodePoint
If Cp <> 10 And Cp <> 13 Then UngetChar 2
End If
Else
UngetChar
End If
End If

Set Token = NewToken(" ", tkWhiteSpace)

ElseIf IsLetter(Cp) Then
Set Token = ReadIdentifier(Cp)

If IsKeyword(Token.Text) And Token.Suffix = vbNullChar Then
Token.Kind = tkKeyword
If Token.Text = "Rem" Then Set Token = ReadComment(Token.Text)

ElseIf IsOperator(Token.Text) And Token.Suffix = vbNullChar Then
Token.Kind = tkOperator
End If

ElseIf Not Suppress Then
Fail "Unknown token: '" & Ch & "'"

Else
Set Token = New Token
End If
End Select

If Not Suppress Then Debug.Print EnumName(Token.Kind) & " <|" & Token.Text & "|>"
Wend

CloseIt:
Close #FileHandle_
Rem This is equivalent to a Throw in a Catch.
If Err.Number Then Err.Raise Err.Number
End Sub


Private Function ReadIdentifier(ByVal CodePoint As Integer) As Token
Const MAX_LENGTH = 255
Dim Buffer As String * MAX_LENGTH
Dim IsOK As Boolean
Dim Cp As Integer
Dim Count As Integer
Dim Suffix As String * 1

Count = 1
Mid$(Buffer, Count, 1) = ToChar(CodePoint)

Do While Not EOF(FileHandle_)
Cp = GetCodePoint

IsOK = Cp = AscW("_")
If Not IsOK Then IsOK = Cp >= AscW("0") And Cp <= AscW("9")
If Not IsOK Then IsOK = IsLetter(Cp)
If Not IsOK Then Exit Do

Count = Count + 1
Mid$(Buffer, Count, 1) = ToChar(Cp)
If Count > MAX_LENGTH Then Fail "Identifier too long"
Loop

Select Case Cp
Case AscW("%"), AscW("&"), AscW("@"), AscW("!"), AscW("#"), AscW("$")
Suffix = ToChar(Cp)

Case Else
UngetChar
End Select

Set ReadIdentifier = NewToken(Left$(Buffer, Count), tkIdentifier, Suffix)
End Function


Private Function ReadEscapedIdentifier() As Token
Const MAX_LENGTH = 255
Dim Cp As Integer
Dim Count As Integer
Dim Suffix As String * 1
Dim Buffer As String * MAX_LENGTH

Do While Not EOF(FileHandle_)
Cp = GetCodePoint
If Cp = AscW("]") Then Exit Do
If Cp = 10 Or Cp = 13 Then Fail "Invalid identifier"
Count = Count + 1
If Count > MAX_LENGTH Then Fail "Identifier too long"
Mid$(Buffer, Count, 1) = ToChar(Cp)
Loop

If Not EOF(FileHandle_) Then
Suffix = GetChar

Select Case Suffix
Case "%", "&", "@", "!", "#", "$"
Rem OK

Case Else
UngetChar
Suffix = vbNullChar
End Select
End If

Set ReadEscapedIdentifier = NewToken(Left$(Buffer, Count), tkEscapedIdentifier, Suffix)
End Function


Private Function ReadString() As Token
Const MAX_LENGTH = 1013
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH

Do
If Count = MAX_LENGTH Then Fail "String too long"

If EOF(FileHandle_) Then
Ch = vbCr
Else
Ch = GetChar
End If

Select Case Ch
Case """"
If EOF(FileHandle_) Then Exit Do
Ch = GetChar

If Ch = """" Then
GoSub Append
Else
Rem We read too much. Let's put it "back".
UngetChar
Exit Do
End If

Case vbCr, vbLf
Fail "Unclosed string"

Case Else
GoSub Append
End Select
Loop

Set ReadString = NewToken(Left$(Buffer, Count), tkString)
Exit Function

Append:
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch
Return
End Function


Private Function ReadInteger(Optional ByVal FirstDigit As String) As Token
Const MAX_LENGTH = 29
Dim Cp As Integer
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH

If FirstDigit >= "0" And FirstDigit <= "9" Then
Count = 1
Mid$(Buffer, Count, 1) = FirstDigit
End If

Do Until EOF(FileHandle_)
If Count = MAX_LENGTH Then Fail "Literal too long"
Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "0" To "9"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case Else
UngetChar
Exit Do
End Select
Loop

Set ReadInteger = NewToken(Left$(Buffer, Count), tkIntegerNumber)
End Function


Private Function ReadFloat(ByVal FirstDigit As String) As Token
Dim Ch As String * 1
Dim Result As Token
Dim FracPart As Token

Set Result = ReadInteger(FirstDigit:=FirstDigit)

If Result.Suffix = vbNullChar Then
If Not EOF(FileHandle_) Then
Ch = GetChar

If Ch = "." Then
Set FracPart = ReadInteger
If FracPart.Text = "" Then Fail "Invalid literal"
Result.Text = Result.Text & "." & FracPart.Text
Result.Kind = tkFloatNumber
Result.Suffix = FracPart.Suffix
Else
UngetChar
End If
End If
End If

Set ReadFloat = Result
End Function


Private Function ReadNumber(ByVal FirstDigit As String) As Token
Dim Result As Token
Dim ExpPart As Token
Dim Ch As String * 1
Dim Sg As String * 1

Set Result = ReadFloat(FirstDigit)

If Result.Suffix = vbNullChar Then
If Not EOF(FileHandle_) Then
Ch = GetChar

Select Case Ch
Case "e", "E"
If EOF(FileHandle_) Then
UngetChar
Else
Sg = GetChar

If Sg = "-" Or Sg = "+" Then
Ch = ""
Else
Ch = Sg
Sg = "+"
End If

Set ExpPart = ReadInteger(FirstDigit:=Ch)
If ExpPart.Text = "" Or ExpPart.Suffix <> vbNullChar Then Fail "Invalid literal"
Result.Text = Result.Text & "E" & Sg & ExpPart.Text
Result.Kind = tkSciNumber
End If

Case Else
UngetChar
End Select
End If
End If

Set ReadNumber = Result
End Function


Private Function ReadAmpersand() As Token
Const Msg = "Invalid literal"
Dim Ch As String * 1
Dim Cp As Integer

Cp = GetCodePoint
Ch = ToChar(Cp)

Select Case Ch
Case "o", "O"
Set ReadAmpersand = ReadOctal

Case "h", "H"
Set ReadAmpersand = ReadHexa

Case Else
UngetChar
Set ReadAmpersand = NewToken("&", tkOperator)
End Select
End Function


Private Function ReadOctal() As Token
Const MAX_LENGTH = 32
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH
Dim Suffix As String * 1

Do While Not EOF(FileHandle_)
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar

Select Case Ch
Case "0" To "7"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case "%", "&", "@", "!", "#", "$"
Suffix = Ch
Exit Do

Case Else
UngetChar
Exit Do
End Select
Loop

If Count = 0 Then Fail "Invalid literal"
Set ReadOctal = NewToken(Left$(Buffer, Count), tkOctalNumber, Suffix)
End Function


Private Function ReadHexa() As Token
Const MAX_LENGTH = 24
Dim Count As Integer
Dim Ch As String * 1
Dim Buffer As String * MAX_LENGTH
Dim Suffix As String * 1

Do While Not EOF(FileHandle_)
If Count = MAX_LENGTH Then Fail "Literal too long"
Ch = GetChar

Select Case Ch
Case "0" To "9", "a" To "f", "A" To "F"
Count = Count + 1
Mid$(Buffer, Count, 1) = Ch

Case "%", "&", "@", "!", "#", "$"
Suffix = Ch
Exit Do

Case Else
UngetChar
Exit Do
End Select
Loop

If Count = 0 Then Fail "Invalid literal"
Set ReadHexa = NewToken(Left$(Buffer, Count), tkHexaNumber, Suffix)
End Function


Private Function EnumName(ByVal Enumerand As Long) As String
Select Case Enumerand
Case tkWhiteSpace
EnumName = "White-space"

Case tkComment
EnumName = "Comment"

Case tkIdentifier
EnumName = "Identifier"

Case tkEscapedIdentifier
EnumName = "Escaped identifier"

Case tkKeyword
EnumName = "Keyword"

Case tkIntegerNumber
EnumName = "Integer"

Case tkFloatNumber
EnumName = "Float"

Case tkSciNumber
EnumName = "Sci number"

Case tkOctalNumber
EnumName = "Octal number"

Case tkHexaNumber
EnumName = "Hexa number"

Case tkFileHandle
EnumName = "File-handle"

Case tkString
EnumName = "String"

Case tkDateTime
EnumName = "Date"

Case tkOperator
EnumName = "Operator"

Case tkLeftParenthesis
EnumName = "Left parenthesis"

Case tkRightParenthesis
EnumName = "Right parenthesis"

Case tkHardLineBreak
EnumName = "Hard line-break"

Case tkSoftLineBreak
EnumName = "Soft line-break"

Case tkListSeparator
EnumName = "List separator"

Case tkPrintSeparator
EnumName = "Print separator"

Case tkDirective
EnumName = "Compiler directive"
End Select
End Function


Private Function ReadComment(ByVal Mark As String) As Token
Const MAX_LENGTH = 1013
Dim Count As Integer
Dim Cp As Integer
Dim Ch1 As String * 1
Dim Ch2 As String * 1
Dim Ch3 As String * 1
Dim Buffer As String * MAX_LENGTH

Count = Len(Mark)
Mid$(Buffer, 1, Count) = Mark

Do While Not EOF(FileHandle_)
If Count = MAX_LENGTH Then Fail "Comment too long"
Ch1 = Ch2
Ch2 = Ch3
Cp = GetCodePoint
Ch3 = ToChar(Cp)

Select Case Ch3
Case vbCr
If GetChar <> vbLf Then UngetChar
GoTo CaseLF

Case vbLf
CaseLF:If IsSpace(AscW(Ch1)) And Ch2 = "_" Then
Ch3 = " "
GoTo CaseElse
End If

Exit Do

Case Else
CaseElse:  Count = Count + 1
Mid$(Buffer, Count, 1) = Ch3
End Select
Loop

Set ReadComment = NewToken(Left$(Buffer, Count), tkComment)
End Function