mirror of
https://github.com/Kugelschieber/asl.git
synced 2026-01-18 12:00:25 +00:00
274 lines
4.9 KiB
Go
274 lines
4.9 KiB
Go
package tokenizer
|
|
|
|
import (
|
|
"strings"
|
|
)
|
|
|
|
// Token is a single lexical unit produced by Tokenize.
//
// The field order matters: tokens are created with positional composite
// literals throughout this file.
type Token struct {
	// Token holds the text of the token.
	Token string
	// Preprocessor is true if the token is a whole preprocessor line.
	Preprocessor bool
	// Line and Column are the tokenizer counters at the moment the token
	// was emitted.
	Line, Column int
}
|
|
|
|
var (
	// delimiter lists the single bytes that terminate the pending token and
	// are emitted as tokens of their own.
	delimiter = []byte{
		'=', ';',
		'{', '}', '(', ')', '[', ']', '<', '>',
		'!', ',', ':', '&', '|',
		'+', '-', '*', '/',
	} // TODO: modulo?

	// keywords lists the reserved words (all lower case) which are emitted
	// as soon as they are followed by a non-identifier character.
	keywords = []string{
		"var", "if", "while", "switch", "for", "foreach",
		"func", "true", "false", "case", "default", "return",
		"try", "catch", "exitwith", "waituntil", "code",
	}

	// whitespace lists the bytes that are dropped outside of strings.
	whitespace = []byte{' ', '\n', '\t', '\r'}

	// identifier lists every character allowed within an identifier.
	identifier = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"

	// preprocessor is the byte starting a preprocessor line.
	preprocessor = byte('#')

	// new_line lists the bytes treated as line breaks.
	new_line = []byte{'\r', '\n'}
)
|
|
|
|
// Tokenizes the given byte array into syntax tokens,
|
|
// which can be parsed later.
|
|
func Tokenize(code []byte, doStripSlashes bool) []Token {
|
|
if doStripSlashes {
|
|
code = stripSlashes(code)
|
|
}
|
|
|
|
code = removeComments(code)
|
|
tokens := make([]Token, 0)
|
|
token, mask, isstring, line, column := "", false, false, 0, 0
|
|
|
|
for i := 0; i < len(code); i++ {
|
|
c := code[i]
|
|
column++
|
|
|
|
if byteArrayContains(new_line, c) {
|
|
line++
|
|
column = 0
|
|
}
|
|
|
|
// string masks (backslash)
|
|
if c == '\\' && !mask {
|
|
token += "\\"
|
|
mask = true
|
|
continue
|
|
}
|
|
|
|
// string
|
|
if c == '"' && !mask {
|
|
token += "\""
|
|
isstring = !isstring
|
|
continue
|
|
}
|
|
|
|
if isstring {
|
|
token += string(c)
|
|
} else {
|
|
// preprocessor, delimeter, keyword or variable/expression
|
|
if c == preprocessor {
|
|
tokens = append(tokens, preprocessorLine(code, &i, line, column))
|
|
token = ""
|
|
} else if byteArrayContains(delimiter, c) {
|
|
if token != "" {
|
|
tokens = append(tokens, Token{token, false, line, column})
|
|
}
|
|
|
|
tokens = append(tokens, Token{string(c), false, line, column})
|
|
token = ""
|
|
} else if stringArrayContains(strings.ToLower(token)) && !isIdentifierCharacter(c) {
|
|
tokens = append(tokens, Token{token, false, line, column})
|
|
token = ""
|
|
} else if !byteArrayContains(whitespace, c) {
|
|
token += string(c)
|
|
}
|
|
}
|
|
|
|
mask = false
|
|
}
|
|
|
|
return tokens
|
|
}
|
|
|
|
// stripSlashes removes escaping backslashes from the input code.
// This is used for the "code" keyword for correct strings in resulting code.
//
// Every backslash that is not escaped itself is dropped and the byte
// following it is copied unchanged, so `\"` becomes `"` and `\\` becomes `\`.
// A single backslash at the very end of the input is dropped.
//
// The result contains exactly the kept bytes. The input is not modified.
func stripSlashes(code []byte) []byte {
	stripped := make([]byte, 0, len(code))
	mask := false

	for _, c := range code {
		if c == '\\' && !mask {
			// drop the backslash, but keep whatever follows it
			mask = true
			continue
		}

		stripped = append(stripped, c)
		mask = false
	}

	// Only the written bytes are returned. Returning a buffer of the input
	// length would pad the result with one NUL byte per removed backslash.
	return stripped
}
|
|
|
|
// Removes all comments from input byte array.
|
|
// Comments are single line comments, starting with // (two slashes),
|
|
// multi line comments with /* ... */ (slash star, star slash).
|
|
func removeComments(code []byte) []byte {
|
|
newcode := make([]byte, len(code))
|
|
j, mask, isstring := 0, false, false
|
|
|
|
for i := 0; i < len(code); i++ {
|
|
c := code[i]
|
|
|
|
// do not remove comments from strings
|
|
if c == '\\' && !mask {
|
|
mask = true
|
|
}
|
|
|
|
if c == '"' && !mask {
|
|
isstring = !isstring
|
|
}
|
|
|
|
// single/multi line comment
|
|
if !isstring {
|
|
if c == '/' && nextChar(code, i) == '/' {
|
|
i = skipSingleLineComment(code, i+1)
|
|
continue
|
|
} else if c == '/' && nextChar(code, i) == '*' {
|
|
i = skipMultiLineComment(code, i+1)
|
|
continue
|
|
}
|
|
}
|
|
|
|
newcode[j] = c
|
|
j++
|
|
mask = false
|
|
}
|
|
|
|
return newcode[:j]
|
|
}
|
|
|
|
// Reads preprocessor command until end of line
|
|
func preprocessorLine(code []byte, i *int, lineNr, column int) Token {
|
|
c := byte('0')
|
|
var line string
|
|
|
|
for *i < len(code) {
|
|
c = code[*i]
|
|
|
|
if byteArrayContains(new_line, c) {
|
|
break
|
|
}
|
|
|
|
line += string(c)
|
|
(*i)++
|
|
}
|
|
|
|
// read all new line characters (\r and \n)
|
|
c = code[*i]
|
|
|
|
for byteArrayContains(new_line, c) {
|
|
(*i)++
|
|
c = code[*i]
|
|
}
|
|
|
|
(*i)-- // for will count up 1, so subtract it here
|
|
|
|
return Token{line, true, lineNr, column}
|
|
}
|
|
|
|
// nextChar returns the character following position i in code.
// If no character is left, '0' (the digit zero, not a NUL byte) is returned.
func nextChar(code []byte, i int) byte {
	if next := i + 1; next < len(code) {
		return code[next]
	}

	return '0'
}
|
|
|
|
// skipSingleLineComment is used to skip the rest of a line if a single line
// comment was found. It returns the index of the first '\n' at or behind i,
// or the first index past the end of code if there is none.
func skipSingleLineComment(code []byte, i int) int {
	for ; i < len(code); i++ {
		if code[i] == '\n' {
			break
		}
	}

	return i
}
|
|
|
|
// skipMultiLineComment is used to skip a block of characters if a multi line
// comment was found. Starting at i it searches for the closing "*/" and
// returns the index of its slash. If the comment is never closed, the
// returned index lies behind the end of code.
func skipMultiLineComment(code []byte, i int) int {
	for ; i < len(code); i++ {
		// a '*' only closes the comment if a '/' follows directly
		if code[i] == '*' && i+1 < len(code) && code[i+1] == '/' {
			break
		}
	}

	return i + 1
}
|
|
|
|
// byteArrayContains reports whether needle is one of the bytes in haystack.
func byteArrayContains(haystack []byte, needle byte) bool {
	for _, candidate := range haystack {
		if candidate == needle {
			return true
		}
	}

	return false
}
|
|
|
|
// Checks if a byte array (string) contains a string delimeter.
|
|
func stringArrayContains(needle string) bool {
|
|
for i := range keywords {
|
|
if keywords[i] == needle {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
// Checks if a character is allowed for identifiers.
|
|
func isIdentifierCharacter(c byte) bool {
|
|
for i := range identifier {
|
|
if identifier[i] == c {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|