This commit is contained in:
Marvin Blum
2015-10-30 19:03:50 +01:00
parent ae8fad1abc
commit 6e4cef91bc
7 changed files with 49 additions and 40 deletions

202
src/tokenizer/tokenizer.go Normal file
View File

@@ -0,0 +1,202 @@
package tokenizer
import (
"strings"
)
// Token is a single syntax token produced by Tokenize.
type Token struct {
// Token holds the text of the token: either a single delimiter character
// or the characters accumulated between delimiters (string literals keep
// their surrounding quotes and backslashes).
Token string
}
// delimiter holds every single-byte character that ends the token being
// assembled and is emitted as a token of its own.
// TODO: modulo?
var delimiter = []byte("=;{}()[]<>!,:&|+-*/")
// keywords lists the reserved words of the language. All entries are lower
// case; candidates are lower-cased before they are compared against this
// list, which makes keyword recognition case-insensitive.
var keywords = []string{
	// declarations and literals
	"var", "func", "code", "true", "false",
	// branching
	"if", "switch", "case", "default",
	// loops
	"while", "for", "foreach",
	// control transfer and error handling
	"return", "try", "catch", "exitwith", "waituntil",
}
// whitespace holds the bytes treated as blank space: space, line feed,
// tab and carriage return.
var whitespace = []byte(" \n\t\r")
// identifier contains every character that may appear in an identifier:
// ASCII letters, decimal digits and the underscore.
var identifier = "abcdefghijklmnopqrstuvwxyz" +
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
	"0123456789" +
	"_"
// Tokenize splits the given source code into syntax tokens which can be
// parsed later.
//
// Comments are stripped first (see removeComments). The remaining bytes are
// then scanned once:
//
//   - A backslash masks the byte that follows it, so \" does not open or
//     close a string. The backslash itself is kept in the token.
//   - Everything between two unmasked double quotes is copied verbatim,
//     quotes included.
//   - Every byte listed in delimiter ends the token being assembled and is
//     emitted as a token of its own.
//   - A keyword (compared case-insensitively) is emitted as soon as a byte
//     follows it that cannot be part of an identifier.
//   - Whitespace outside of strings is dropped. Note that it does NOT
//     separate tokens by itself: "a b" yields the single token "ab".
//
// The returned slice is never nil; it is empty if code contains no tokens.
func Tokenize(code []byte) []Token {
	code = removeComments(code)
	tokens := make([]Token, 0)
	mask, isstring := false, false

	// buf collects the raw bytes of the token being assembled. Working on
	// bytes (instead of appending string(c) to a string) keeps multi-byte
	// UTF-8 sequences intact: string(c) for a byte >= 0x80 would produce the
	// UTF-8 encoding of that code point, not the byte itself.
	var buf []byte

	for _, c := range code {
		if !mask {
			// string mask (backslash): remember it for the next byte only
			if c == '\\' {
				buf = append(buf, c)
				mask = true
				continue
			}

			// unmasked quote: start or end of a string
			if c == '"' {
				buf = append(buf, c)
				isstring = !isstring
				continue
			}
		}

		switch {
		case isstring:
			buf = append(buf, c)
		case byteArrayContains(delimiter, c):
			if len(buf) > 0 {
				tokens = append(tokens, Token{string(buf)})
			}

			tokens = append(tokens, Token{string(c)})
			buf = buf[:0]
		case !isIdentifierCharacter(c) && stringArrayContains(strings.ToLower(string(buf))):
			// The cheap character test runs first, so the keyword lookup
			// only happens when a token could actually end here.
			tokens = append(tokens, Token{string(buf)})
			buf = buf[:0]

			// The byte that ended the keyword is not a delimiter. Unless it
			// is whitespace it belongs to the next token and must be kept.
			if !byteArrayContains(whitespace, c) {
				buf = append(buf, c)
			}
		case !byteArrayContains(whitespace, c):
			buf = append(buf, c)
		}

		mask = false
	}

	// Emit whatever is left when the input does not end with a delimiter.
	if len(buf) > 0 {
		tokens = append(tokens, Token{string(buf)})
	}

	return tokens
}
// removeComments returns a copy of code with all comments removed. The
// input slice is not modified.
//
// Two kinds of comments are recognized outside of string literals:
// single line comments starting with // (two slashes) and running up to,
// but not including, the next line feed, and multi line comments enclosed
// in /* ... */ (slash star, star slash). An unterminated multi line comment
// extends to the end of the input.
//
// String literals are delimited by double quotes. A backslash masks the
// byte following it, so an escaped quote (\") neither opens nor closes a
// string and comment markers inside such a string are left untouched.
func removeComments(code []byte) []byte {
	newcode := make([]byte, 0, len(code))
	mask, isstring := false, false

	for i := 0; i < len(code); i++ {
		c := code[i]

		// Keep the backslash and carry the mask over to the next byte. The
		// mask must survive this iteration, otherwise \" would toggle the
		// string state.
		if c == '\\' && !mask {
			newcode = append(newcode, c)
			mask = true
			continue
		}

		// do not remove comments from strings
		if c == '"' && !mask {
			isstring = !isstring
		}

		mask = false

		// single/multi line comment
		if c == '/' && !isstring {
			switch nextChar(code, i) {
			case '/':
				// Step back by one so the loop increment lands on the line
				// feed, which is not part of the comment and must be kept.
				i = skipSingleLineComment(code, i+2) - 1
				continue
			case '*':
				// Start scanning behind the opening "/*" so that its star
				// cannot be mistaken for the star of the closing "*/".
				i = skipMultiLineComment(code, i+2)
				continue
			}
		}

		newcode = append(newcode, c)
	}

	return newcode
}

// nextChar returns the character following position i in code.
// If no character is left, '0' (the ASCII digit zero, not a NUL byte) is
// returned.
func nextChar(code []byte, i int) byte {
	i++

	if i < len(code) {
		return code[i]
	}

	return '0'
}

// skipSingleLineComment is used to skip a line if a single line comment was
// found. It returns the index of the first line feed at or after i, or
// len(code) if there is none.
func skipSingleLineComment(code []byte, i int) int {
	for i < len(code) && code[i] != '\n' {
		i++
	}

	return i
}

// skipMultiLineComment is used to skip a block of characters if a multi
// line comment was found. Scanning starts at i. It returns the index of the
// slash of the closing "*/", or an index past the end of code if the
// comment is never closed.
func skipMultiLineComment(code []byte, i int) int {
	for i < len(code) && !(code[i] == '*' && nextChar(code, i) == '/') {
		i++
	}

	return i + 1
}
// byteArrayContains reports whether needle occurs anywhere in haystack.
// An empty or nil haystack never contains anything.
func byteArrayContains(haystack []byte, needle byte) bool {
	for _, b := range haystack {
		if b == needle {
			return true
		}
	}

	return false
}
// stringArrayContains reports whether needle is one of the entries of the
// package level keywords list. The comparison is exact (case-sensitive).
func stringArrayContains(needle string) bool {
	for _, keyword := range keywords {
		if keyword == needle {
			return true
		}
	}

	return false
}
// isIdentifierCharacter reports whether c is allowed in identifiers, that
// is, whether it occurs in the package level identifier string.
func isIdentifierCharacter(c byte) bool {
	return strings.IndexByte(identifier, c) >= 0
}