// asl/src/tokenizer/tokenizer.go

package tokenizer

import (
	"strings"
)

// Token is a single syntax token produced by Tokenize.
type Token struct {
	Token        string // token text; for a preprocessor token the whole directive line
	Preprocessor bool   // true if the token was read as a preprocessor line
	Line         int    // value of the tokenizer's line counter when the token was emitted
	Column       int    // value of the tokenizer's column counter when the token was emitted
}

// Character classes and keyword table used by the tokenizer.
var (
	// delimiter lists the single byte tokens. Outside of strings each of
	// them ends the pending token and is emitted as a token of its own.
	delimiter = []byte{
		'=',
		';',
		'{',
		'}',
		'(',
		')',
		'[',
		']',
		'<',
		'>',
		'!',
		',',
		':',
		'&',
		'|',
		'+',
		'-',
		'*',
		'/'} // TODO: modulo?

	// keywords lists the reserved words in lower case. Tokenize lower-cases
	// the pending token before looking it up, so keywords are matched
	// case-insensitively.
	keywords = []string{
		"var",
		"if",
		"while",
		"switch",
		"for",
		"foreach",
		"func",
		"true",
		"false",
		"case",
		"default",
		"return",
		"try",
		"catch",
		"exitwith",
		"waituntil",
		"code"}

	// whitespace holds the bytes that are dropped outside of strings,
	// identifier the characters allowed in identifiers (a keyword only ends
	// in front of a character that is not in this set), preprocessor the
	// byte that starts a preprocessor line and new_line the bytes treated
	// as line breaks.
	whitespace   = []byte{' ', '\n', '\t', '\r'}
	identifier   = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"
	preprocessor = byte('#')
	new_line     = []byte{'\r', '\n'}
)

// Tokenize splits code into the syntax tokens consumed by the parser.
//
// If doStripSlashes is set, masking backslashes are stripped from code
// first. Comments are always removed before tokenizing, so token positions
// refer to the comment-free code.
//
// A token is emitted for every delimiter, for the text collected in front
// of a delimiter, for a keyword (matched case-insensitively) that is
// followed by a non-identifier character, and for each preprocessor line.
// Whitespace outside of strings is dropped and does not end a token by
// itself. Text that is still pending when the input ends is not emitted.
//
// Line starts at zero and counts "\r\n" as a single line break. Line and
// Column hold the position at which a token was emitted, not the position
// where its text started.
func Tokenize(code []byte, doStripSlashes bool) []Token {
	if doStripSlashes {
		code = stripSlashes(code)
	}

	code = removeComments(code)
	tokens := make([]Token, 0)
	token, mask, isstring, line, column := "", false, false, 0, 0

	// endsLine reports whether the line break byte at code[k] completes a
	// line break. The '\r' of a "\r\n" pair does not, so that the pair is
	// counted as one line instead of two.
	endsLine := func(k int) bool {
		return code[k] != '\r' || k+1 >= len(code) || code[k+1] != '\n'
	}

	for i := 0; i < len(code); i++ {
		c := code[i]
		column++

		if byteArrayContains(new_line, c) {
			if endsLine(i) {
				line++
			}

			column = 0
		}

		// string masks (backslash)
		if c == '\\' && !mask {
			token += "\\"
			mask = true
			continue
		}

		// string
		if c == '"' && !mask {
			token += "\""
			isstring = !isstring
			continue
		}

		// Bytes are appended as string(code[i:i+1]) rather than string(c):
		// converting a single byte value to a string encodes it as a code
		// point, which turns every byte >= 0x80 into two bytes and corrupts
		// UTF-8 input.
		switch {
		case isstring:
			token += string(code[i : i+1])
		case c == preprocessor:
			start := i
			tokens = append(tokens, preprocessorLine(code, &i, line, column))
			token = ""

			// preprocessorLine also consumes the line breaks that follow
			// the directive, so they never reach the check at the top of
			// the loop. Account for them here to keep positions correct.
			for k := start; k <= i && k < len(code); k++ {
				if byteArrayContains(new_line, code[k]) {
					if endsLine(k) {
						line++
					}

					column = 0
				}
			}
		case byteArrayContains(delimiter, c):
			if token != "" {
				tokens = append(tokens, Token{token, false, line, column})
			}

			tokens = append(tokens, Token{string(c), false, line, column})
			token = ""
		case stringArrayContains(strings.ToLower(token)) && !isIdentifierCharacter(c):
			tokens = append(tokens, Token{token, false, line, column})
			token = ""

			// c ended the keyword. Whitespace is dropped as usual, any
			// other character belongs to the next token and must be kept.
			if !byteArrayContains(whitespace, c) {
				token = string(code[i : i+1])
			}
		case !byteArrayContains(whitespace, c):
			token += string(code[i : i+1])
		}

		mask = false
	}

	return tokens
}

// stripSlashes returns a copy of code without masking backslashes.
// This is used for the "code" keyword for correct strings in resulting code.
//
// A backslash is dropped and the byte following it is kept as is, so \"
// becomes " and \\ becomes a single backslash. A lone backslash at the end
// of the input is dropped. The result contains only the kept bytes and is
// therefore shorter than code by one byte per stripped backslash.
func stripSlashes(code []byte) []byte {
	newcode := make([]byte, 0, len(code))
	mask := false

	for _, c := range code {
		if c == '\\' && !mask {
			// masking backslash: drop it and keep whatever comes next
			mask = true
			continue
		}

		newcode = append(newcode, c)
		mask = false
	}

	return newcode
}

// removeComments returns a copy of code without comments.
//
// Single line comments start with // and run to the end of the line. The
// line break itself is kept, so the following line is not joined onto the
// commented one. Multi line comments are enclosed in /* ... */ and are
// removed including any line breaks inside them; an unterminated multi line
// comment removes the rest of the input.
//
// Comment markers inside double quoted strings are left untouched. A
// backslash masks the character after it, so \" neither starts nor ends a
// string.
func removeComments(code []byte) []byte {
	newcode := make([]byte, len(code))
	j, mask, isstring := 0, false, false

	for i := 0; i < len(code); i++ {
		c := code[i]

		if c == '\\' && !mask {
			// Copy the backslash and move on right away, so that the mask
			// is still set when the next character is inspected.
			mask = true
			newcode[j] = c
			j++
			continue
		}

		// do not remove comments from strings
		if c == '"' && !mask {
			isstring = !isstring
		}

		mask = false

		// single/multi line comment
		if !isstring && c == '/' {
			switch nextChar(code, i) {
			case '/':
				// Stop on the last byte of the comment: the loop increment
				// then lands on the line break, which is copied as usual.
				i = skipSingleLineComment(code, i+1) - 1
				continue
			case '*':
				// Search for the closing */ behind the opening /*, so that
				// the '*' of the opener is not taken as part of the closer
				// (as in "/*/").
				i = skipMultiLineComment(code, i+2)
				continue
			}
		}

		newcode[j] = c
		j++
	}

	return newcode[:j]
}

// preprocessorLine reads a preprocessor command until the end of the line.
//
// Reading starts at code[*i] and stops in front of the first line break
// character or at the end of the input. All line break characters (\r and
// \n) directly following the command are consumed as well. On return *i is
// the index of the last consumed byte, because the caller's loop advances
// it by one afterwards.
//
// The returned token holds the command without line break, is marked as
// preprocessor token and carries lineNr and column as its position.
func preprocessorLine(code []byte, i *int, lineNr, column int) Token {
	start := *i

	for *i < len(code) && !byteArrayContains(new_line, code[*i]) {
		(*i)++
	}

	// Slicing keeps the raw bytes of the command; appending string(c) for
	// each byte would re-encode bytes >= 0x80 and corrupt UTF-8 text.
	line := string(code[start:*i])

	// read all new line characters (\r and \n); the bounds check matters
	// when the command or its line breaks are the last thing in the input
	for *i < len(code) && byteArrayContains(new_line, code[*i]) {
		(*i)++
	}

	(*i)-- // for will count up 1, so subtract it here

	return Token{Token: line, Preprocessor: true, Line: lineNr, Column: column}
}

// nextChar returns the byte that follows position i in code.
// If there is no byte left after i, the character '0' is returned.
func nextChar(code []byte, i int) byte {
	if next := i + 1; next < len(code) {
		return code[next]
	}

	return '0'
}

// skipSingleLineComment advances from position i to the end of the current
// line. It returns the index of the next '\n', or the position at which the
// scan ran past the end of code if no '\n' follows.
func skipSingleLineComment(code []byte, i int) int {
	for ; i < len(code); i++ {
		if code[i] == '\n' {
			break
		}
	}

	return i
}

// skipMultiLineComment searches code from position i for the sequence */
// that closes a multi line comment. It returns the index of the closing
// '/', which is one past the position where the search stopped; if no
// closing sequence is found that is one past the end of the scan.
func skipMultiLineComment(code []byte, i int) int {
	for ; i < len(code); i++ {
		if code[i] == '*' && nextChar(code, i) == '/' {
			break
		}
	}

	return i + 1
}

// byteArrayContains reports whether needle occurs in haystack.
func byteArrayContains(haystack []byte, needle byte) bool {
	for _, b := range haystack {
		if b == needle {
			return true
		}
	}

	return false
}

// stringArrayContains reports whether needle is one of the keywords.
// The comparison is exact; the keyword list is kept in lower case.
func stringArrayContains(needle string) bool {
	for _, keyword := range keywords {
		if needle == keyword {
			return true
		}
	}

	return false
}

// isIdentifierCharacter reports whether c is one of the characters allowed
// for identifiers.
func isIdentifierCharacter(c byte) bool {
	return strings.IndexByte(identifier, c) >= 0
}