From 281d3a270a73a5a8dd81e68d751cce5b9b6c2a30 Mon Sep 17 00:00:00 2001 From: brettlangdon Date: Mon, 14 Sep 2015 22:32:33 -0400 Subject: [PATCH] no longer using parser.tokenizer --- parser/helpers.go | 229 ---------------------- parser/tokenizer.go | 462 -------------------------------------------- 2 files changed, 691 deletions(-) delete mode 100644 parser/helpers.go delete mode 100644 parser/tokenizer.go diff --git a/parser/helpers.go b/parser/helpers.go deleted file mode 100644 index eeb6969..0000000 --- a/parser/helpers.go +++ /dev/null @@ -1,229 +0,0 @@ -package parser - -import "github.com/brettlangdon/gython/token" - -func IsLetter(r rune) bool { - return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') -} - -func IsDigit(r rune) bool { - return r >= '0' && r <= '9' -} - -func IsXDigit(r rune) bool { - return IsDigit(r) || (r >= 65 && r <= 70) || (r >= 97 && r <= 102) -} - -func IsAlphaNumeric(r rune) bool { - return IsLetter(r) || IsDigit(r) -} - -func IsIdentifierStart(r rune) bool { - return IsLetter(r) || r == '_' || r >= 128 -} - -func IsIdentifierChar(r rune) bool { - return IsIdentifierStart(r) || IsDigit(r) -} - -func IsQuote(r rune) bool { - return r == '"' || r == '\'' -} - -func GetTwoCharTokenID(curChar rune, nextChar rune) token.TokenID { - switch curChar { - case '=': - switch nextChar { - case '=': - return token.EQEQUAL - } - break - case '!': - switch nextChar { - case '=': - return token.NOTEQUAL - } - break - case '<': - switch nextChar { - case '>': - return token.NOTEQUAL - case '=': - return token.LESSEQUAL - case '<': - return token.LEFTSHIFT - } - break - case '>': - switch nextChar { - case '=': - return token.GREATEREQUAL - case '>': - return token.RIGHTSHIFT - } - break - case '+': - switch nextChar { - case '=': - return token.PLUSEQUAL - } - break - case '-': - switch nextChar { - case '=': - return token.MINEQUAL - case '>': - return token.RARROW - } - break - case '*': - switch nextChar { - case '*': - return token.DOUBLESTAR - case '=': - return token.STAREQUAL - } - break - case '/': - switch nextChar { - case '/': - return token.DOUBLESLASH - case '=': - return token.SLASHEQUAL - } - break - case '|': - switch nextChar { - case '=': - return token.VBAREQUAL - } - break - case '%': - switch nextChar { - case '=': - return token.PERCENTEQUAL - } - break - case '&': - switch nextChar { - case '=': - return token.AMPEREQUAL - } - break - case '^': - switch nextChar { - case '=': - return token.CIRCUMFLEXEQUAL - } - break - } - return token.OP -} - -func GetThreeCharTokenID(curChar rune, nextChar rune, thirdChar rune) token.TokenID { - switch curChar { - case '<': - switch nextChar { - case '<': - switch thirdChar { - case '=': - return token.LEFTSHIFTEQUAL - } - break - } - break - case '>': - switch nextChar { - case '>': - switch thirdChar { - case '=': - return token.RIGHTSHIFTEQUAL - } - break - } - break - case '*': - switch nextChar { - case '*': - switch thirdChar { - case '=': - return token.DOUBLESTAREQUAL - } - break - } - break - case '/': - switch nextChar { - case '/': - switch thirdChar { - case '=': - return token.DOUBLESLASHEQUAL - } - break - } - break - case '.': - switch nextChar { - case '.': - switch thirdChar { - case '.': - return token.ELLIPSIS - } - break - } - break - } - return token.OP -} - -func GetOneCharTokenID(curChar rune) token.TokenID { - switch curChar { - case '(': - return token.LPAR - case ')': - return token.RPAR - case '[': - return token.LSQB - case ']': - return token.RSQB - case ':': - return token.COLON - case ',': - return token.COMMA - case ';': - return token.SEMI - case '+': - return token.PLUS - case '-': - return token.MINUS - case '*': - return token.STAR - case '/': - return token.SLASH - case '|': - return token.VBAR - case '&': - return token.AMPER - case '<': - return token.LESS - case '>': - return token.GREATER - case '=': - return token.EQUAL - case '.': - return token.DOT - case '%': - return token.PERCENT - case '{': - return token.LBRACE - case '}': - return token.RBRACE - case '^': - return token.CIRCUMFLEX - case '~': - return token.TILDE - case '@': - return token.AT - } - return token.OP -} diff --git a/parser/tokenizer.go b/parser/tokenizer.go deleted file mode 100644 index ea99be4..0000000 --- a/parser/tokenizer.go +++ /dev/null @@ -1,462 +0,0 @@ -package parser - -import ( - "bufio" - "os" - - "github.com/brettlangdon/gython/token" -) - -var EOF rune = 0 -var MAXINDENT int = 100 - -type TokenizerState struct { - atBol bool - buffer *bufio.Reader - curColumn int - curLine int - curLiteral string - fp *os.File - indentationLevel int - indentationAltStack []int - indentationCurrent int - indentationPending int - indentationStack []int - offset int - tabsize int - tabsizeAlt int -} - -func newTokenizerState() *TokenizerState { - return &TokenizerState{ - atBol: true, - curColumn: 0, - curLine: 1, - curLiteral: "", - indentationAltStack: make([]int, MAXINDENT), - indentationCurrent: 0, - indentationLevel: 0, - indentationPending: 0, - indentationStack: make([]int, MAXINDENT), - offset: 0, - tabsize: 8, - } -} - -func TokenizerFromFileName(filename string) (*TokenizerState, error) { - state := newTokenizerState() - fp, err := os.Open(filename) - if err != nil { - return nil, err - } - state.fp = fp - - state.buffer = bufio.NewReader(state.fp) - return state, nil -} - -func (tokenizer *TokenizerState) readNext() rune { - next, _, err := tokenizer.buffer.ReadRune() - if err != nil { - next = EOF - } - tokenizer.offset += 1 - tokenizer.curColumn += 1 - if next != EOF { - tokenizer.curLiteral += string(next) - } - return next -} - -func (tokenizer *TokenizerState) unread() error { - err := tokenizer.buffer.UnreadRune() - tokenizer.offset -= 1 - tokenizer.curColumn -= 1 - if len(tokenizer.curLiteral) > 0 { - tokenizer.curLiteral = tokenizer.curLiteral[0 : len(tokenizer.curLiteral)-1] - } - return err -} - -func (tokenizer *TokenizerState) finalizeToken(tok *token.Token, tokId token.TokenID) *token.Token { - tok.ID = tokId - tok.LineEnd = tokenizer.curLine - tok.ColumnEnd = tokenizer.curColumn - tok.Literal = tokenizer.curLiteral - return tok -} - -func (tokenizer *TokenizerState) newToken() *token.Token { - tokenizer.curLiteral = "" - return &token.Token{ - ID: token.ERRORTOKEN, - LineStart: tokenizer.curLine, - ColumnStart: tokenizer.curColumn, - LineEnd: tokenizer.curLine, - ColumnEnd: tokenizer.curColumn, - Literal: tokenizer.curLiteral, - } -} - -func (tokenizer *TokenizerState) parseQuoted(curTok *token.Token, nextChar rune) *token.Token { - quote := nextChar - quoteSize := 1 - endQuoteSize := 0 - nextChar = tokenizer.readNext() - if nextChar == quote { - nextChar = tokenizer.readNext() - if nextChar == quote { - quoteSize = 3 - } else { - endQuoteSize = 1 - } - } - - if nextChar != quote { - tokenizer.unread() - } - - for { - if endQuoteSize == quoteSize { - break - } - nextChar = tokenizer.readNext() - if nextChar == EOF { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - if quoteSize == 1 && nextChar == '\n' { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - if nextChar == quote { - endQuoteSize += 1 - } else { - endQuoteSize = 0 - if nextChar == '\\' { - nextChar = tokenizer.readNext() - } - } - } - return tokenizer.finalizeToken(curTok, token.STRING) -} - -func (tokenizer *TokenizerState) parseNumber(curTok *token.Token, nextChar rune, fraction bool) *token.Token { - if fraction { - goto fraction - } - if nextChar == '0' { - nextChar = tokenizer.readNext() - if nextChar == '.' { - tokenizer.unread() - goto fraction - } - if nextChar == 'j' || nextChar == 'J' { - tokenizer.unread() - goto imaginary - } - if nextChar == 'x' || nextChar == 'X' { - // Hex - nextChar = tokenizer.readNext() - if !IsXDigit(nextChar) { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - for IsXDigit(nextChar) { - nextChar = tokenizer.readNext() - } - tokenizer.unread() - } else if nextChar == 'o' || nextChar == 'O' { - // Octal - nextChar = tokenizer.readNext() - if nextChar < '0' || nextChar >= '8' { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - for nextChar >= '0' && nextChar < '8' { - nextChar = tokenizer.readNext() - } - tokenizer.unread() - } else if nextChar == 'b' || nextChar == 'B' { - // Binary - nextChar = tokenizer.readNext() - if nextChar != '0' && nextChar != '1' { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - for nextChar == '0' || nextChar == '1' { - nextChar = tokenizer.readNext() - } - tokenizer.unread() - } else { - nonzero := false - for nextChar == '0' { - nextChar = tokenizer.readNext() - } - for IsDigit(nextChar) { - nonzero = true - nextChar = tokenizer.readNext() - } - tokenizer.unread() - - if nextChar == '.' { - goto fraction - } else if nextChar == 'e' || nextChar == 'E' { - goto exponent - } else if nextChar == 'j' || nextChar == 'J' { - goto imaginary - } else if nonzero { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - goto end - } - } else { - // Decimal - for IsDigit(nextChar) { - nextChar = tokenizer.readNext() - } - tokenizer.unread() - goto fraction - } -fraction: - if nextChar == '.' { - nextChar = tokenizer.readNext() - nextChar = tokenizer.readNext() - for IsDigit(nextChar) { - nextChar = tokenizer.readNext() - } - tokenizer.unread() - } -exponent: - if nextChar == 'e' || nextChar == 'E' { - nextChar = tokenizer.readNext() - nextChar = tokenizer.readNext() - if nextChar == '+' || nextChar == '-' { - nextChar = tokenizer.readNext() - if !IsDigit(nextChar) { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - } else if !IsDigit(nextChar) { - tokenizer.unread() - tokenizer.unread() - return tokenizer.finalizeToken(curTok, token.NUMBER) - } - for IsDigit(nextChar) { - nextChar = tokenizer.readNext() - } - tokenizer.unread() - } -imaginary: - if nextChar == 'j' || nextChar == 'J' { - nextChar = tokenizer.readNext() - nextChar = tokenizer.readNext() - } -end: - return tokenizer.finalizeToken(curTok, token.NUMBER) -} - -func (tokenizer *TokenizerState) Next() *token.Token { -next_line: - curTok := tokenizer.newToken() - nextChar := EOF - blankline := false - - if tokenizer.atBol { - // Get indentation level - col := 0 - altcol := 0 - tokenizer.atBol = false - for { - nextChar = tokenizer.readNext() - if nextChar == ' ' { - col++ - altcol++ - } else if nextChar == '\t' { - col = (col/tokenizer.tabsize + 1) * tokenizer.tabsize - altcol = (altcol/tokenizer.tabsizeAlt + 1) * tokenizer.tabsizeAlt - } else { - break - } - } - tokenizer.unread() - - if nextChar == '#' || nextChar == '\n' { - // Lines with only newline or comment, shouldn't affect indentation - if col == 0 && nextChar == '\n' { - blankline = false - } else { - blankline = true - } - } - if !blankline && tokenizer.indentationLevel == 0 { - if col == tokenizer.indentationStack[tokenizer.indentationCurrent] { - if altcol != tokenizer.indentationAltStack[tokenizer.indentationCurrent] { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - } else if col > tokenizer.indentationStack[tokenizer.indentationCurrent] { - if tokenizer.indentationCurrent+1 >= MAXINDENT { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - if altcol <= tokenizer.indentationAltStack[tokenizer.indentationCurrent] { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - tokenizer.indentationPending++ - tokenizer.indentationCurrent++ - tokenizer.indentationStack[tokenizer.indentationCurrent] = col - tokenizer.indentationAltStack[tokenizer.indentationCurrent] = altcol - - } else { - for tokenizer.indentationCurrent > 0 && col < tokenizer.indentationStack[tokenizer.indentationCurrent] { - tokenizer.indentationPending-- - tokenizer.indentationCurrent-- - } - if col != tokenizer.indentationStack[tokenizer.indentationCurrent] { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - if altcol != tokenizer.indentationAltStack[tokenizer.indentationCurrent] { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - } - } - } - } - - if tokenizer.indentationPending != 0 { - if tokenizer.indentationPending < 0 { - tokenizer.indentationPending++ - return tokenizer.finalizeToken(curTok, token.DEDENT) - } else { - tokenizer.indentationPending-- - return tokenizer.finalizeToken(curTok, token.INDENT) - } - } - -again: - // Skip spaces - for { - nextChar = tokenizer.readNext() - if !(nextChar == ' ' || nextChar == '\t') { - break - } - } - curTok.LineStart = tokenizer.curLine - curTok.ColumnStart = tokenizer.curColumn - 1 - tokenizer.curLiteral = string(nextChar) - - // Skip comments - if nextChar == '#' { - for { - nextChar = tokenizer.readNext() - if nextChar == EOF || nextChar == '\n' { - break - } - } - } - - // Check for EOF - if nextChar == EOF { - tokenizer.curLiteral = "" - return tokenizer.finalizeToken(curTok, token.ENDMARKER) - } - - if IsIdentifierStart(nextChar) { - saw_b, saw_r, saw_u := false, false, false - for { - if !(saw_b || saw_u) && (nextChar == 'b' || nextChar == 'B') { - saw_b = true - } else if !(saw_b || saw_u || saw_r) && (nextChar == 'u' || nextChar == 'U') { - saw_u = true - } else if !(saw_r || saw_u) && (nextChar == 'r' || nextChar == 'R') { - saw_r = true - } else { - break - } - nextChar = tokenizer.readNext() - if IsQuote(nextChar) { - goto letter_quote - } - } - for IsIdentifierChar(nextChar) { - nextChar = tokenizer.readNext() - } - tokenizer.unread() - return tokenizer.finalizeToken(curTok, token.NAME) - } - - // Newline - if nextChar == '\n' { - tokenizer.atBol = true - if blankline || tokenizer.indentationLevel > 0 { - goto next_line - } - tokenizer.curLine += 1 - tokenizer.curColumn = 0 - return tokenizer.finalizeToken(curTok, token.NEWLINE) - } - - // Dot or number starting with dot - if nextChar == '.' { - nextChar = tokenizer.readNext() - if IsDigit(nextChar) { - return tokenizer.parseNumber(curTok, nextChar, true) - } else if nextChar == '.' { - nextChar = tokenizer.readNext() - if nextChar == '.' { - return tokenizer.finalizeToken(curTok, token.ELLIPSIS) - } else { - tokenizer.unread() - } - tokenizer.unread() - } else { - tokenizer.unread() - } - - return tokenizer.finalizeToken(curTok, token.DOT) - } - - // Number - if IsDigit(nextChar) { - return tokenizer.parseNumber(curTok, nextChar, false) - } - -letter_quote: - // String - if IsQuote(nextChar) { - return tokenizer.parseQuoted(curTok, nextChar) - } - - // Line continuation - if nextChar == '\\' { - nextChar = tokenizer.readNext() - if nextChar != '\n' { - return tokenizer.finalizeToken(curTok, token.ERRORTOKEN) - goto again - } - } - - { - // Check for two character tokens - curChar := nextChar - nextChar = tokenizer.readNext() - tokId := GetTwoCharTokenID(curChar, nextChar) - if tokId != token.OP { - thirdChar := tokenizer.readNext() - nextTokId := GetThreeCharTokenID(curChar, nextChar, thirdChar) - if nextTokId != token.OP { - tokId = nextTokId - } else { - tokenizer.unread() - } - return tokenizer.finalizeToken(curTok, tokId) - } - tokenizer.unread() - nextChar = curChar - tokenizer.curLiteral = string(curChar) - } - - switch nextChar { - case '(', '[', '{': - tokenizer.indentationLevel++ - break - case ')', ']', '}': - tokenizer.indentationLevel-- - break - } - - tokId := GetOneCharTokenID(nextChar) - return tokenizer.finalizeToken(curTok, tokId) -}