From 7a8c9699ee8d5f9b5585b142a99873951869e026 Mon Sep 17 00:00:00 2001
From: brettlangdon
Date: Sat, 22 Aug 2015 22:18:27 -0400
Subject: [PATCH] basic port of tokenizer from CPython

---
 main.go             |  30 +++
 parser/helpers.go   | 199 +++++++++++++++++++++
 parser/tokenizer.go | 426 ++++++++++++++++++++++++++++++++++++++++++++
 token/id.go         |  62 ++++++
 token/names.go      |  60 ++++++
 token/token.go      |  27 +++
 6 files changed, 804 insertions(+)
 create mode 100644 main.go
 create mode 100644 parser/helpers.go
 create mode 100644 parser/tokenizer.go
 create mode 100644 token/id.go
 create mode 100644 token/names.go
 create mode 100644 token/token.go

diff --git a/main.go b/main.go
new file mode 100644
index 0000000..e6a075c
--- /dev/null
+++ b/main.go
@@ -0,0 +1,30 @@
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/brettlangdon/gython/parser"
+	"github.com/brettlangdon/gython/token"
+)
+
+// main prints every token of the Python source file named by the first
+// command-line argument, stopping at ENDMARKER or the first ERRORTOKEN.
+func main() {
+	if len(os.Args) < 2 {
+		fmt.Fprintf(os.Stderr, "usage: %s FILE\n", os.Args[0])
+		os.Exit(1)
+	}
+	tokenizer, err := parser.TokenizerFromFileName(os.Args[1])
+	if err != nil {
+		panic(err)
+	}
+	defer tokenizer.Close()
+	for {
+		tok := tokenizer.Next()
+		if tok.ID == token.ENDMARKER || tok.ID == token.ERRORTOKEN {
+			break
+		}
+		fmt.Printf("<%s> %s\n", tok, tok.Repr())
+	}
+}
diff --git a/parser/helpers.go b/parser/helpers.go
new file mode 100644
index 0000000..eeb6969
--- /dev/null
+++ b/parser/helpers.go
@@ -0,0 +1,199 @@
+package parser
+
+import "github.com/brettlangdon/gython/token"
+
+// IsLetter reports whether r is an ASCII letter.
+func IsLetter(r rune) bool {
+	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
+}
+
+// IsDigit reports whether r is an ASCII decimal digit.
+func IsDigit(r rune) bool {
+	return r >= '0' && r <= '9'
+}
+
+// IsXDigit reports whether r is an ASCII hexadecimal digit.
+func IsXDigit(r rune) bool {
+	return IsDigit(r) || (r >= 'A' && r <= 'F') || (r >= 'a' && r <= 'f')
+}
+
+// IsAlphaNumeric reports whether r is an ASCII letter or decimal digit.
+func IsAlphaNumeric(r rune) bool {
+	return IsLetter(r) || IsDigit(r)
+}
+
+// IsIdentifierStart reports whether r may begin an identifier: an ASCII
+// letter, an underscore, or any rune outside the ASCII range.
+func IsIdentifierStart(r rune) bool {
+	return IsLetter(r) || r == '_' || r >= 128
+}
+
+// IsIdentifierChar reports whether r may appear after the first rune of an
+// identifier.
+func IsIdentifierChar(r rune) bool {
+	return IsIdentifierStart(r) || IsDigit(r)
+}
+
+// IsQuote reports whether r is a single or double quote.
+func IsQuote(r rune) bool {
+	return r == '"' || r == '\''
+}
+
+// GetTwoCharTokenID returns the token ID of the two-character operator formed
+// by curChar followed by nextChar, or token.OP when the pair is not one.
+func GetTwoCharTokenID(curChar rune, nextChar rune) token.TokenID {
+	switch curChar {
+	case '=':
+		if nextChar == '=' {
+			return token.EQEQUAL
+		}
+	case '!':
+		if nextChar == '=' {
+			return token.NOTEQUAL
+		}
+	case '<':
+		switch nextChar {
+		case '>':
+			return token.NOTEQUAL
+		case '=':
+			return token.LESSEQUAL
+		case '<':
+			return token.LEFTSHIFT
+		}
+	case '>':
+		switch nextChar {
+		case '=':
+			return token.GREATEREQUAL
+		case '>':
+			return token.RIGHTSHIFT
+		}
+	case '+':
+		if nextChar == '=' {
+			return token.PLUSEQUAL
+		}
+	case '-':
+		switch nextChar {
+		case '=':
+			return token.MINEQUAL
+		case '>':
+			return token.RARROW
+		}
+	case '*':
+		switch nextChar {
+		case '*':
+			return token.DOUBLESTAR
+		case '=':
+			return token.STAREQUAL
+		}
+	case '/':
+		switch nextChar {
+		case '/':
+			return token.DOUBLESLASH
+		case '=':
+			return token.SLASHEQUAL
+		}
+	case '|':
+		if nextChar == '=' {
+			return token.VBAREQUAL
+		}
+	case '%':
+		if nextChar == '=' {
+			return token.PERCENTEQUAL
+		}
+	case '&':
+		if nextChar == '=' {
+			return token.AMPEREQUAL
+		}
+	case '^':
+		if nextChar == '=' {
+			return token.CIRCUMFLEXEQUAL
+		}
+	}
+	return token.OP
+}
+
+// GetThreeCharTokenID returns the token ID of the three-character operator
+// spelled by the arguments, or token.OP when they do not form one.
+func GetThreeCharTokenID(curChar rune, nextChar rune, thirdChar rune) token.TokenID {
+	// Every three-character operator starts with a doubled character.
+	if curChar != nextChar {
+		return token.OP
+	}
+	switch curChar {
+	case '<':
+		if thirdChar == '=' {
+			return token.LEFTSHIFTEQUAL
+		}
+	case '>':
+		if thirdChar == '=' {
+			return token.RIGHTSHIFTEQUAL
+		}
+	case '*':
+		if thirdChar == '=' {
+			return token.DOUBLESTAREQUAL
+		}
+	case '/':
+		if thirdChar == '=' {
+			return token.DOUBLESLASHEQUAL
+		}
+	case '.':
+		if thirdChar == '.' {
+			return token.ELLIPSIS
+		}
+	}
+	return token.OP
+}
+
+// GetOneCharTokenID returns the token ID of the single-character operator or
+// delimiter curChar, or token.OP when it is not a known one.
+func GetOneCharTokenID(curChar rune) token.TokenID {
+	switch curChar {
+	case '(':
+		return token.LPAR
+	case ')':
+		return token.RPAR
+	case '[':
+		return token.LSQB
+	case ']':
+		return token.RSQB
+	case ':':
+		return token.COLON
+	case ',':
+		return token.COMMA
+	case ';':
+		return token.SEMI
+	case '+':
+		return token.PLUS
+	case '-':
+		return token.MINUS
+	case '*':
+		return token.STAR
+	case '/':
+		return token.SLASH
+	case '|':
+		return token.VBAR
+	case '&':
+		return token.AMPER
+	case '<':
+		return token.LESS
+	case '>':
+		return token.GREATER
+	case '=':
+		return token.EQUAL
+	case '.':
+		return token.DOT
+	case '%':
+		return token.PERCENT
+	case '{':
+		return token.LBRACE
+	case '}':
+		return token.RBRACE
+	case '^':
+		return token.CIRCUMFLEX
+	case '~':
+		return token.TILDE
+	case '@':
+		return token.AT
+	}
+	return token.OP
+}
diff --git a/parser/tokenizer.go b/parser/tokenizer.go
new file mode 100644
index 0000000..5d3e23c
--- /dev/null
+++ b/parser/tokenizer.go
@@ -0,0 +1,426 @@
+package parser
+
+import (
+	"bufio"
+	"os"
+
+	"github.com/brettlangdon/gython/token"
+)
+
+// EOF is the sentinel rune readNext returns once the input is exhausted.
+var EOF rune = 0
+
+// TokenizerState holds the input source and scanning position of a tokenizer.
+type TokenizerState struct {
+	buffer       *bufio.Reader
+	curColumn    int
+	curIndent    int
+	curLevel     int
+	curLine      int
+	curLiteral   string
+	fp           *os.File
+	lastWidth    int // bytes the latest readNext appended to curLiteral
+	nestingLevel int
+	offset       int
+	tabsize      int
+}
+
+func newTokenizerState() *TokenizerState {
+	return &TokenizerState{
+		curColumn:    1,
+		curIndent:    0,
+		curLevel:     0,
+		curLine:      1,
+		curLiteral:   "",
+		nestingLevel: 0,
+		offset:       0,
+		tabsize:      8,
+	}
+}
+
+// TokenizerFromFileName opens filename and returns a tokenizer reading from
+// it. Call Close on the result once it is no longer needed.
+func TokenizerFromFileName(filename string) (*TokenizerState, error) {
+	state := newTokenizerState()
+	fp, err := os.Open(filename)
+	if err != nil {
+		return nil, err
+	}
+	state.fp = fp
+
+	state.buffer = bufio.NewReader(state.fp)
+	return state, nil
+}
+
+// Close releases the file opened by TokenizerFromFileName, if any.
+func (tokenizer *TokenizerState) Close() error {
+	if tokenizer.fp == nil {
+		return nil
+	}
+	return tokenizer.fp.Close()
+}
+
+// readNext consumes one rune, appends it to the current literal and returns
+// it. Any read error, including end of input, is reported as EOF.
+func (tokenizer *TokenizerState) readNext() rune {
+	next, _, err := tokenizer.buffer.ReadRune()
+	if err != nil {
+		next = EOF
+	}
+	tokenizer.offset += 1
+	tokenizer.lastWidth = 0
+	if next != EOF {
+		encoded := string(next)
+		tokenizer.curLiteral += encoded
+		tokenizer.lastWidth = len(encoded)
+	}
+	return next
+}
+
+// unread pushes the rune returned by the latest readNext back onto the input
+// and drops exactly the bytes that read appended to the current literal
+// (none for EOF). Only one rune can be pushed back between reads.
+func (tokenizer *TokenizerState) unread() error {
+	err := tokenizer.buffer.UnreadRune()
+	tokenizer.offset -= 1
+	if keep := len(tokenizer.curLiteral) - tokenizer.lastWidth; keep >= 0 {
+		tokenizer.curLiteral = tokenizer.curLiteral[:keep]
+	}
+	tokenizer.lastWidth = 0
+	return err
+}
+
+// finalizeToken stamps tok with its ID, end offset and literal text.
+func (tokenizer *TokenizerState) finalizeToken(tok *token.Token, tokId token.TokenID) *token.Token {
+	tok.ID = tokId
+	tok.End = tokenizer.offset
+	tok.Literal = tokenizer.curLiteral
+	return tok
+}
+
+// newToken clears the current literal and returns a token that starts at the
+// current offset and is an ERRORTOKEN until finalized.
+func (tokenizer *TokenizerState) newToken() *token.Token {
+	tokenizer.curLiteral = ""
+	return &token.Token{
+		ID:      token.ERRORTOKEN,
+		Start:   tokenizer.offset,
+		End:     tokenizer.offset - 1,
+		Literal: tokenizer.curLiteral,
+	}
+}
+
+// parseQuoted scans a string literal whose opening quote, nextChar, has just
+// been read. Single- and triple-quoted forms are handled; an unterminated
+// literal yields an ERRORTOKEN.
+func (tokenizer *TokenizerState) parseQuoted(curTok *token.Token, nextChar rune) *token.Token {
+	quote := nextChar
+	quoteSize := 1
+	endQuoteSize := 0
+	nextChar = tokenizer.readNext()
+	if nextChar == quote {
+		nextChar = tokenizer.readNext()
+		if nextChar == quote {
+			quoteSize = 3
+		} else {
+			endQuoteSize = 1
+		}
+	}
+
+	if nextChar != quote {
+		tokenizer.unread()
+	}
+
+	for {
+		if endQuoteSize == quoteSize {
+			break
+		}
+		nextChar = tokenizer.readNext()
+		if nextChar == EOF {
+			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
+		}
+		if quoteSize == 1 && nextChar == '\n' {
+			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
+		}
+		if nextChar == quote {
+			endQuoteSize += 1
+		} else {
+			endQuoteSize = 0
+			if nextChar == '\\' {
+				nextChar = tokenizer.readNext()
+			}
+		}
+	}
+	return tokenizer.finalizeToken(curTok, token.STRING)
+}
+
+// parseNumber scans the remainder of a numeric literal into curTok. nextChar
+// is the rune read last: the literal's first digit or, when fraction is true,
+// the first digit after a leading '.' that Next has already consumed.
+func (tokenizer *TokenizerState) parseNumber(curTok *token.Token, nextChar rune, fraction bool) *token.Token {
+	// On arrival at any label below, nextChar is the lookahead rune and has
+	// been pushed back with unread, so each section re-reads it first.
+	if fraction {
+		// The '.' and first fraction digit are already in the literal; take
+		// the remaining digits and continue with the exponent.
+		for IsDigit(nextChar) {
+			nextChar = tokenizer.readNext()
+		}
+		tokenizer.unread()
+		goto exponent
+	}
+	if nextChar == '0' {
+		nextChar = tokenizer.readNext()
+		if nextChar == '.' {
+			tokenizer.unread()
+			goto fraction
+		}
+		if nextChar == 'j' || nextChar == 'J' {
+			tokenizer.unread()
+			goto imaginary
+		}
+		if nextChar == 'x' || nextChar == 'X' {
+			// Hex
+			nextChar = tokenizer.readNext()
+			if !IsXDigit(nextChar) {
+				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
+			}
+			for IsXDigit(nextChar) {
+				nextChar = tokenizer.readNext()
+			}
+			tokenizer.unread()
+		} else if nextChar == 'o' || nextChar == 'O' {
+			// Octal
+			nextChar = tokenizer.readNext()
+			if nextChar < '0' || nextChar >= '8' {
+				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
+			}
+			for nextChar >= '0' && nextChar < '8' {
+				nextChar = tokenizer.readNext()
+			}
+			tokenizer.unread()
+		} else if nextChar == 'b' || nextChar == 'B' {
+			// Binary
+			nextChar = tokenizer.readNext()
+			if nextChar != '0' && nextChar != '1' {
+				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
+			}
+			for nextChar == '0' || nextChar == '1' {
+				nextChar = tokenizer.readNext()
+			}
+			tokenizer.unread()
+		} else {
+			nonzero := false
+			for nextChar == '0' {
+				nextChar = tokenizer.readNext()
+			}
+			for IsDigit(nextChar) {
+				nonzero = true
+				nextChar = tokenizer.readNext()
+			}
+			tokenizer.unread()
+
+			if nextChar == '.' {
+				goto fraction
+			} else if nextChar == 'e' || nextChar == 'E' {
+				goto exponent
+			} else if nextChar == 'j' || nextChar == 'J' {
+				goto imaginary
+			} else if nonzero {
+				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
+			}
+			goto end
+		}
+		// Hex, octal and binary literals take no fraction, exponent or
+		// imaginary suffix.
+		goto end
+	} else {
+		// Decimal
+		for IsDigit(nextChar) {
+			nextChar = tokenizer.readNext()
+		}
+		tokenizer.unread()
+		goto fraction
+	}
+fraction:
+	if nextChar == '.' {
+		tokenizer.readNext() // re-read the '.' pushed back above
+		nextChar = tokenizer.readNext()
+		for IsDigit(nextChar) {
+			nextChar = tokenizer.readNext()
+		}
+		tokenizer.unread()
+	}
+exponent:
+	if nextChar == 'e' || nextChar == 'E' {
+		// Only one rune can be pushed back, so look past the still-buffered
+		// 'e' before consuming it: unless a sign or digit follows, the 'e'
+		// starts the next token.
+		ahead, _ := tokenizer.buffer.Peek(2)
+		if len(ahead) < 2 || !(ahead[1] == '+' || ahead[1] == '-' || IsDigit(rune(ahead[1]))) {
+			return tokenizer.finalizeToken(curTok, token.NUMBER)
+		}
+		tokenizer.readNext() // re-read the 'e'
+		nextChar = tokenizer.readNext()
+		if nextChar == '+' || nextChar == '-' {
+			nextChar = tokenizer.readNext()
+			if !IsDigit(nextChar) {
+				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
+			}
+		}
+		for IsDigit(nextChar) {
+			nextChar = tokenizer.readNext()
+		}
+		tokenizer.unread()
+	}
+imaginary:
+	if nextChar == 'j' || nextChar == 'J' {
+		// Re-read the 'j'; the rune after it is not part of the literal.
+		tokenizer.readNext()
+	}
+end:
+	return tokenizer.finalizeToken(curTok, token.NUMBER)
+}
+
+// Next scans and returns the next token of the input. ENDMARKER is returned
+// at end of input and ERRORTOKEN for input that cannot be tokenized.
+func (tokenizer *TokenizerState) Next() *token.Token {
+	curTok := tokenizer.newToken()
+	col := 0
+	nextChar := EOF
+	// Get indentation level
+	for {
+		nextChar = tokenizer.readNext()
+		if nextChar == ' ' {
+			col += 1
+		} else if nextChar == '\t' {
+			col = (col/tokenizer.tabsize + 1) * tokenizer.tabsize
+		} else {
+			break
+		}
+	}
+	tokenizer.unread()
+
+again:
+	// Skip spaces
+	for {
+		nextChar = tokenizer.readNext()
+		if !(nextChar == ' ' || nextChar == '\t') {
+			break
+		}
+	}
+	curTok.Start = tokenizer.offset - 1
+	tokenizer.curLiteral = string(nextChar)
+
+	// Skip comments
+	if nextChar == '#' {
+		for {
+			nextChar = tokenizer.readNext()
+			if nextChar == EOF || nextChar == '\n' {
+				break
+			}
+		}
+	}
+
+	// Check for EOF
+	if nextChar == EOF {
+		return tokenizer.finalizeToken(curTok, token.ENDMARKER)
+	}
+
+	if IsIdentifierStart(nextChar) {
+		sawB, sawR, sawU := false, false, false
+		for {
+			if !(sawB || sawU) && (nextChar == 'b' || nextChar == 'B') {
+				sawB = true
+			} else if !(sawB || sawU || sawR) && (nextChar == 'u' || nextChar == 'U') {
+				sawU = true
+			} else if !(sawR || sawU) && (nextChar == 'r' || nextChar == 'R') {
+				sawR = true
+			} else {
+				break
+			}
+			nextChar = tokenizer.readNext()
+			if IsQuote(nextChar) {
+				goto letter_quote
+			}
+		}
+		for IsIdentifierChar(nextChar) {
+			nextChar = tokenizer.readNext()
+		}
+		tokenizer.unread()
+		return tokenizer.finalizeToken(curTok, token.NAME)
+	}
+
+	// Newline
+	if nextChar == '\n' {
+		return tokenizer.finalizeToken(curTok, token.NEWLINE)
+	}
+
+	// Dot or number starting with dot
+	if nextChar == '.' {
+		// Only one rune can be pushed back, so make sure both remaining dots
+		// of an ellipsis are present before consuming either of them.
+		if ahead, _ := tokenizer.buffer.Peek(2); len(ahead) == 2 && ahead[0] == '.' && ahead[1] == '.' {
+			tokenizer.readNext()
+			tokenizer.readNext()
+			return tokenizer.finalizeToken(curTok, token.ELLIPSIS)
+		}
+		nextChar = tokenizer.readNext()
+		if IsDigit(nextChar) {
+			return tokenizer.parseNumber(curTok, nextChar, true)
+		}
+		tokenizer.unread()
+		return tokenizer.finalizeToken(curTok, token.DOT)
+	}
+
+	// Number
+	if IsDigit(nextChar) {
+		return tokenizer.parseNumber(curTok, nextChar, false)
+	}
+
+letter_quote:
+	// String
+	if IsQuote(nextChar) {
+		return tokenizer.parseQuoted(curTok, nextChar)
+	}
+
+	// Line continuation
+	if nextChar == '\\' {
+		nextChar = tokenizer.readNext()
+		if nextChar != '\n' {
+			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
+		}
+		// A backslash-newline joins two lines; resume scanning after it.
+		goto again
+	}
+
+	{
+		// Check for two character tokens
+		curChar := nextChar
+		nextChar = tokenizer.readNext()
+		tokId := GetTwoCharTokenID(curChar, nextChar)
+		if tokId != token.OP {
+			thirdChar := tokenizer.readNext()
+			nextTokId := GetThreeCharTokenID(curChar, nextChar, thirdChar)
+			if nextTokId != token.OP {
+				tokId = nextTokId
+			} else {
+				tokenizer.unread()
+			}
+			return tokenizer.finalizeToken(curTok, tokId)
+		}
+		tokenizer.unread()
+		nextChar = curChar
+		tokenizer.curLiteral = string(curChar)
+	}
+
+	switch nextChar {
+	case '(', '[', '{':
+		tokenizer.nestingLevel++
+	case ')', ']', '}':
+		tokenizer.nestingLevel--
+	}
+
+	tokId := GetOneCharTokenID(nextChar)
+	return tokenizer.finalizeToken(curTok, tokId)
+}
diff --git a/token/id.go b/token/id.go
new file mode 100644
index 0000000..adf3bb7
--- /dev/null
+++ b/token/id.go
@@ -0,0 +1,62 @@
+package token
+
+// TokenID identifies the kind of a token.
+type TokenID int
+
+const (
+	ENDMARKER TokenID = iota
+	NAME
+	NUMBER
+	STRING
+	NEWLINE
+	INDENT
+	DEDENT
+	LPAR
+	RPAR
+	LSQB
+	RSQB
+	COLON
+	COMMA
+	SEMI
+	PLUS
+	MINUS
+	STAR
+	SLASH
+	VBAR
+	AMPER
+	LESS
+	GREATER
+	EQUAL
+	DOT
+	PERCENT
+	LBRACE
+	RBRACE
+	EQEQUAL
+	NOTEQUAL
+	LESSEQUAL
+	GREATEREQUAL
+	TILDE
+	CIRCUMFLEX
+	LEFTSHIFT
+	RIGHTSHIFT
+	DOUBLESTAR
+	PLUSEQUAL
+	MINEQUAL
+	STAREQUAL
+	SLASHEQUAL
+	PERCENTEQUAL
+	AMPEREQUAL
+	VBAREQUAL
+	CIRCUMFLEXEQUAL
+	LEFTSHIFTEQUAL
+	RIGHTSHIFTEQUAL
+	DOUBLESTAREQUAL
+	DOUBLESLASH
+	DOUBLESLASHEQUAL
+	AT
+	RARROW
+	ELLIPSIS
+	OP
+	ERRORTOKEN
+	N_TOKENS
+)
diff --git a/token/names.go b/token/names.go
new file mode 100644
index 0000000..ed99731
--- /dev/null
+++ b/token/names.go
@@ -0,0 +1,60 @@
+package token
+
+// TokenNames maps each TokenID to its display name.
+var TokenNames = map[TokenID]string{
+	ENDMARKER:        "ENDMARKER",
+	NAME:             "NAME",
+	NUMBER:           "NUMBER",
+	STRING:           "STRING",
+	NEWLINE:          "NEWLINE",
+	INDENT:           "INDENT",
+	DEDENT:           "DEDENT",
+	LPAR:             "LPAR",
+	RPAR:             "RPAR",
+	LSQB:             "LSQB",
+	RSQB:             "RSQB",
+	COLON:            "COLON",
+	COMMA:            "COMMA",
+	SEMI:             "SEMI",
+	PLUS:             "PLUS",
+	MINUS:            "MINUS",
+	STAR:             "STAR",
+	SLASH:            "SLASH",
+	VBAR:             "VBAR",
+	AMPER:            "AMPER",
+	LESS:             "LESS",
+	GREATER:          "GREATER",
+	EQUAL:            "EQUAL",
+	DOT:              "DOT",
+	PERCENT:          "PERCENT",
+	LBRACE:           "LBRACE",
+	RBRACE:           "RBRACE",
+	EQEQUAL:          "EQEQUAL",
+	NOTEQUAL:         "NOTEQUAL",
+	LESSEQUAL:        "LESSEQUAL",
+	GREATEREQUAL:     "GREATEREQUAL",
+	TILDE:            "TILDE",
+	CIRCUMFLEX:       "CIRCUMFLEX",
+	LEFTSHIFT:        "LEFTSHIFT",
+	RIGHTSHIFT:       "RIGHTSHIFT",
+	DOUBLESTAR:       "DOUBLESTAR",
+	PLUSEQUAL:        "PLUSEQUAL",
+	MINEQUAL:         "MINEQUAL",
+	STAREQUAL:        "STAREQUAL",
+	SLASHEQUAL:       "SLASHEQUAL",
+	PERCENTEQUAL:     "PERCENTEQUAL",
+	AMPEREQUAL:       "AMPEREQUAL",
+	VBAREQUAL:        "VBAREQUAL",
+	CIRCUMFLEXEQUAL:  "CIRCUMFLEXEQUAL",
+	LEFTSHIFTEQUAL:   "LEFTSHIFTEQUAL",
+	RIGHTSHIFTEQUAL:  "RIGHTSHIFTEQUAL",
+	DOUBLESTAREQUAL:  "DOUBLESTAREQUAL",
+	DOUBLESLASH:      "DOUBLESLASH",
+	DOUBLESLASHEQUAL: "DOUBLESLASHEQUAL",
+	AT:               "AT",
+	RARROW:           "RARROW",
+	ELLIPSIS:         "ELLIPSIS",
+	OP:               "OP",
+	ERRORTOKEN:       "<ERRORTOKEN>",
+	N_TOKENS:         "<N_TOKENS>",
+}
diff --git a/token/token.go b/token/token.go
new file mode 100644
index 0000000..bf1e051
--- /dev/null
+++ b/token/token.go
@@ -0,0 +1,27 @@
+package token
+
+import "fmt"
+
+// Token is a single lexical token and the span of input it covers.
+type Token struct {
+	End     int
+	ID      TokenID
+	Literal string
+	Start   int
+}
+
+// String returns the name of the token's ID as found in TokenNames.
+func (token *Token) String() string {
+	return TokenNames[token.ID]
+}
+
+// Repr returns a Go-syntax style dump of the token's fields.
+func (token *Token) Repr() string {
+	return fmt.Sprintf(
+		"Token{ID: %#v, Literal: %#v, Start: %#v, End: %#v}",
+		token.ID,
+		token.Literal,
+		token.Start,
+		token.End,
+	)
+}