
basic port of tokenizer from CPython

Brett Langdon · 10 years ago · branch: master · commit 7a8c9699ee
6 changed files with 779 additions and 0 deletions:

  1. main.go              +23   -0
  2. parser/helpers.go    +229  -0
  3. parser/tokenizer.go  +383  -0
  4. token/id.go          +61   -0
  5. token/names.go       +59   -0
  6. token/token.go       +24   -0

main.go (+23, -0)

@@ -0,0 +1,23 @@
package main

import (
	"fmt"
	"os"

	"github.com/brettlangdon/gython/parser"
	"github.com/brettlangdon/gython/token"
)

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: gython FILE")
		os.Exit(1)
	}
	tokenizer, err := parser.TokenizerFromFileName(os.Args[1])
	if err != nil {
		panic(err)
	}
	// Emit tokens until the tokenizer reaches end of input or errors
	for {
		tok := tokenizer.Next()
		if tok.ID == token.ENDMARKER || tok.ID == token.ERRORTOKEN {
			break
		}
		fmt.Printf("<%s> %s\n", tok, tok.Repr())
	}
}
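
Not part of the commit, but a handy way to try the tokenizer without supplying a file on the command line; a minimal sketch in which the temp file and the sample input are illustrative only:

package main

import (
	"fmt"
	"os"

	"github.com/brettlangdon/gython/parser"
	"github.com/brettlangdon/gython/token"
)

func main() {
	// Hypothetical demo input; any small Python snippet will do.
	f, err := os.CreateTemp("", "demo-*.py")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	f.WriteString("x = 1 + 2\n")
	f.Close()

	tokenizer, err := parser.TokenizerFromFileName(f.Name())
	if err != nil {
		panic(err)
	}
	for {
		tok := tokenizer.Next()
		if tok.ID == token.ENDMARKER || tok.ID == token.ERRORTOKEN {
			break
		}
		fmt.Printf("<%s> %s\n", tok, tok.Repr())
	}
	// Expected token stream (roughly): NAME EQUAL NUMBER PLUS NUMBER NEWLINE
}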

parser/helpers.go (+229, -0)

@@ -0,0 +1,229 @@
package parser

import "github.com/brettlangdon/gython/token"

// IsLetter reports whether r is an ASCII letter.
func IsLetter(r rune) bool {
	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}

// IsDigit reports whether r is an ASCII digit.
func IsDigit(r rune) bool {
	return r >= '0' && r <= '9'
}

// IsXDigit reports whether r is a hexadecimal digit.
func IsXDigit(r rune) bool {
	return IsDigit(r) || (r >= 'A' && r <= 'F') || (r >= 'a' && r <= 'f')
}

func IsAlphaNumeric(r rune) bool {
	return IsLetter(r) || IsDigit(r)
}

// IsIdentifierStart reports whether r may begin an identifier; any
// non-ASCII rune is allowed, mirroring CPython's tokenizer.
func IsIdentifierStart(r rune) bool {
	return IsLetter(r) || r == '_' || r >= 128
}

func IsIdentifierChar(r rune) bool {
	return IsIdentifierStart(r) || IsDigit(r)
}

func IsQuote(r rune) bool {
	return r == '"' || r == '\''
}

// GetTwoCharTokenID maps a two-character operator to its token ID,
// returning token.OP when the pair is not a recognized operator.
func GetTwoCharTokenID(curChar rune, nextChar rune) token.TokenID {
	switch curChar {
	case '=':
		if nextChar == '=' {
			return token.EQEQUAL
		}
	case '!':
		if nextChar == '=' {
			return token.NOTEQUAL
		}
	case '<':
		switch nextChar {
		case '>':
			return token.NOTEQUAL
		case '=':
			return token.LESSEQUAL
		case '<':
			return token.LEFTSHIFT
		}
	case '>':
		switch nextChar {
		case '=':
			return token.GREATEREQUAL
		case '>':
			return token.RIGHTSHIFT
		}
	case '+':
		if nextChar == '=' {
			return token.PLUSEQUAL
		}
	case '-':
		switch nextChar {
		case '=':
			return token.MINEQUAL
		case '>':
			return token.RARROW
		}
	case '*':
		switch nextChar {
		case '*':
			return token.DOUBLESTAR
		case '=':
			return token.STAREQUAL
		}
	case '/':
		switch nextChar {
		case '/':
			return token.DOUBLESLASH
		case '=':
			return token.SLASHEQUAL
		}
	case '|':
		if nextChar == '=' {
			return token.VBAREQUAL
		}
	case '%':
		if nextChar == '=' {
			return token.PERCENTEQUAL
		}
	case '&':
		if nextChar == '=' {
			return token.AMPEREQUAL
		}
	case '^':
		if nextChar == '=' {
			return token.CIRCUMFLEXEQUAL
		}
	}
	return token.OP
}

// GetThreeCharTokenID maps a three-character operator to its token ID,
// returning token.OP when the triple is not a recognized operator.
func GetThreeCharTokenID(curChar rune, nextChar rune, thirdChar rune) token.TokenID {
	switch {
	case curChar == '<' && nextChar == '<' && thirdChar == '=':
		return token.LEFTSHIFTEQUAL
	case curChar == '>' && nextChar == '>' && thirdChar == '=':
		return token.RIGHTSHIFTEQUAL
	case curChar == '*' && nextChar == '*' && thirdChar == '=':
		return token.DOUBLESTAREQUAL
	case curChar == '/' && nextChar == '/' && thirdChar == '=':
		return token.DOUBLESLASHEQUAL
	case curChar == '.' && nextChar == '.' && thirdChar == '.':
		return token.ELLIPSIS
	}
	return token.OP
}

// GetOneCharTokenID maps a single character to its token ID, returning
// token.OP when the character is not a recognized operator.
func GetOneCharTokenID(curChar rune) token.TokenID {
	switch curChar {
	case '(':
		return token.LPAR
	case ')':
		return token.RPAR
	case '[':
		return token.LSQB
	case ']':
		return token.RSQB
	case ':':
		return token.COLON
	case ',':
		return token.COMMA
	case ';':
		return token.SEMI
	case '+':
		return token.PLUS
	case '-':
		return token.MINUS
	case '*':
		return token.STAR
	case '/':
		return token.SLASH
	case '|':
		return token.VBAR
	case '&':
		return token.AMPER
	case '<':
		return token.LESS
	case '>':
		return token.GREATER
	case '=':
		return token.EQUAL
	case '.':
		return token.DOT
	case '%':
		return token.PERCENT
	case '{':
		return token.LBRACE
	case '}':
		return token.RBRACE
	case '^':
		return token.CIRCUMFLEX
	case '~':
		return token.TILDE
	case '@':
		return token.AT
	}
	return token.OP
}
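
A hypothetical table check (not part of the commit) makes the fallback contract explicit: unknown character combinations come back as the generic token.OP:

package parser_test

import (
	"testing"

	"github.com/brettlangdon/gython/parser"
	"github.com/brettlangdon/gython/token"
)

func TestOperatorLookup(t *testing.T) {
	if got := parser.GetTwoCharTokenID('*', '*'); got != token.DOUBLESTAR {
		t.Errorf("'**' = %v, want DOUBLESTAR", got)
	}
	if got := parser.GetThreeCharTokenID('.', '.', '.'); got != token.ELLIPSIS {
		t.Errorf("'...' = %v, want ELLIPSIS", got)
	}
	// Unrecognized combinations fall back to the generic OP id.
	if got := parser.GetTwoCharTokenID('+', '+'); got != token.OP {
		t.Errorf("'++' = %v, want OP", got)
	}
}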

parser/tokenizer.go (+383, -0)

@@ -0,0 +1,383 @@
package parser

import (
	"bufio"
	"os"
	"unicode/utf8"

	"github.com/brettlangdon/gython/token"
)

// EOF is returned by readNext once the underlying reader is exhausted.
var EOF rune = 0

type TokenizerState struct {
	buffer       *bufio.Reader
	curColumn    int
	curIndent    int
	curLevel     int
	curLine      int
	curLiteral   string
	fp           *os.File
	lastEOF      bool // the last readNext hit end of input
	nestingLevel int
	offset       int
	pushback     []rune // runes returned by unread, served LIFO
	tabsize      int
}
func newTokenizerState() *TokenizerState {
	return &TokenizerState{
		curColumn:    1,
		curIndent:    0,
		curLevel:     0,
		curLine:      1,
		curLiteral:   "",
		nestingLevel: 0,
		offset:       0,
		tabsize:      8,
	}
}

func TokenizerFromFileName(filename string) (*TokenizerState, error) {
	state := newTokenizerState()
	fp, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	state.fp = fp
	state.buffer = bufio.NewReader(state.fp)
	return state, nil
}
// readNext returns the next input rune, preferring runes pushed back
// by unread, and appends it to the current token literal.
func (tokenizer *TokenizerState) readNext() rune {
	var next rune
	if n := len(tokenizer.pushback); n > 0 {
		next = tokenizer.pushback[n-1]
		tokenizer.pushback = tokenizer.pushback[:n-1]
	} else {
		r, _, err := tokenizer.buffer.ReadRune()
		if err != nil {
			tokenizer.lastEOF = true
			return EOF
		}
		next = r
	}
	tokenizer.lastEOF = false
	tokenizer.offset += 1
	tokenizer.curLiteral += string(next)
	return next
}

// unread pushes the most recently read rune back onto the input. Unlike
// bufio.Reader.UnreadRune it may be called several times in a row, which
// the number parser relies on. A read that hit EOF consumed nothing, so
// there is nothing to undo.
func (tokenizer *TokenizerState) unread() {
	if tokenizer.lastEOF || len(tokenizer.curLiteral) == 0 {
		tokenizer.lastEOF = false
		return
	}
	last, size := utf8.DecodeLastRuneInString(tokenizer.curLiteral)
	tokenizer.pushback = append(tokenizer.pushback, last)
	tokenizer.curLiteral = tokenizer.curLiteral[:len(tokenizer.curLiteral)-size]
	tokenizer.offset -= 1
}
func (tokenizer *TokenizerState) finalizeToken(tok *token.Token, tokId token.TokenID) *token.Token {
	tok.ID = tokId
	tok.End = tokenizer.offset
	tok.Literal = tokenizer.curLiteral
	return tok
}

func (tokenizer *TokenizerState) newToken() *token.Token {
	tokenizer.curLiteral = ""
	return &token.Token{
		ID:      token.ERRORTOKEN,
		Start:   tokenizer.offset,
		End:     tokenizer.offset - 1,
		Literal: tokenizer.curLiteral,
	}
}
func (tokenizer *TokenizerState) parseQuoted(curTok *token.Token, nextChar rune) *token.Token {
	quote := nextChar
	quoteSize := 1
	endQuoteSize := 0
	// Distinguish an empty string ("" or '') from a triple-quoted string
	nextChar = tokenizer.readNext()
	if nextChar == quote {
		nextChar = tokenizer.readNext()
		if nextChar == quote {
			quoteSize = 3
		} else {
			endQuoteSize = 1
		}
	}
	if nextChar != quote {
		tokenizer.unread()
	}
	// Consume until the matching closing quote(s)
	for {
		if endQuoteSize == quoteSize {
			break
		}
		nextChar = tokenizer.readNext()
		if nextChar == EOF {
			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
		}
		// A bare newline only terminates single-quoted strings
		if quoteSize == 1 && nextChar == '\n' {
			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
		}
		if nextChar == quote {
			endQuoteSize += 1
		} else {
			endQuoteSize = 0
			if nextChar == '\\' {
				// Skip the escaped character
				nextChar = tokenizer.readNext()
			}
		}
	}
	return tokenizer.finalizeToken(curTok, token.STRING)
}
func (tokenizer *TokenizerState) parseNumber(curTok *token.Token, nextChar rune, fraction bool) *token.Token {
	if fraction {
		// Called from Next with the first digit after a leading '.'
		// already consumed; jump straight to the fraction digits.
		goto fraction
	}
	if nextChar == '0' {
		nextChar = tokenizer.readNext()
		if nextChar == '.' {
			tokenizer.unread()
			goto fraction
		}
		if nextChar == 'j' || nextChar == 'J' {
			tokenizer.unread()
			goto imaginary
		}
		if nextChar == 'x' || nextChar == 'X' {
			// Hex
			nextChar = tokenizer.readNext()
			if !IsXDigit(nextChar) {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
			for IsXDigit(nextChar) {
				nextChar = tokenizer.readNext()
			}
			tokenizer.unread()
			goto end
		} else if nextChar == 'o' || nextChar == 'O' {
			// Octal
			nextChar = tokenizer.readNext()
			if nextChar < '0' || nextChar >= '8' {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
			for nextChar >= '0' && nextChar < '8' {
				nextChar = tokenizer.readNext()
			}
			tokenizer.unread()
			goto end
		} else if nextChar == 'b' || nextChar == 'B' {
			// Binary
			nextChar = tokenizer.readNext()
			if nextChar != '0' && nextChar != '1' {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
			for nextChar == '0' || nextChar == '1' {
				nextChar = tokenizer.readNext()
			}
			tokenizer.unread()
			goto end
		} else {
			// A run of zeros, optionally followed by a fraction, exponent
			// or imaginary suffix. A plain integer with a leading zero
			// (e.g. "0123") is an error, as in CPython.
			nonzero := false
			for nextChar == '0' {
				nextChar = tokenizer.readNext()
			}
			for IsDigit(nextChar) {
				nonzero = true
				nextChar = tokenizer.readNext()
			}
			tokenizer.unread()
			if nextChar == '.' {
				goto fraction
			} else if nextChar == 'e' || nextChar == 'E' {
				goto exponent
			} else if nextChar == 'j' || nextChar == 'J' {
				goto imaginary
			} else if nonzero {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
			goto end
		}
	} else {
		// Decimal
		for IsDigit(nextChar) {
			nextChar = tokenizer.readNext()
		}
		tokenizer.unread()
	}
fraction:
	if fraction || nextChar == '.' {
		if !fraction {
			// Re-read the '.' that was just unread, then the first
			// character of the fraction
			nextChar = tokenizer.readNext()
			nextChar = tokenizer.readNext()
		}
		for IsDigit(nextChar) {
			nextChar = tokenizer.readNext()
		}
		tokenizer.unread()
	}
exponent:
	if nextChar == 'e' || nextChar == 'E' {
		nextChar = tokenizer.readNext()
		nextChar = tokenizer.readNext()
		if nextChar == '+' || nextChar == '-' {
			nextChar = tokenizer.readNext()
			if !IsDigit(nextChar) {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
		} else if !IsDigit(nextChar) {
			// Not an exponent after all (e.g. "10else"): give back both
			// characters and finish the number before the 'e'
			tokenizer.unread()
			tokenizer.unread()
			return tokenizer.finalizeToken(curTok, token.NUMBER)
		}
		for IsDigit(nextChar) {
			nextChar = tokenizer.readNext()
		}
		tokenizer.unread()
	}
imaginary:
	if nextChar == 'j' || nextChar == 'J' {
		// Consume the suffix, then put back the character peeked past it
		nextChar = tokenizer.readNext()
		nextChar = tokenizer.readNext()
		tokenizer.unread()
	}
end:
	return tokenizer.finalizeToken(curTok, token.NUMBER)
}
func (tokenizer *TokenizerState) Next() *token.Token {
	curTok := tokenizer.newToken()
	col := 0
	nextChar := EOF
	// Measure the indentation level; this basic port does not yet emit
	// INDENT/DEDENT tokens from it
	for {
		nextChar = tokenizer.readNext()
		if nextChar == ' ' {
			col += 1
		} else if nextChar == '\t' {
			col = (col/tokenizer.tabsize + 1) * tokenizer.tabsize
		} else {
			break
		}
	}
	tokenizer.unread()
again:
	// Skip spaces
	for {
		nextChar = tokenizer.readNext()
		if !(nextChar == ' ' || nextChar == '\t') {
			break
		}
	}
	curTok.Start = tokenizer.offset - 1
	tokenizer.curLiteral = ""
	if nextChar != EOF {
		tokenizer.curLiteral = string(nextChar)
	}
	// Skip comments
	if nextChar == '#' {
		for {
			nextChar = tokenizer.readNext()
			if nextChar == EOF || nextChar == '\n' {
				break
			}
		}
	}
	// Check for EOF
	if nextChar == EOF {
		return tokenizer.finalizeToken(curTok, token.ENDMARKER)
	}
	// Identifier, possibly led by a string prefix (b/r/u)
	if IsIdentifierStart(nextChar) {
		saw_b, saw_r, saw_u := false, false, false
		for {
			if !(saw_b || saw_u) && (nextChar == 'b' || nextChar == 'B') {
				saw_b = true
			} else if !(saw_b || saw_u || saw_r) && (nextChar == 'u' || nextChar == 'U') {
				saw_u = true
			} else if !(saw_r || saw_u) && (nextChar == 'r' || nextChar == 'R') {
				saw_r = true
			} else {
				break
			}
			nextChar = tokenizer.readNext()
			if IsQuote(nextChar) {
				goto letter_quote
			}
		}
		for IsIdentifierChar(nextChar) {
			nextChar = tokenizer.readNext()
		}
		tokenizer.unread()
		return tokenizer.finalizeToken(curTok, token.NAME)
	}
	// Newline
	if nextChar == '\n' {
		return tokenizer.finalizeToken(curTok, token.NEWLINE)
	}
	// Dot, ellipsis, or a number starting with a dot
	if nextChar == '.' {
		nextChar = tokenizer.readNext()
		if IsDigit(nextChar) {
			return tokenizer.parseNumber(curTok, nextChar, true)
		} else if nextChar == '.' {
			nextChar = tokenizer.readNext()
			if nextChar == '.' {
				return tokenizer.finalizeToken(curTok, token.ELLIPSIS)
			}
			// Only two dots: give both extra characters back
			tokenizer.unread()
			tokenizer.unread()
		} else {
			tokenizer.unread()
		}
		return tokenizer.finalizeToken(curTok, token.DOT)
	}
	// Number
	if IsDigit(nextChar) {
		return tokenizer.parseNumber(curTok, nextChar, false)
	}
letter_quote:
	// String
	if IsQuote(nextChar) {
		return tokenizer.parseQuoted(curTok, nextChar)
	}
	// Line continuation: a backslash must be followed by a newline,
	// after which scanning resumes on the next line
	if nextChar == '\\' {
		nextChar = tokenizer.readNext()
		if nextChar != '\n' {
			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
		}
		goto again
	}
	{
		// Check for two and three character operators
		curChar := nextChar
		nextChar = tokenizer.readNext()
		tokId := GetTwoCharTokenID(curChar, nextChar)
		if tokId != token.OP {
			thirdChar := tokenizer.readNext()
			nextTokId := GetThreeCharTokenID(curChar, nextChar, thirdChar)
			if nextTokId != token.OP {
				tokId = nextTokId
			} else {
				tokenizer.unread()
			}
			return tokenizer.finalizeToken(curTok, tokId)
		}
		tokenizer.unread()
		nextChar = curChar
		tokenizer.curLiteral = string(curChar)
	}
	// Track bracket nesting depth
	switch nextChar {
	case '(', '[', '{':
		tokenizer.nestingLevel++
	case ')', ']', '}':
		tokenizer.nestingLevel--
	}
	tokId := GetOneCharTokenID(nextChar)
	return tokenizer.finalizeToken(curTok, tokId)
}
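
A minimal smoke-test sketch (hypothetical, not part of the commit) that exercises Next over a one-line file; the file name pattern and the sample source are illustrative:

package parser_test

import (
	"os"
	"testing"

	"github.com/brettlangdon/gython/parser"
	"github.com/brettlangdon/gython/token"
)

func TestNextBasics(t *testing.T) {
	f, err := os.CreateTemp("", "tok-*.py")
	if err != nil {
		t.Fatal(err)
	}
	defer os.Remove(f.Name())
	f.WriteString("pi = 3.14 # approximately\n")
	f.Close()

	tokenizer, err := parser.TokenizerFromFileName(f.Name())
	if err != nil {
		t.Fatal(err)
	}
	// The trailing comment is swallowed by the NEWLINE token
	want := []token.TokenID{token.NAME, token.EQUAL, token.NUMBER, token.NEWLINE, token.ENDMARKER}
	for _, id := range want {
		if tok := tokenizer.Next(); tok.ID != id {
			t.Fatalf("got %s, want %s", tok, token.TokenNames[id])
		}
	}
}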

token/id.go (+61, -0)

@@ -0,0 +1,61 @@
package token

type TokenID int

const (
	ENDMARKER TokenID = iota
	NAME
	NUMBER
	STRING
	NEWLINE
	INDENT
	DEDENT
	LPAR
	RPAR
	LSQB
	RSQB
	COLON
	COMMA
	SEMI
	PLUS
	MINUS
	STAR
	SLASH
	VBAR
	AMPER
	LESS
	GREATER
	EQUAL
	DOT
	PERCENT
	LBRACE
	RBRACE
	EQEQUAL
	NOTEQUAL
	LESSEQUAL
	GREATEREQUAL
	TILDE
	CIRCUMFLEX
	LEFTSHIFT
	RIGHTSHIFT
	DOUBLESTAR
	PLUSEQUAL
	MINEQUAL
	STAREQUAL
	SLASHEQUAL
	PERCENTEQUAL
	AMPEREQUAL
	VBAREQUAL
	CIRCUMFLEXEQUAL
	LEFTSHIFTEQUAL
	RIGHTSHIFTEQUAL
	DOUBLESTAREQUAL
	DOUBLESLASH
	DOUBLESLASHEQUAL
	AT
	RARROW
	ELLIPSIS
	OP
	ERRORTOKEN
	N_TOKENS
)
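
Since the IDs are consecutive iota values (the ordering mirrors CPython's Include/token.h), they compare cheaply and print through the name table in names.go; hypothetical usage, not part of the commit:

package main

import (
	"fmt"

	"github.com/brettlangdon/gython/token"
)

func main() {
	id := token.ELLIPSIS
	fmt.Println(int(id), token.TokenNames[id]) // 51 ELLIPSIS
}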

token/names.go (+59, -0)

@@ -0,0 +1,59 @@
package token

var TokenNames = map[TokenID]string{
	ENDMARKER: "ENDMARKER",
	NAME: "NAME",
	NUMBER: "NUMBER",
	STRING: "STRING",
	NEWLINE: "NEWLINE",
	INDENT: "INDENT",
	DEDENT: "DEDENT",
	LPAR: "LPAR",
	RPAR: "RPAR",
	LSQB: "LSQB",
	RSQB: "RSQB",
	COLON: "COLON",
	COMMA: "COMMA",
	SEMI: "SEMI",
	PLUS: "PLUS",
	MINUS: "MINUS",
	STAR: "STAR",
	SLASH: "SLASH",
	VBAR: "VBAR",
	AMPER: "AMPER",
	LESS: "LESS",
	GREATER: "GREATER",
	EQUAL: "EQUAL",
	DOT: "DOT",
	PERCENT: "PERCENT",
	LBRACE: "LBRACE",
	RBRACE: "RBRACE",
	EQEQUAL: "EQEQUAL",
	NOTEQUAL: "NOTEQUAL",
	LESSEQUAL: "LESSEQUAL",
	GREATEREQUAL: "GREATEREQUAL",
	TILDE: "TILDE",
	CIRCUMFLEX: "CIRCUMFLEX",
	LEFTSHIFT: "LEFTSHIFT",
	RIGHTSHIFT: "RIGHTSHIFT",
	DOUBLESTAR: "DOUBLESTAR",
	PLUSEQUAL: "PLUSEQUAL",
	MINEQUAL: "MINEQUAL",
	STAREQUAL: "STAREQUAL",
	SLASHEQUAL: "SLASHEQUAL",
	PERCENTEQUAL: "PERCENTEQUAL",
	AMPEREQUAL: "AMPEREQUAL",
	VBAREQUAL: "VBAREQUAL",
	CIRCUMFLEXEQUAL: "CIRCUMFLEXEQUAL",
	LEFTSHIFTEQUAL: "LEFTSHIFTEQUAL",
	RIGHTSHIFTEQUAL: "RIGHTSHIFTEQUAL",
	DOUBLESTAREQUAL: "DOUBLESTAREQUAL",
	DOUBLESLASH: "DOUBLESLASH",
	DOUBLESLASHEQUAL: "DOUBLESLASHEQUAL",
	AT: "AT",
	RARROW: "RARROW",
	ELLIPSIS: "ELLIPSIS",
	OP: "OP",
	ERRORTOKEN: "<ERRORTOKEN>",
	N_TOKENS: "<N_TOKENS>",
}

token/token.go (+24, -0)

@@ -0,0 +1,24 @@
package token

import "fmt"

type Token struct {
	End     int
	ID      TokenID
	Literal string
	Start   int
}

func (tok *Token) String() string {
	return TokenNames[tok.ID]
}

func (tok *Token) Repr() string {
	return fmt.Sprintf(
		"Token{ID: %#v, Literal: %#v, Start: %#v, End: %#v}",
		tok.ID,
		tok.Literal,
		tok.Start,
		tok.End,
	)
}
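
String gives just the token name while Repr includes the literal and offsets; hypothetical usage, not part of the commit:

package main

import (
	"fmt"

	"github.com/brettlangdon/gython/token"
)

func main() {
	tok := &token.Token{ID: token.NUMBER, Literal: "3.14", Start: 4, End: 8}
	fmt.Println(tok)        // NUMBER
	fmt.Println(tok.Repr()) // Token{ID: 2, Literal: "3.14", Start: 4, End: 8}
}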
