
no longer using parser.tokenizer

Branch: master
Brett Langdon, 10 years ago
Parent commit: 281d3a270a
2 changed files with 0 additions and 691 deletions
  1. parser/helpers.go: +0 −229
  2. parser/tokenizer.go: +0 −462

parser/helpers.go (+0 −229)

@@ -1,229 +0,0 @@
package parser

import "github.com/brettlangdon/gython/token"

func IsLetter(r rune) bool {
	return (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z')
}

func IsDigit(r rune) bool {
	return r >= '0' && r <= '9'
}

func IsXDigit(r rune) bool {
	return IsDigit(r) || (r >= 'A' && r <= 'F') || (r >= 'a' && r <= 'f')
}

func IsAlphaNumeric(r rune) bool {
	return IsLetter(r) || IsDigit(r)
}

func IsIdentifierStart(r rune) bool {
	return IsLetter(r) || r == '_' || r >= 128
}

func IsIdentifierChar(r rune) bool {
	return IsIdentifierStart(r) || IsDigit(r)
}

func IsQuote(r rune) bool {
	return r == '"' || r == '\''
}

func GetTwoCharTokenID(curChar rune, nextChar rune) token.TokenID {
	switch curChar {
	case '=':
		switch nextChar {
		case '=':
			return token.EQEQUAL
		}
	case '!':
		switch nextChar {
		case '=':
			return token.NOTEQUAL
		}
	case '<':
		switch nextChar {
		case '>':
			return token.NOTEQUAL
		case '=':
			return token.LESSEQUAL
		case '<':
			return token.LEFTSHIFT
		}
	case '>':
		switch nextChar {
		case '=':
			return token.GREATEREQUAL
		case '>':
			return token.RIGHTSHIFT
		}
	case '+':
		switch nextChar {
		case '=':
			return token.PLUSEQUAL
		}
	case '-':
		switch nextChar {
		case '=':
			return token.MINEQUAL
		case '>':
			return token.RARROW
		}
	case '*':
		switch nextChar {
		case '*':
			return token.DOUBLESTAR
		case '=':
			return token.STAREQUAL
		}
	case '/':
		switch nextChar {
		case '/':
			return token.DOUBLESLASH
		case '=':
			return token.SLASHEQUAL
		}
	case '|':
		switch nextChar {
		case '=':
			return token.VBAREQUAL
		}
	case '%':
		switch nextChar {
		case '=':
			return token.PERCENTEQUAL
		}
	case '&':
		switch nextChar {
		case '=':
			return token.AMPEREQUAL
		}
	case '^':
		switch nextChar {
		case '=':
			return token.CIRCUMFLEXEQUAL
		}
	}
	return token.OP
}

func GetThreeCharTokenID(curChar rune, nextChar rune, thirdChar rune) token.TokenID {
	switch curChar {
	case '<':
		if nextChar == '<' && thirdChar == '=' {
			return token.LEFTSHIFTEQUAL
		}
	case '>':
		if nextChar == '>' && thirdChar == '=' {
			return token.RIGHTSHIFTEQUAL
		}
	case '*':
		if nextChar == '*' && thirdChar == '=' {
			return token.DOUBLESTAREQUAL
		}
	case '/':
		if nextChar == '/' && thirdChar == '=' {
			return token.DOUBLESLASHEQUAL
		}
	case '.':
		if nextChar == '.' && thirdChar == '.' {
			return token.ELLIPSIS
		}
	}
	return token.OP
}

func GetOneCharTokenID(curChar rune) token.TokenID {
	switch curChar {
	case '(':
		return token.LPAR
	case ')':
		return token.RPAR
	case '[':
		return token.LSQB
	case ']':
		return token.RSQB
	case ':':
		return token.COLON
	case ',':
		return token.COMMA
	case ';':
		return token.SEMI
	case '+':
		return token.PLUS
	case '-':
		return token.MINUS
	case '*':
		return token.STAR
	case '/':
		return token.SLASH
	case '|':
		return token.VBAR
	case '&':
		return token.AMPER
	case '<':
		return token.LESS
	case '>':
		return token.GREATER
	case '=':
		return token.EQUAL
	case '.':
		return token.DOT
	case '%':
		return token.PERCENT
	case '{':
		return token.LBRACE
	case '}':
		return token.RBRACE
	case '^':
		return token.CIRCUMFLEX
	case '~':
		return token.TILDE
	case '@':
		return token.AT
	}
	return token.OP
}
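
For reference, a minimal sketch of how these removed classifier helpers could have been exercised as the code stood before this commit. The `main` package and the specific runes are illustrative only; the functions and `token` constants are exactly those shown above.

package main

import (
	"fmt"

	"github.com/brettlangdon/gython/parser"
	"github.com/brettlangdon/gython/token"
)

func main() {
	// The tokenizer tries a two-character match first, then extends it to
	// three characters, falling back to token.OP when nothing matches.
	if id := parser.GetTwoCharTokenID('<', '<'); id != token.OP {
		fmt.Println("two-char token id:", id) // token.LEFTSHIFT
	}
	if id := parser.GetThreeCharTokenID('<', '<', '='); id != token.OP {
		fmt.Println("three-char token id:", id) // token.LEFTSHIFTEQUAL
	}
	fmt.Println("one-char token id:", parser.GetOneCharTokenID('<')) // token.LESS

	// Character-class helpers used by the tokenizer's scanning loops.
	fmt.Println(parser.IsIdentifierStart('_')) // true
	fmt.Println(parser.IsXDigit('f'))          // true
}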

parser/tokenizer.go (+0 −462)

@@ -1,462 +0,0 @@
package parser

import (
	"bufio"
	"os"

	"github.com/brettlangdon/gython/token"
)

var EOF rune = 0
var MAXINDENT int = 100

type TokenizerState struct {
	atBol               bool
	buffer              *bufio.Reader
	curColumn           int
	curLine             int
	curLiteral          string
	fp                  *os.File
	indentationLevel    int
	indentationAltStack []int
	indentationCurrent  int
	indentationPending  int
	indentationStack    []int
	offset              int
	tabsize             int
	tabsizeAlt          int
}
func newTokenizerState() *TokenizerState {
	return &TokenizerState{
		atBol:               true,
		curColumn:           0,
		curLine:             1,
		curLiteral:          "",
		indentationAltStack: make([]int, MAXINDENT),
		indentationCurrent:  0,
		indentationLevel:    0,
		indentationPending:  0,
		indentationStack:    make([]int, MAXINDENT),
		offset:              0,
		tabsize:             8,
		// tabsizeAlt is a divisor when recomputing indentation for ambiguous
		// tab/space mixes, so it must be non-zero (CPython's tokenizer uses
		// an alternate tab size of 1).
		tabsizeAlt: 1,
	}
}
func TokenizerFromFileName(filename string) (*TokenizerState, error) {
	state := newTokenizerState()
	fp, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	state.fp = fp
	state.buffer = bufio.NewReader(state.fp)
	return state, nil
}
func (tokenizer *TokenizerState) readNext() rune {
	next, _, err := tokenizer.buffer.ReadRune()
	if err != nil {
		next = EOF
	}
	tokenizer.offset += 1
	tokenizer.curColumn += 1
	if next != EOF {
		tokenizer.curLiteral += string(next)
	}
	return next
}

// unread pushes the most recent rune back onto the buffer. Note that
// bufio.Reader.UnreadRune can only back up one successfully-read rune: it
// fails after reading EOF or on a second consecutive unread. In those cases
// the rune (if any) stays consumed, so the literal is only trimmed when the
// pushback actually succeeds.
func (tokenizer *TokenizerState) unread() error {
	err := tokenizer.buffer.UnreadRune()
	tokenizer.offset -= 1
	tokenizer.curColumn -= 1
	if err == nil && len(tokenizer.curLiteral) > 0 {
		tokenizer.curLiteral = tokenizer.curLiteral[0 : len(tokenizer.curLiteral)-1]
	}
	return err
}
func (tokenizer *TokenizerState) finalizeToken(tok *token.Token, tokId token.TokenID) *token.Token {
	tok.ID = tokId
	tok.LineEnd = tokenizer.curLine
	tok.ColumnEnd = tokenizer.curColumn
	tok.Literal = tokenizer.curLiteral
	return tok
}

func (tokenizer *TokenizerState) newToken() *token.Token {
	tokenizer.curLiteral = ""
	return &token.Token{
		ID:          token.ERRORTOKEN,
		LineStart:   tokenizer.curLine,
		ColumnStart: tokenizer.curColumn,
		LineEnd:     tokenizer.curLine,
		ColumnEnd:   tokenizer.curColumn,
		Literal:     tokenizer.curLiteral,
	}
}
// parseQuoted consumes a string literal, handling both single-quoted and
// triple-quoted forms as well as backslash escapes.
func (tokenizer *TokenizerState) parseQuoted(curTok *token.Token, nextChar rune) *token.Token {
	quote := nextChar
	quoteSize := 1
	endQuoteSize := 0
	nextChar = tokenizer.readNext()
	if nextChar == quote {
		nextChar = tokenizer.readNext()
		if nextChar == quote {
			quoteSize = 3
		} else {
			endQuoteSize = 1
		}
	}
	if nextChar != quote {
		tokenizer.unread()
	}
	for {
		if endQuoteSize == quoteSize {
			break
		}
		nextChar = tokenizer.readNext()
		if nextChar == EOF {
			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
		}
		if quoteSize == 1 && nextChar == '\n' {
			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
		}
		if nextChar == quote {
			endQuoteSize += 1
		} else {
			endQuoteSize = 0
			if nextChar == '\\' {
				nextChar = tokenizer.readNext()
			}
		}
	}
	return tokenizer.finalizeToken(curTok, token.STRING)
}
// parseNumber consumes a numeric literal: hex, octal, and binary forms with
// a leading zero, plus decimal, float, exponent, and imaginary forms. When
// fraction is true the caller has already consumed a leading '.'.
func (tokenizer *TokenizerState) parseNumber(curTok *token.Token, nextChar rune, fraction bool) *token.Token {
	if fraction {
		goto fraction
	}
	if nextChar == '0' {
		nextChar = tokenizer.readNext()
		if nextChar == '.' {
			tokenizer.unread()
			goto fraction
		}
		if nextChar == 'j' || nextChar == 'J' {
			tokenizer.unread()
			goto imaginary
		}
		if nextChar == 'x' || nextChar == 'X' {
			// Hex
			nextChar = tokenizer.readNext()
			if !IsXDigit(nextChar) {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
			for IsXDigit(nextChar) {
				nextChar = tokenizer.readNext()
			}
			tokenizer.unread()
		} else if nextChar == 'o' || nextChar == 'O' {
			// Octal
			nextChar = tokenizer.readNext()
			if nextChar < '0' || nextChar >= '8' {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
			for nextChar >= '0' && nextChar < '8' {
				nextChar = tokenizer.readNext()
			}
			tokenizer.unread()
		} else if nextChar == 'b' || nextChar == 'B' {
			// Binary
			nextChar = tokenizer.readNext()
			if nextChar != '0' && nextChar != '1' {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
			for nextChar == '0' || nextChar == '1' {
				nextChar = tokenizer.readNext()
			}
			tokenizer.unread()
		} else {
			// A literal with a leading zero is only valid if it turns out to
			// be a float, exponent, or imaginary literal.
			nonzero := false
			for nextChar == '0' {
				nextChar = tokenizer.readNext()
			}
			for IsDigit(nextChar) {
				nonzero = true
				nextChar = tokenizer.readNext()
			}
			tokenizer.unread()
			if nextChar == '.' {
				goto fraction
			} else if nextChar == 'e' || nextChar == 'E' {
				goto exponent
			} else if nextChar == 'j' || nextChar == 'J' {
				goto imaginary
			} else if nonzero {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
			goto end
		}
	} else {
		// Decimal
		for IsDigit(nextChar) {
			nextChar = tokenizer.readNext()
		}
		tokenizer.unread()
		goto fraction
	}
fraction:
	if nextChar == '.' {
		// The current character was pushed back above; re-read it, then read
		// the character after it before scanning digits.
		nextChar = tokenizer.readNext()
		nextChar = tokenizer.readNext()
		for IsDigit(nextChar) {
			nextChar = tokenizer.readNext()
		}
		tokenizer.unread()
	}
exponent:
	if nextChar == 'e' || nextChar == 'E' {
		// The 'e'/'E' was pushed back above; re-read it, then the sign or
		// first exponent digit.
		nextChar = tokenizer.readNext()
		nextChar = tokenizer.readNext()
		if nextChar == '+' || nextChar == '-' {
			nextChar = tokenizer.readNext()
			if !IsDigit(nextChar) {
				return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
			}
		} else if !IsDigit(nextChar) {
			// Not an exponent after all; back up. The second unread cannot
			// restore the 'e'/'E' (single-rune pushback), so it remains
			// consumed and part of the literal.
			tokenizer.unread()
			tokenizer.unread()
			return tokenizer.finalizeToken(curTok, token.NUMBER)
		}
		for IsDigit(nextChar) {
			nextChar = tokenizer.readNext()
		}
		tokenizer.unread()
	}
imaginary:
	if nextChar == 'j' || nextChar == 'J' {
		// The 'j'/'J' was pushed back above; re-read it so it lands in the
		// literal. Reading any further would swallow the character after it.
		nextChar = tokenizer.readNext()
	}
end:
	return tokenizer.finalizeToken(curTok, token.NUMBER)
}
// Next scans and returns the next token, handling indentation (INDENT and
// DEDENT), NEWLINE, comments, and end of input.
func (tokenizer *TokenizerState) Next() *token.Token {
next_line:
	curTok := tokenizer.newToken()
	nextChar := EOF
	blankline := false
	if tokenizer.atBol {
		// Get indentation level
		col := 0
		altcol := 0
		tokenizer.atBol = false
		for {
			nextChar = tokenizer.readNext()
			if nextChar == ' ' {
				col++
				altcol++
			} else if nextChar == '\t' {
				col = (col/tokenizer.tabsize + 1) * tokenizer.tabsize
				altcol = (altcol/tokenizer.tabsizeAlt + 1) * tokenizer.tabsizeAlt
			} else {
				break
			}
		}
		tokenizer.unread()
		if nextChar == '#' || nextChar == '\n' {
			// Lines holding only a newline or a comment shouldn't affect
			// indentation.
			if col == 0 && nextChar == '\n' {
				blankline = false
			} else {
				blankline = true
			}
		}
		if !blankline && tokenizer.indentationLevel == 0 {
			if col == tokenizer.indentationStack[tokenizer.indentationCurrent] {
				if altcol != tokenizer.indentationAltStack[tokenizer.indentationCurrent] {
					return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
				}
			} else if col > tokenizer.indentationStack[tokenizer.indentationCurrent] {
				if tokenizer.indentationCurrent+1 >= MAXINDENT {
					return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
				}
				if altcol <= tokenizer.indentationAltStack[tokenizer.indentationCurrent] {
					return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
				}
				tokenizer.indentationPending++
				tokenizer.indentationCurrent++
				tokenizer.indentationStack[tokenizer.indentationCurrent] = col
				tokenizer.indentationAltStack[tokenizer.indentationCurrent] = altcol
			} else {
				for tokenizer.indentationCurrent > 0 && col < tokenizer.indentationStack[tokenizer.indentationCurrent] {
					tokenizer.indentationPending--
					tokenizer.indentationCurrent--
				}
				if col != tokenizer.indentationStack[tokenizer.indentationCurrent] {
					return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
				}
				if altcol != tokenizer.indentationAltStack[tokenizer.indentationCurrent] {
					return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
				}
			}
		}
	}
	if tokenizer.indentationPending != 0 {
		if tokenizer.indentationPending < 0 {
			tokenizer.indentationPending++
			return tokenizer.finalizeToken(curTok, token.DEDENT)
		} else {
			tokenizer.indentationPending--
			return tokenizer.finalizeToken(curTok, token.INDENT)
		}
	}
again:
	// Skip spaces
	for {
		nextChar = tokenizer.readNext()
		if !(nextChar == ' ' || nextChar == '\t') {
			break
		}
	}
	curTok.LineStart = tokenizer.curLine
	curTok.ColumnStart = tokenizer.curColumn - 1
	tokenizer.curLiteral = string(nextChar)
	// Skip comments
	if nextChar == '#' {
		for {
			nextChar = tokenizer.readNext()
			if nextChar == EOF || nextChar == '\n' {
				break
			}
		}
	}
	// Check for EOF
	if nextChar == EOF {
		tokenizer.curLiteral = ""
		return tokenizer.finalizeToken(curTok, token.ENDMARKER)
	}
	// Identifier, or a string prefix (combinations of b/B, r/R, u/U) leading
	// straight into a quoted literal.
	if IsIdentifierStart(nextChar) {
		saw_b, saw_r, saw_u := false, false, false
		for {
			if !(saw_b || saw_u) && (nextChar == 'b' || nextChar == 'B') {
				saw_b = true
			} else if !(saw_b || saw_u || saw_r) && (nextChar == 'u' || nextChar == 'U') {
				saw_u = true
			} else if !(saw_r || saw_u) && (nextChar == 'r' || nextChar == 'R') {
				saw_r = true
			} else {
				break
			}
			nextChar = tokenizer.readNext()
			if IsQuote(nextChar) {
				goto letter_quote
			}
		}
		for IsIdentifierChar(nextChar) {
			nextChar = tokenizer.readNext()
		}
		tokenizer.unread()
		return tokenizer.finalizeToken(curTok, token.NAME)
	}
	// Newline
	if nextChar == '\n' {
		tokenizer.atBol = true
		if blankline || tokenizer.indentationLevel > 0 {
			goto next_line
		}
		tokenizer.curLine += 1
		tokenizer.curColumn = 0
		return tokenizer.finalizeToken(curTok, token.NEWLINE)
	}
	// Dot, ellipsis, or a number starting with a dot
	if nextChar == '.' {
		nextChar = tokenizer.readNext()
		if IsDigit(nextChar) {
			// Push the digit back so parseNumber re-reads it as the start of
			// the fraction; pass '.' as the current character it expects.
			tokenizer.unread()
			return tokenizer.parseNumber(curTok, '.', true)
		} else if nextChar == '.' {
			nextChar = tokenizer.readNext()
			if nextChar == '.' {
				return tokenizer.finalizeToken(curTok, token.ELLIPSIS)
			} else {
				tokenizer.unread()
			}
			// This second unread cannot restore the second '.' (single-rune
			// pushback), so it stays consumed as part of the literal.
			tokenizer.unread()
		} else {
			tokenizer.unread()
		}
		return tokenizer.finalizeToken(curTok, token.DOT)
	}
	// Number
	if IsDigit(nextChar) {
		return tokenizer.parseNumber(curTok, nextChar, false)
	}
letter_quote:
	// String
	if IsQuote(nextChar) {
		return tokenizer.parseQuoted(curTok, nextChar)
	}
	// Line continuation: a backslash must be followed by a newline, after
	// which scanning resumes on the next line.
	if nextChar == '\\' {
		nextChar = tokenizer.readNext()
		if nextChar != '\n' {
			return tokenizer.finalizeToken(curTok, token.ERRORTOKEN)
		}
		goto again
	}
	{
		// Check for two character tokens, extending to three where possible
		curChar := nextChar
		nextChar = tokenizer.readNext()
		tokId := GetTwoCharTokenID(curChar, nextChar)
		if tokId != token.OP {
			thirdChar := tokenizer.readNext()
			nextTokId := GetThreeCharTokenID(curChar, nextChar, thirdChar)
			if nextTokId != token.OP {
				tokId = nextTokId
			} else {
				tokenizer.unread()
			}
			return tokenizer.finalizeToken(curTok, tokId)
		}
		tokenizer.unread()
		nextChar = curChar
		tokenizer.curLiteral = string(curChar)
	}
	// Track bracket nesting so NEWLINE and indentation are suppressed inside
	// (), [], and {}.
	switch nextChar {
	case '(', '[', '{':
		tokenizer.indentationLevel++
	case ')', ']', '}':
		tokenizer.indentationLevel--
	}
	tokId := GetOneCharTokenID(nextChar)
	return tokenizer.finalizeToken(curTok, tokId)
}
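
And a minimal driver sketch for the removed tokenizer as it stood before this commit. The file path example.py is hypothetical; the loop relies only on the Token fields (LineStart, ColumnStart, ID, Literal) and the ENDMARKER/ERRORTOKEN ids used in the code above.

package main

import (
	"fmt"

	"github.com/brettlangdon/gython/parser"
	"github.com/brettlangdon/gython/token"
)

func main() {
	tokenizer, err := parser.TokenizerFromFileName("example.py")
	if err != nil {
		panic(err)
	}
	// Pull tokens until the tokenizer signals end of input or an error.
	for {
		tok := tokenizer.Next()
		fmt.Printf("%d:%d %v %q\n", tok.LineStart, tok.ColumnStart, tok.ID, tok.Literal)
		if tok.ID == token.ENDMARKER || tok.ID == token.ERRORTOKEN {
			break
		}
	}
}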
