From cda08168736997efbb6b0e2c0ff32e6bc1a78f36 Mon Sep 17 00:00:00 2001
From: brettlangdon
Date: Sun, 13 Sep 2015 22:29:16 -0400
Subject: [PATCH] start port of tokenizer over to scanner

---
 scanner/scanner.go | 209 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 201 insertions(+), 8 deletions(-)

diff --git a/scanner/scanner.go b/scanner/scanner.go
index 7fdb729..e1db326 100644
--- a/scanner/scanner.go
+++ b/scanner/scanner.go
@@ -33,7 +33,7 @@ func NewScanner(r io.Reader) *Scanner {
 
 func (scanner *Scanner) nextPosition() *Position {
 	if len(scanner.positionBuffer) > 0 {
-		last = len(scanner.positionBuffer) - 1
+		last := len(scanner.positionBuffer) - 1
 		scanner.currentPosition = scanner.positionBuffer[last]
 		scanner.positionBuffer = scanner.positionBuffer[0:last]
 		return scanner.currentPosition
@@ -43,27 +43,220 @@ func (scanner *Scanner) nextPosition() *Position {
 	if err != nil {
 		scanner.state = errorcode.E_EOF
 		next = EOF
-	}
-
-	scanner.currentColumn++
-	if next == '\n' {
 		scanner.currentLine++
 		scanner.currentColumn = 0
 	}
 
-	return &Position{
+	pos := &Position{
 		Char:   next,
 		Line:   scanner.currentLine,
 		Column: scanner.currentColumn,
 	}
+
+	scanner.currentColumn++
+	if next == '\n' || next == EOF {
+		scanner.currentLine++
+		scanner.currentColumn = 0
+	}
+
+	return pos
 }
 
 func (scanner *Scanner) unreadPosition(pos *Position) {
 	scanner.positionBuffer = append(scanner.positionBuffer, pos)
 }
 
+// parseQuoted scans the remainder of a string literal whose opening quote
+// character has already been read and appended to positions.
+//
+// quote is the opening quote rune. Both single-quoted ('string') and
+// triple-quoted ('''string''') forms are recognised. Every position that
+// belongs to the literal, including the closing quote(s) and any escaped
+// character, is appended to positions.
+//
+// It returns a token.STRING token on success, or a token.ERRORTOKEN token when
+// EOF is reached before the literal is closed or when a single-quoted literal
+// contains an unescaped newline.
+func (scanner *Scanner) parseQuoted(positions *Positions, quote rune) *token.Token {
+	// Determine quote size, 1 or 3 (e.g. 'string', '''string''')
+	quoteSize := 1
+	endQuoteSize := 0
+	pos := scanner.nextPosition()
+	if pos.Char == quote {
+		pos2 := scanner.nextPosition()
+		if pos2.Char == quote {
+			positions.Append(pos)
+			positions.Append(pos2)
+			quoteSize = 3
+		} else {
+			// Empty string (e.g. ''): pos is the closing quote, so it is
+			// part of the token; pos2 belongs to whatever follows.
+			positions.Append(pos)
+			scanner.unreadPosition(pos2)
+			endQuoteSize = 1
+		}
+	} else {
+		scanner.unreadPosition(pos)
+	}
+
+	for {
+		if endQuoteSize == quoteSize {
+			break
+		}
+		pos = scanner.nextPosition()
+		positions.Append(pos)
+		if pos.Char == EOF {
+			return positions.AsToken(token.ERRORTOKEN)
+		}
+		if quoteSize == 1 && pos.Char == '\n' {
+			return positions.AsToken(token.ERRORTOKEN)
+		}
+		if pos.Char == quote {
+			endQuoteSize++
+		} else {
+			endQuoteSize = 0
+			if pos.Char == '\\' {
+				// Consume the escaped character so an escaped quote or
+				// newline cannot terminate the literal, and keep it in
+				// the token text.
+				pos = scanner.nextPosition()
+				positions.Append(pos)
+				if pos.Char == EOF {
+					return positions.AsToken(token.ERRORTOKEN)
+				}
+			}
+		}
+	}
+	return positions.AsToken(token.STRING)
+}
+
+// NextToken scans and returns the next token from the input.
+//
+// Leading spaces and tabs are skipped, and a '#' comment is skipped up to, but
+// not including, the newline or EOF that ends it. Number literals and
+// backslash line continuations are not ported yet; those characters currently
+// fall through to the one character operator lookup.
 func (scanner *Scanner) NextToken() *token.Token {
-	return &Token{
-		ID: token.ENDMARKER,
+	positions := NewPositions()
+
+	pos := scanner.nextPosition()
+	// skip spaces
+	for {
+		if pos.Char != ' ' && pos.Char != '\t' {
+			break
+		}
+		pos = scanner.nextPosition()
+	}
+
+	// skip comments
+	if pos.Char == '#' {
+		for {
+			pos = scanner.nextPosition()
+			if pos.Char == EOF || pos.Char == '\n' {
+				break
+			}
+		}
 	}
+
+	positions.Append(pos)
+	switch ch := pos.Char; {
+	case ch == EOF:
+		id := token.ENDMARKER
+		if scanner.state != errorcode.E_EOF {
+			id = token.ERRORTOKEN
+		}
+		return positions.AsToken(id)
+	case IsIdentifierStart(ch):
+		// Parse Identifier
+		// Letters b, r and u directly before a quote are a string prefix.
+		sawB, sawR, sawU := false, false, false
+		for {
+			if !(sawB || sawU) && (ch == 'b' || ch == 'B') {
+				sawB = true
+			} else if !(sawB || sawU || sawR) && (ch == 'u' || ch == 'U') {
+				sawU = true
+			} else if !(sawR || sawU) && (ch == 'r' || ch == 'R') {
+				sawR = true
+			} else {
+				break
+			}
+			pos = scanner.nextPosition()
+			if IsQuote(pos.Char) {
+				positions.Append(pos)
+				return scanner.parseQuoted(positions, pos.Char)
+			}
+			if !IsIdentifierChar(pos.Char) {
+				// The name ended right after a prefix-like letter.
+				scanner.unreadPosition(pos)
+				return positions.AsToken(token.NAME)
+			}
+			// Not a string prefix after all: keep the character as part of
+			// the name and test it as the next candidate prefix letter.
+			positions.Append(pos)
+			ch = pos.Char
+		}
+		pos = scanner.nextPosition()
+		for IsIdentifierChar(pos.Char) {
+			positions.Append(pos)
+			pos = scanner.nextPosition()
+		}
+		scanner.unreadPosition(pos)
+		return positions.AsToken(token.NAME)
+	case ch == '\n':
+		return positions.AsToken(token.NEWLINE)
+	case ch == '.':
+		pos2 := scanner.nextPosition()
+		if IsDigit(pos2.Char) {
+			// Parse Number
+		} else if pos2.Char == '.' {
+			pos3 := scanner.nextPosition()
+			if pos3.Char == '.' {
+				positions.Append(pos2)
+				positions.Append(pos3)
+				return positions.AsToken(token.ELLIPSIS)
+			}
+			// Only two dots: push both back so that this token is a single
+			// DOT and the second dot is scanned on the next call.
+			scanner.unreadPosition(pos3)
+		}
+		scanner.unreadPosition(pos2)
+
+		return positions.AsToken(token.DOT)
+	case IsDigit(ch):
+		// Parse Number
+	case IsQuote(ch):
+		// Parse String
+		return scanner.parseQuoted(positions, ch)
+	case ch == '\\':
+		// Parse Continuation
+	default:
+		// Two and Three character operators
+		pos2 := scanner.nextPosition()
+		op2Id := GetTwoCharTokenID(pos.Char, pos2.Char)
+		if op2Id != token.OP {
+			positions.Append(pos2)
+			pos3 := scanner.nextPosition()
+			op3Id := GetThreeCharTokenID(pos.Char, pos2.Char, pos3.Char)
+			if op3Id != token.OP {
+				positions.Append(pos3)
+				return positions.AsToken(op3Id)
+			}
+			scanner.unreadPosition(pos3)
+			return positions.AsToken(op2Id)
+		}
+		scanner.unreadPosition(pos2)
+	}
+	switch pos.Char {
+	case '(', '[', '{':
+		// Increment indentation level
+		// scanner.indentationLevel++
+		break
+	case ')', ']', '}':
+		// Decrement indentation level
+		// scanner.indentationLevel--
+		break
+	}
+
+	opId := GetOneCharTokenID(pos.Char)
+	return positions.AsToken(opId)
 }