Fast bytecode router

Swap out the naive "try all the routes in order" router with a "compile a trie down to bytecode" router. It's a ton faster, while providing all the same semantics. See the documentation at the top of web/fast_router.go for more.
12 years ago · 72c60e267e
--- a/web/atomic.go
+++ b/web/atomic.go
@ -0,0 +1,16 @@
 package web

 import (
 	"sync/atomic"
 	"unsafe"
 )

 // getMachine returns a copy of the routeMachine currently installed on the
 // router, loading the pointer atomically so that it may be called without
 // holding rt.lock.
 //
 // The machine slot is reset to nil whenever the route table changes, and that
 // can happen between a caller's nil check and this load. Rather than
 // dereferencing a nil pointer in that window, getMachine recompiles and
 // retries until it observes an installed machine. Because Compile acquires
 // rt.lock, getMachine must not be called with that lock held.
 func (rt *router) getMachine() routeMachine {
 	ptr := (*unsafe.Pointer)(unsafe.Pointer(&rt.machine))
 	for {
 		if sm := (*routeMachine)(atomic.LoadPointer(ptr)); sm != nil {
 			return *sm
 		}
 		// The machine was invalidated (or never built): rebuild it and
 		// look again.
 		rt.Compile()
 	}
 }
 // setMachine atomically replaces the router's machine pointer with m. Passing
 // nil clears the slot.
 func (rt *router) setMachine(m *routeMachine) {
 	slot := (*unsafe.Pointer)(unsafe.Pointer(&rt.machine))
 	replacement := unsafe.Pointer(m)
 	atomic.StorePointer(slot, replacement)
 }
--- a/web/fast_router.go
+++ b/web/fast_router.go
@ -0,0 +1,265 @@
 package web

 /*
 This file implements a fast router by encoding a list of routes first into a
 pseudo-trie, then encoding that pseudo-trie into a state machine realized as
 a routing bytecode.

 The most interesting part of this router is not its speed (it is quite fast),
 but the guarantees it provides. In a naive router, routes are examined one after
 another until a match is found, and this is the programming model we want to
 support. For any given request ("GET /hello/carl"), there is a list of
 "plausible" routes: routes which match the method ("GET"), and which have a
 prefix that is a prefix of the requested path ("/" and "/hello/", for instance,
 but not "/foobar"). Patterns also have some amount of arbitrary code associated
 with them, which tells us whether or not the route matched. Just like the naive
 router, our goal is to call each plausible pattern, in the order they were
 added, until we find one that matches. The "fast" part here is being smart about
 which non-plausible routes we can skip.

 First, we sort routes using a pairwise comparison function: sorting occurs as
 normal on the prefixes, with the caveat that a route may not be moved past a
 route that might also match the same string. Among other things, this means
 we're forced to use particularly dumb sorting algorithms, but it only has to
 happen once, and there probably aren't even that many routes to begin with. This
 logic appears inline in the router's handle() function.

 We then build a pseudo-trie from the sorted list of routes. It's not quite a
 normal trie because there are certain routes we cannot reorder around other
 routes (since we're providing identical semantics to the naive router), but it's
 close enough and the basic idea is the same.

 Finally, we lower this pseudo-trie from its tree representation to a state
 machine bytecode. The bytecode is pretty simple: it contains up to three bytes,
 a choice of a bunch of flags, and an index. The state machine is pretty simple:
 if the bytes match the next few bytes after the cursor, the instruction matches,
 and the state machine advances to the next instruction. If it does not match, it
 jumps to the instruction at the index. Various flags modify this basic behavior,
 the documentation for which can be found below.

 The thing we're optimizing for here over pretty much everything else is memory
 locality. We make an effort to lay out both the trie child selection logic and
 the matching of long strings consecutively in memory, making both operations
 very cheap. In fact, our matching logic isn't particularly asymptotically good,
 but in practice the benefits of memory locality outweigh just about everything
 else.

 Unfortunately, the code implementing all of this is pretty bad (both inefficient
 and hard to read). Maybe someday I'll come and take a second pass at it.
 */
 // state is a single instruction of the routing bytecode.
 type state struct {
 	// bs holds the literal bytes this instruction compares against the
 	// input; only the first (mode & smLengthMask) of them are significant.
 	bs   [3]byte
 	// mode packs the byte count (low two bits) together with the smMode
 	// flags that select how the instruction behaves.
 	mode smMode
 	// i is the instruction's operand. Depending on mode it is a jump
 	// target, a route index (smRoute) or a cursor position (smSetCursor).
 	// The encoder uses -1 as a placeholder jump target.
 	i    int32
 }
 // stateMachine is a compiled routing program: a flat list of states that
 // refer to one another by slice index.
 type stateMachine []state

 // smMode is the mode byte of a state: a two-bit length in the low bits,
 // combined with the flag bits declared below.
 type smMode uint8

 // Many combinations of smModes don't make sense, but since this is internal to
 // the library I don't feel like documenting them.
 const (
 	// The two low bits of the mode are used as a length of how many bytes
 	// of bs are used. If the length is 0, the node is treated as a
 	// wildcard.
 	smLengthMask smMode = 3
 )

 // Flag bits. They start just above the two length bits, so the values are
 // 4, 8, 16 and 32 in declaration order.
 const (
 	// Jump to the given index on a match. Ordinarily, the state machine
 	// will jump to the state given by the index if the characters do not
 	// match.
 	smJumpOnMatch smMode = 4 << iota
 	// The index is the index of a route to try. If running the route fails,
 	// the state machine advances by one.
 	smRoute
 	// Reset the state machine's cursor into the input string to the state's
 	// index value.
 	smSetCursor
 	// If this bit is set, the machine transitions into a non-accepting
 	// state if it matches.
 	smFail
 )

 // trie is one node of the pseudo-trie built from the route list.
 type trie struct {
 	// prefix is the piece of route prefix this node contributes; it is
 	// filled in by buildTrieSegment after the node is built.
 	prefix   string
 	// children are the node's segments, in the order they were built.
 	children []trieSegment
 }

 // A trie segment is a route matching this point (or -1), combined with a list
 // of trie children that follow that route.
 type trieSegment struct {
 	// route is an index into the route list, or -1 when the segment has no
 	// route of its own.
 	route    int
 	// children are the sub-tries built from the routes that follow route.
 	children []trie
 }

 // buildTrie builds the trie node for routes, all of which are assumed to share
 // their first dp prefix bytes. Routes whose prefix is exactly dp bytes long
 // belong to this node itself; each one opens a new segment, and the routes
 // between two such routes become the children of the earlier segment. dr is
 // the offset of routes[0] in the full route list, and is added to every route
 // index stored in the result.
 func buildTrie(routes []route, dp, dr int) trie {
 	var node trie

 	// head is the local index of the route that opened the current
 	// segment, or -1 if the segment has no route of its own.
 	head := -1

 	// closeSegment finishes the current segment, which owns the routes in
 	// routes[head+1 : end], and appends it to the node with its route
 	// index translated into the full route list.
 	closeSegment := func(end int) {
 		seg := trieSegment{route: head}
 		seg.children = buildTrieSegment(routes[head+1:end], dp, dr+head+1)
 		if seg.route != -1 {
 			seg.route += dr
 		}
 		node.children = append(node.children, seg)
 	}

 	for idx := range routes {
 		if len(routes[idx].prefix) != dp {
 			continue
 		}

 		// A route at index 0 simply becomes the head of the initial
 		// segment; any later one closes the segment before it.
 		if idx != 0 {
 			closeSegment(idx)
 		}
 		head = idx
 	}
 	closeSegment(len(routes))

 	return node
 }

 // commonPrefix returns the longest string that is a prefix of both s1 and s2,
 // compared byte by byte.
 func commonPrefix(s1, s2 string) string {
 	short, long := s1, s2
 	if len(short) > len(long) {
 		short, long = long, short
 	}

 	n := 0
 	for n < len(short) && short[n] == long[n] {
 		n++
 	}
 	return short[:n]
 }

 // buildTrieSegment partitions routes into consecutive groups whose prefixes
 // (ignoring the first dp bytes) share a non-empty common prefix, and builds
 // one trie per group. Each trie is labelled with its group's common prefix.
 // dr is the offset of routes[0] in the full route list. It returns nil when
 // routes is empty.
 func buildTrieSegment(routes []route, dp, dr int) []trie {
 	if len(routes) == 0 {
 		return nil
 	}

 	var out []trie
 	groupStart := 0
 	shared := routes[0].prefix[dp:]

 	// emit builds the trie for the group routes[groupStart:end].
 	emit := func(end int) {
 		node := buildTrie(routes[groupStart:end], dp+len(shared), dr+groupStart)
 		node.prefix = shared
 		out = append(out, node)
 	}

 	for idx := 1; idx < len(routes); idx++ {
 		tail := routes[idx].prefix[dp:]
 		if overlap := commonPrefix(shared, tail); len(overlap) > 0 {
 			// Still in the same group; narrow the shared prefix.
 			shared = overlap
 			continue
 		}

 		// Nothing in common: close the group and start a new one here.
 		emit(idx)
 		groupStart, shared = idx, tail
 	}
 	emit(len(routes))

 	return out
 }

 // This is a bit confusing, since the encode method on a trie deals exclusively
 // with trieSegments (i.e., its children), and vice versa.
 //
 // These methods are also hideously inefficient, both in terms of memory usage
 // and algorithmic complexity. If it ever becomes a problem, maybe we can do
 // something smarter than stupid O(N^2) appends, but to be honest, I bet N is
 // small (it almost always is :P) and we only do it once at boot anyways.

 // encode lowers the trie's segments into a single run of states laid out as
 //
 //	[segment 0 header] ... [segment n-1 header] [terminator] [segment 0 body] ... [segment n-1 body]
 //
 // where "header" and "body" are the two slices returned by trieSegment.encode.
 // dp is handed unchanged to every segment, and off is a base added to every
 // jump index that this call relocates.
 func (t trie) encode(dp, off int) stateMachine {
 	ms := make([]stateMachine, len(t.children))
 	subs := make([]stateMachine, len(t.children))
 	var l, msl, subl int

 	// Encode every segment with a zero offset; the indices are relocated
 	// below once the total header length (msl) is known.
 	for i, ts := range t.children {
 		ms[i], subs[i] = ts.encode(dp, 0)
 		msl += len(ms[i])
 		l += len(ms[i]) + len(subs[i])
 	}

 	// One extra slot for the terminator state.
 	l++

 	m := make(stateMachine, 0, l)
 	for i, mm := range ms {
 		for j := range mm {
 			// Route and set-cursor states carry a route index or a
 			// cursor position in i, not a jump target, so they are
 			// not relocated.
 			if mm[j].mode&(smRoute|smSetCursor) != 0 {
 				continue
 			}

 			// Header jumps were emitted relative to the start of
 			// this segment's body, which ends up after all headers,
 			// the terminator, and the bodies of earlier segments.
 			mm[j].i += int32(off + msl + subl + 1)
 		}
 		m = append(m, mm...)
 		subl += len(subs[i])
 	}

 	// Terminator: a zero-length (wildcard) state that jumps on match to the
 	// placeholder index -1.
 	m = append(m, state{mode: smJumpOnMatch, i: -1})

 	// msl is reused as a running total of header lengths up to and
 	// including segment i.
 	msl = 0
 	for i, sub := range subs {
 		msl += len(ms[i])
 		for j := range sub {
 			if sub[j].mode&(smRoute|smSetCursor) != 0 {
 				continue
 			}
 			if sub[j].i == -1 {
 				// Resolve the placeholder to the state that
 				// follows this segment's header: the next
 				// segment's header, or the terminator.
 				sub[j].i = int32(off + msl)
 			} else {
 				// Shift by the position at which this body is
 				// being appended.
 				sub[j].i += int32(off + len(m))
 			}
 		}
 		m = append(m, sub...)
 	}

 	return m
 }

 // encode lowers a segment into two runs of states. me is the segment's header:
 // a set-cursor state for dp, an optional route state, and then one
 // jump-on-match state per child trie holding the first (up to three) bytes of
 // that child's prefix. sub holds, for each child in turn, the states matching
 // the rest of its prefix followed by the child's own encoding. Jump indices in
 // me are positions within sub, offset by off.
 func (ts trieSegment) encode(dp, off int) (me stateMachine, sub stateMachine) {
 	// o counts the fixed states at the front of me: the set-cursor state,
 	// plus the route state when the segment has a route.
 	o := 1
 	if ts.route != -1 {
 		o++
 	}
 	me = make(stateMachine, len(ts.children)+o)

 	me[0] = state{mode: smSetCursor, i: int32(dp)}
 	if ts.route != -1 {
 		me[1] = state{mode: smRoute, i: int32(ts.route)}
 	}

 	for i, t := range ts.children {
 		p := t.prefix

 		// copy fills at most len(bs) == 3 bytes; bc is how many were
 		// taken and becomes the length bits of the mode.
 		bc := copy(me[i+o].bs[:], p)
 		me[i+o].mode = smMode(bc) | smJumpOnMatch
 		me[i+o].i = int32(off + len(sub))

 		// Emit the remainder of the prefix in chunks of up to three
 		// bytes. These states do not set smJumpOnMatch, and their index
 		// is the placeholder -1.
 		for len(p) > bc {
 			var bs [3]byte
 			p = p[bc:]
 			bc = copy(bs[:], p)
 			sub = append(sub, state{bs: bs, mode: smMode(bc), i: -1})
 		}

 		// The child is encoded with the cursor depth advanced past its
 		// whole prefix.
 		sub = append(sub, t.encode(dp+len(t.prefix), off+len(sub))...)
 	}
 	return
 }

 // compile turns a list of routes into routing bytecode by building the trie
 // and encoding it. Any state whose index is still -1 after encoding is flagged
 // with smFail. It returns nil for an empty route list.
 func compile(routes []route) stateMachine {
 	if len(routes) == 0 {
 		return nil
 	}

 	machine := buildTrie(routes, 0, 0).encode(0, 0)
 	for idx := range machine {
 		if st := &machine[idx]; st.i == -1 {
 			st.mode |= smFail
 		}
 	}
 	return machine
 }
--- a/web/router.go
+++ b/web/router.go
@ -59,6 +59,7 @@ type router struct {
 	lock     sync.Mutex
 	routes   []route
 	notFound Handler
 	machine  *routeMachine
 }

 // A Pattern determines whether or not a given request matches some criteria.
@ -137,22 +138,96 @@ func httpMethod(mname string) method {
 	return mIDK
 }

 func (rt *router) route(c C, w http.ResponseWriter, r *http.Request) {
 // routeMachine bundles compiled routing bytecode with the route list it was
 // compiled from.
 type routeMachine struct {
 	// sm is the bytecode produced by compile.
 	sm     stateMachine
 	// routes is the route list that smRoute states index into.
 	routes []route
 }

 // matchRoute reports whether route should handle the request: its pattern must
 // match r, and its method set must include m. When the pattern matches but the
 // method does not, the route's methods are OR-ed into *ms and false is
 // returned.
 func matchRoute(route route, m method, ms *method, r *http.Request, c *C) bool {
 	if !route.pattern.Match(r, c, false) {
 		return false
 	}

 	if route.method&m == 0 {
 		*ms |= route.method
 		return false
 	}
 	return true
 }

 func (rm routeMachine) route(c *C, w http.ResponseWriter, r *http.Request) (method, bool) {
 	m := httpMethod(r.Method)
 	var methods method
 	for _, route := range rt.routes {
 		if !strings.HasPrefix(r.URL.Path, route.prefix) ||
 			!route.pattern.Match(r, &c, false) {
 	p := r.URL.Path

 	if len(rm.sm) == 0 {
 		return methods, false
 	}

 	var i int
 	for {
 		s := rm.sm[i]
 		if s.mode&smSetCursor != 0 {
 			p = r.URL.Path[s.i:]
 			i++
 			continue
 		}

 		if route.method&m != 0 {
 			route.handler.ServeHTTPC(c, w, r)
 			return
 		} else if route.pattern.Match(r, &c, true) {
 			methods |= route.method
 		length := int(s.mode & smLengthMask)
 		match := length <= len(p)
 		for j := 0; match && j < length; j++ {
 			match = match && p[j] == s.bs[j]
 		}

 		if match {
 			p = p[length:]
 		}

 		if match && s.mode&smRoute != 0 {
 			if matchRoute(rm.routes[s.i], m, &methods, r, c) {
 				rm.routes[s.i].handler.ServeHTTPC(*c, w, r)
 				return 0, true
 			} else {
 				i++
 			}
 		} else if (match && s.mode&smJumpOnMatch != 0) ||
 			(!match && s.mode&smJumpOnMatch == 0) {

 			if s.mode&smFail != 0 {
 				return methods, false
 			}
 			i = int(s.i)
 		} else {
 			i++
 		}
 	}

 	return methods, false
 }

 // Compile compiles the list of routes into bytecode and installs the result
 // on the router. This only needs to be done once after all the routes have
 // been added, and will be called automatically for you (at some performance
 // cost on the first request) if you do not call it explicitly.
 func (rt *router) Compile() {
 	rt.lock.Lock()
 	defer rt.lock.Unlock()

 	rt.setMachine(&routeMachine{
 		sm:     compile(rt.routes),
 		routes: rt.routes,
 	})
 }

 func (rt *router) route(c C, w http.ResponseWriter, r *http.Request) {
 	if rt.machine == nil {
 		rt.Compile()
 	}

 	methods, ok := rt.getMachine().route(&c, w, r)
 	if ok {
 		return
 	}

 	if methods == 0 {
@ -209,9 +284,7 @@ func (rt *router) handle(p Pattern, m method, h Handler) {
 	}
 	copy(newRoutes[i+1:], rt.routes[i:])

 	// We're being a bit sloppy here: we assume that pointer assignment is
 	// atomic with respect to other agents that don't acquire the lock. We
 	// should really just give up and use sync/atomic for this.
 	rt.setMachine(nil)
 	rt.routes = newRoutes
 }