From 72c60e267e7fecd9a1a3b4da941b78c398e3faaf Mon Sep 17 00:00:00 2001
From: Carl Jackson <carl@avtok.com>
Date: Sat, 3 May 2014 12:08:15 -0700
Subject: [PATCH] Fast bytecode router

Swap out the naive "try all the routes in order" router with a "compile
a trie down to bytecode" router. It's a ton faster, while providing all
the same semantics.

See the documentation at the top of web/fast_router.go for more.
---
 web/atomic.go      |  16 +++
 web/fast_router.go | 265 +++++++++++++++++++++++++++++++++++++++++++++
 web/router.go      |  97 +++++++++++++++--
 3 files changed, 366 insertions(+), 12 deletions(-)
 create mode 100644 web/atomic.go
 create mode 100644 web/fast_router.go

diff --git a/web/atomic.go b/web/atomic.go
new file mode 100644
index 0000000..1bbf48e
--- /dev/null
+++ b/web/atomic.go
@@ -0,0 +1,16 @@
+package web
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+func (rt *router) getMachine() routeMachine {
+	ptr := (*unsafe.Pointer)(unsafe.Pointer(&rt.machine))
+	sm := (*routeMachine)(atomic.LoadPointer(ptr))
+	return *sm
+}
+func (rt *router) setMachine(m *routeMachine) {
+	ptr := (*unsafe.Pointer)(unsafe.Pointer(&rt.machine))
+	atomic.StorePointer(ptr, unsafe.Pointer(m))
+}
diff --git a/web/fast_router.go b/web/fast_router.go
new file mode 100644
index 0000000..dea67db
--- /dev/null
+++ b/web/fast_router.go
@@ -0,0 +1,265 @@
+package web
+
+/*
+This file implements a fast router by encoding a list of routes first into a
+pseudo-trie, then encoding that pseudo-trie into a state machine realized as
+a routing bytecode.
+
+The most interesting part of this router is not its speed (it is quite fast),
+but the guarantees it provides. In a naive router, routes are examined one after
+another until a match is found, and this is the programming model we want to
+support. For any given request ("GET /hello/carl"), there is a list of
+"plausible" routes: routes which match the method ("GET"), and which have a
+prefix that is a prefix of the requested path ("/" and "/hello/", for instance,
+but not "/foobar"). Patterns also have some amount of arbitrary code associated
+with them, which tells us whether or not the route matched. Just like the naive
+router, our goal is to call each plausible pattern, in the order they were
+added, until we find one that matches. The "fast" part here is being smart about
+which non-plausible routes we can skip.
+
+First, we sort routes using a pairwise comparison function: sorting occurs as
+normal on the prefixes, with the caveat that a route may not be moved past a
+route that might also match the same string. Among other things, this means
+we're forced to use particularly dumb sorting algorithms, but it only has to
+happen once, and there probably aren't even that many routes to begin with. This
+logic appears inline in the router's handle() function.
+
+We then build a pseudo-trie from the sorted list of routes. It's not quite a
+normal trie because there are certain routes we cannot reorder around other
+routes (since we're providing identical semantics to the naive router), but it's
+close enough and the basic idea is the same.
+
+Finally, we lower this psuedo-trie from its tree representation to a state
+machine bytecode. The bytecode is pretty simple: it contains up to three bytes,
+a choice of a bunch of flags, and an index. The state machine is pretty simple:
+if the bytes match the next few bytes after the cursor, the instruction matches,
+and the state machine advances to the next instruction. If it does not match, it
+jumps to the instruction at the index. Various flags modify this basic behavior,
+the documentation for which can be found below.
+
+The thing we're optimizing for here over pretty much everything else is memory
+locality. We make an effort to lay out both the trie child selection logic and
+the matching of long strings consecutively in memory, making both operations
+very cheap. In fact, our matching logic isn't particularly asymptotically good,
+but in practice the benefits of memory locality outweigh just about everything
+else.
+
+Unfortunately, the code implementing all of this is pretty bad (both inefficient
+and hard to read). Maybe someday I'll come and take a second pass at it.
+*/
+type state struct {
+	bs   [3]byte
+	mode smMode
+	i    int32
+}
+type stateMachine []state
+
+type smMode uint8
+
+// Many combinations of smModes don't make sense, but since this is interal to
+// the library I don't feel like documenting them.
+const (
+	// The two low bits of the mode are used as a length of how many bytes
+	// of bs are used. If the length is 0, the node is treated as a
+	// wildcard.
+	smLengthMask smMode = 3
+)
+
+const (
+	// Jump to the given index on a match. Ordinarily, the state machine
+	// will jump to the state given by the index if the characters do not
+	// match.
+	smJumpOnMatch smMode = 4 << iota
+	// The index is the index of a route to try. If running the route fails,
+	// the state machine advances by one.
+	smRoute
+	// Reset the state machine's cursor into the input string to the state's
+	// index value.
+	smSetCursor
+	// If this bit is set, the machine transitions into a non-accepting
+	// state if it matches.
+	smFail
+)
+
+type trie struct {
+	prefix   string
+	children []trieSegment
+}
+
+// A trie segment is a route matching this point (or -1), combined with a list
+// of trie children that follow that route.
+type trieSegment struct {
+	route    int
+	children []trie
+}
+
+func buildTrie(routes []route, dp, dr int) trie {
+	var t trie
+	ts := trieSegment{-1, nil}
+	for i, r := range routes {
+		if len(r.prefix) != dp {
+			continue
+		}
+
+		if i == 0 {
+			ts.route = 0
+		} else {
+			subroutes := routes[ts.route+1 : i]
+			ts.children = buildTrieSegment(subroutes, dp, dr+ts.route+1)
+			t.children = append(t.children, ts)
+			ts = trieSegment{i, nil}
+		}
+	}
+
+	// This could be a little DRYer...
+	subroutes := routes[ts.route+1:]
+	ts.children = buildTrieSegment(subroutes, dp, dr+ts.route+1)
+	t.children = append(t.children, ts)
+
+	for i := range t.children {
+		if t.children[i].route != -1 {
+			t.children[i].route += dr
+		}
+	}
+
+	return t
+}
+
+func commonPrefix(s1, s2 string) string {
+	if len(s1) > len(s2) {
+		return commonPrefix(s2, s1)
+	}
+	for i := 0; i < len(s1); i++ {
+		if s1[i] != s2[i] {
+			return s1[:i]
+		}
+	}
+	return s1
+}
+
+func buildTrieSegment(routes []route, dp, dr int) []trie {
+	if len(routes) == 0 {
+		return nil
+	}
+	var tries []trie
+
+	start := 0
+	p := routes[0].prefix[dp:]
+	for i := 1; i < len(routes); i++ {
+		ip := routes[i].prefix[dp:]
+		cp := commonPrefix(p, ip)
+		if len(cp) == 0 {
+			t := buildTrie(routes[start:i], dp+len(p), dr+start)
+			t.prefix = p
+			tries = append(tries, t)
+			start = i
+			p = ip
+		} else {
+			p = cp
+		}
+	}
+
+	t := buildTrie(routes[start:], dp+len(p), dr+start)
+	t.prefix = p
+	return append(tries, t)
+}
+
+// This is a bit confusing, since the encode method on a trie deals exclusively
+// with trieSegments (i.e., its children), and vice versa.
+//
+// These methods are also hideously inefficient, both in terms of memory usage
+// and algorithmic complexity. If it ever becomes a problem, maybe we can do
+// something smarter than stupid O(N^2) appends, but to be honest, I bet N is
+// small (it almost always is :P) and we only do it once at boot anyways.
+
+func (t trie) encode(dp, off int) stateMachine {
+	ms := make([]stateMachine, len(t.children))
+	subs := make([]stateMachine, len(t.children))
+	var l, msl, subl int
+
+	for i, ts := range t.children {
+		ms[i], subs[i] = ts.encode(dp, 0)
+		msl += len(ms[i])
+		l += len(ms[i]) + len(subs[i])
+	}
+
+	l++
+
+	m := make(stateMachine, 0, l)
+	for i, mm := range ms {
+		for j := range mm {
+			if mm[j].mode&(smRoute|smSetCursor) != 0 {
+				continue
+			}
+
+			mm[j].i += int32(off + msl + subl + 1)
+		}
+		m = append(m, mm...)
+		subl += len(subs[i])
+	}
+
+	m = append(m, state{mode: smJumpOnMatch, i: -1})
+
+	msl = 0
+	for i, sub := range subs {
+		msl += len(ms[i])
+		for j := range sub {
+			if sub[j].mode&(smRoute|smSetCursor) != 0 {
+				continue
+			}
+			if sub[j].i == -1 {
+				sub[j].i = int32(off + msl)
+			} else {
+				sub[j].i += int32(off + len(m))
+			}
+		}
+		m = append(m, sub...)
+	}
+
+	return m
+}
+
+func (ts trieSegment) encode(dp, off int) (me stateMachine, sub stateMachine) {
+	o := 1
+	if ts.route != -1 {
+		o++
+	}
+	me = make(stateMachine, len(ts.children)+o)
+
+	me[0] = state{mode: smSetCursor, i: int32(dp)}
+	if ts.route != -1 {
+		me[1] = state{mode: smRoute, i: int32(ts.route)}
+	}
+
+	for i, t := range ts.children {
+		p := t.prefix
+
+		bc := copy(me[i+o].bs[:], p)
+		me[i+o].mode = smMode(bc) | smJumpOnMatch
+		me[i+o].i = int32(off + len(sub))
+
+		for len(p) > bc {
+			var bs [3]byte
+			p = p[bc:]
+			bc = copy(bs[:], p)
+			sub = append(sub, state{bs: bs, mode: smMode(bc), i: -1})
+		}
+
+		sub = append(sub, t.encode(dp+len(t.prefix), off+len(sub))...)
+	}
+	return
+}
+
+func compile(routes []route) stateMachine {
+	if len(routes) == 0 {
+		return nil
+	}
+	t := buildTrie(routes, 0, 0)
+	m := t.encode(0, 0)
+	for i := range m {
+		if m[i].i == -1 {
+			m[i].mode = m[i].mode | smFail
+		}
+	}
+	return m
+}
diff --git a/web/router.go b/web/router.go
index 3bf384d..6cf21e8 100644
--- a/web/router.go
+++ b/web/router.go
@@ -59,6 +59,7 @@ type router struct {
 	lock     sync.Mutex
 	routes   []route
 	notFound Handler
+	machine  *routeMachine
 }
 
 // A Pattern determines whether or not a given request matches some criteria.
@@ -137,22 +138,96 @@ func httpMethod(mname string) method {
 	return mIDK
 }
 
-func (rt *router) route(c C, w http.ResponseWriter, r *http.Request) {
+type routeMachine struct {
+	sm     stateMachine
+	routes []route
+}
+
+func matchRoute(route route, m method, ms *method, r *http.Request, c *C) bool {
+	if !route.pattern.Match(r, c, false) {
+		return false
+	}
+
+	if route.method&m != 0 {
+		return true
+	} else {
+		*ms |= route.method
+		return false
+	}
+}
+
+func (rm routeMachine) route(c *C, w http.ResponseWriter, r *http.Request) (method, bool) {
 	m := httpMethod(r.Method)
 	var methods method
-	for _, route := range rt.routes {
-		if !strings.HasPrefix(r.URL.Path, route.prefix) ||
-			!route.pattern.Match(r, &c, false) {
+	p := r.URL.Path
+
+	if len(rm.sm) == 0 {
+		return methods, false
+	}
 
+	var i int
+	for {
+		s := rm.sm[i]
+		if s.mode&smSetCursor != 0 {
+			p = r.URL.Path[s.i:]
+			i++
 			continue
 		}
 
-		if route.method&m != 0 {
-			route.handler.ServeHTTPC(c, w, r)
-			return
-		} else if route.pattern.Match(r, &c, true) {
-			methods |= route.method
+		length := int(s.mode & smLengthMask)
+		match := length <= len(p)
+		for j := 0; match && j < length; j++ {
+			match = match && p[j] == s.bs[j]
 		}
+
+		if match {
+			p = p[length:]
+		}
+
+		if match && s.mode&smRoute != 0 {
+			if matchRoute(rm.routes[s.i], m, &methods, r, c) {
+				rm.routes[s.i].handler.ServeHTTPC(*c, w, r)
+				return 0, true
+			} else {
+				i++
+			}
+		} else if (match && s.mode&smJumpOnMatch != 0) ||
+			(!match && s.mode&smJumpOnMatch == 0) {
+
+			if s.mode&smFail != 0 {
+				return methods, false
+			}
+			i = int(s.i)
+		} else {
+			i++
+		}
+	}
+
+	return methods, false
+}
+
+// Compile the list of routes into bytecode. This only needs to be done once
+// after all the routes have been added, and will be called automatically for
+// you (at some performance cost on the first request) if you do not call it
+// explicitly.
+func (rt *router) Compile() {
+	rt.lock.Lock()
+	defer rt.lock.Unlock()
+	sm := routeMachine{
+		sm:     compile(rt.routes),
+		routes: rt.routes,
+	}
+	rt.setMachine(&sm)
+}
+
+func (rt *router) route(c C, w http.ResponseWriter, r *http.Request) {
+	if rt.machine == nil {
+		rt.Compile()
+	}
+
+	methods, ok := rt.getMachine().route(&c, w, r)
+	if ok {
+		return
 	}
 
 	if methods == 0 {
@@ -209,9 +284,7 @@ func (rt *router) handle(p Pattern, m method, h Handler) {
 	}
 	copy(newRoutes[i+1:], rt.routes[i:])
 
-	// We're being a bit sloppy here: we assume that pointer assignment is
-	// atomic with respect to other agents that don't acquire the lock. We
-	// should really just give up and use sync/atomic for this.
+	rt.setMachine(nil)
 	rt.routes = newRoutes
 }