From 72c60e267e7fecd9a1a3b4da941b78c398e3faaf Mon Sep 17 00:00:00 2001 From: Carl Jackson Date: Sat, 3 May 2014 12:08:15 -0700 Subject: [PATCH] Fast bytecode router Swap out the naive "try all the routes in order" router with a "compile a trie down to bytecode" router. It's a ton faster, while providing all the same semantics. See the documentation at the top of web/fast_router.go for more. --- web/atomic.go | 16 +++ web/fast_router.go | 265 +++++++++++++++++++++++++++++++++++++++++++++ web/router.go | 97 +++++++++++++++-- 3 files changed, 366 insertions(+), 12 deletions(-) create mode 100644 web/atomic.go create mode 100644 web/fast_router.go diff --git a/web/atomic.go b/web/atomic.go new file mode 100644 index 0000000..1bbf48e --- /dev/null +++ b/web/atomic.go @@ -0,0 +1,16 @@ +package web + +import ( + "sync/atomic" + "unsafe" +) + +func (rt *router) getMachine() routeMachine { + ptr := (*unsafe.Pointer)(unsafe.Pointer(&rt.machine)) + sm := (*routeMachine)(atomic.LoadPointer(ptr)) + return *sm +} +func (rt *router) setMachine(m *routeMachine) { + ptr := (*unsafe.Pointer)(unsafe.Pointer(&rt.machine)) + atomic.StorePointer(ptr, unsafe.Pointer(m)) +} diff --git a/web/fast_router.go b/web/fast_router.go new file mode 100644 index 0000000..dea67db --- /dev/null +++ b/web/fast_router.go @@ -0,0 +1,265 @@ +package web + +/* +This file implements a fast router by encoding a list of routes first into a +pseudo-trie, then encoding that pseudo-trie into a state machine realized as +a routing bytecode. + +The most interesting part of this router is not its speed (it is quite fast), +but the guarantees it provides. In a naive router, routes are examined one after +another until a match is found, and this is the programming model we want to +support. 
For any given request ("GET /hello/carl"), there is a list of +"plausible" routes: routes which match the method ("GET"), and which have a +prefix that is a prefix of the requested path ("/" and "/hello/", for instance, +but not "/foobar"). Patterns also have some amount of arbitrary code associated +with them, which tells us whether or not the route matched. Just like the naive +router, our goal is to call each plausible pattern, in the order they were +added, until we find one that matches. The "fast" part here is being smart about +which non-plausible routes we can skip. + +First, we sort routes using a pairwise comparison function: sorting occurs as +normal on the prefixes, with the caveat that a route may not be moved past a +route that might also match the same string. Among other things, this means +we're forced to use particularly dumb sorting algorithms, but it only has to +happen once, and there probably aren't even that many routes to begin with. This +logic appears inline in the router's handle() function. + +We then build a pseudo-trie from the sorted list of routes. It's not quite a +normal trie because there are certain routes we cannot reorder around other +routes (since we're providing identical semantics to the naive router), but it's +close enough and the basic idea is the same. + +Finally, we lower this pseudo-trie from its tree representation to a state +machine bytecode. The bytecode is pretty simple: it contains up to three bytes, +a choice of a bunch of flags, and an index. The state machine is pretty simple: +if the bytes match the next few bytes after the cursor, the instruction matches, +and the state machine advances to the next instruction. If it does not match, it +jumps to the instruction at the index. Various flags modify this basic behavior, +the documentation for which can be found below. + +The thing we're optimizing for here over pretty much everything else is memory +locality. 
We make an effort to lay out both the trie child selection logic and +the matching of long strings consecutively in memory, making both operations +very cheap. In fact, our matching logic isn't particularly asymptotically good, +but in practice the benefits of memory locality outweigh just about everything +else. + +Unfortunately, the code implementing all of this is pretty bad (both inefficient +and hard to read). Maybe someday I'll come and take a second pass at it. +*/ +type state struct { + bs [3]byte + mode smMode + i int32 +} +type stateMachine []state + +type smMode uint8 + +// Many combinations of smModes don't make sense, but since this is internal to +// the library I don't feel like documenting them. +const ( + // The two low bits of the mode are used as a length of how many bytes + // of bs are used. If the length is 0, the node is treated as a + // wildcard. + smLengthMask smMode = 3 +) + +const ( + // Jump to the given index on a match. Ordinarily, the state machine + // will jump to the state given by the index if the characters do not + // match. + smJumpOnMatch smMode = 4 << iota + // The index is the index of a route to try. If running the route fails, + // the state machine advances by one. + smRoute + // Reset the state machine's cursor into the input string to the state's + // index value. + smSetCursor + // If this bit is set, the machine transitions into a non-accepting + // state if it matches. + smFail +) + +type trie struct { + prefix string + children []trieSegment +} + +// A trie segment is a route matching this point (or -1), combined with a list +// of trie children that follow that route. 
+type trieSegment struct { + route int + children []trie +} + +func buildTrie(routes []route, dp, dr int) trie { + var t trie + ts := trieSegment{-1, nil} + for i, r := range routes { + if len(r.prefix) != dp { + continue + } + + if i == 0 { + ts.route = 0 + } else { + subroutes := routes[ts.route+1 : i] + ts.children = buildTrieSegment(subroutes, dp, dr+ts.route+1) + t.children = append(t.children, ts) + ts = trieSegment{i, nil} + } + } + + // This could be a little DRYer... + subroutes := routes[ts.route+1:] + ts.children = buildTrieSegment(subroutes, dp, dr+ts.route+1) + t.children = append(t.children, ts) + + for i := range t.children { + if t.children[i].route != -1 { + t.children[i].route += dr + } + } + + return t +} + +func commonPrefix(s1, s2 string) string { + if len(s1) > len(s2) { + return commonPrefix(s2, s1) + } + for i := 0; i < len(s1); i++ { + if s1[i] != s2[i] { + return s1[:i] + } + } + return s1 +} + +func buildTrieSegment(routes []route, dp, dr int) []trie { + if len(routes) == 0 { + return nil + } + var tries []trie + + start := 0 + p := routes[0].prefix[dp:] + for i := 1; i < len(routes); i++ { + ip := routes[i].prefix[dp:] + cp := commonPrefix(p, ip) + if len(cp) == 0 { + t := buildTrie(routes[start:i], dp+len(p), dr+start) + t.prefix = p + tries = append(tries, t) + start = i + p = ip + } else { + p = cp + } + } + + t := buildTrie(routes[start:], dp+len(p), dr+start) + t.prefix = p + return append(tries, t) +} + +// This is a bit confusing, since the encode method on a trie deals exclusively +// with trieSegments (i.e., its children), and vice versa. +// +// These methods are also hideously inefficient, both in terms of memory usage +// and algorithmic complexity. If it ever becomes a problem, maybe we can do +// something smarter than stupid O(N^2) appends, but to be honest, I bet N is +// small (it almost always is :P) and we only do it once at boot anyways. 
+ +func (t trie) encode(dp, off int) stateMachine { + ms := make([]stateMachine, len(t.children)) + subs := make([]stateMachine, len(t.children)) + var l, msl, subl int + + for i, ts := range t.children { + ms[i], subs[i] = ts.encode(dp, 0) + msl += len(ms[i]) + l += len(ms[i]) + len(subs[i]) + } + + l++ + + m := make(stateMachine, 0, l) + for i, mm := range ms { + for j := range mm { + if mm[j].mode&(smRoute|smSetCursor) != 0 { + continue + } + + mm[j].i += int32(off + msl + subl + 1) + } + m = append(m, mm...) + subl += len(subs[i]) + } + + m = append(m, state{mode: smJumpOnMatch, i: -1}) + + msl = 0 + for i, sub := range subs { + msl += len(ms[i]) + for j := range sub { + if sub[j].mode&(smRoute|smSetCursor) != 0 { + continue + } + if sub[j].i == -1 { + sub[j].i = int32(off + msl) + } else { + sub[j].i += int32(off + len(m)) + } + } + m = append(m, sub...) + } + + return m +} + +func (ts trieSegment) encode(dp, off int) (me stateMachine, sub stateMachine) { + o := 1 + if ts.route != -1 { + o++ + } + me = make(stateMachine, len(ts.children)+o) + + me[0] = state{mode: smSetCursor, i: int32(dp)} + if ts.route != -1 { + me[1] = state{mode: smRoute, i: int32(ts.route)} + } + + for i, t := range ts.children { + p := t.prefix + + bc := copy(me[i+o].bs[:], p) + me[i+o].mode = smMode(bc) | smJumpOnMatch + me[i+o].i = int32(off + len(sub)) + + for len(p) > bc { + var bs [3]byte + p = p[bc:] + bc = copy(bs[:], p) + sub = append(sub, state{bs: bs, mode: smMode(bc), i: -1}) + } + + sub = append(sub, t.encode(dp+len(t.prefix), off+len(sub))...) 
+ } + return +} + +func compile(routes []route) stateMachine { + if len(routes) == 0 { + return nil + } + t := buildTrie(routes, 0, 0) + m := t.encode(0, 0) + for i := range m { + if m[i].i == -1 { + m[i].mode = m[i].mode | smFail + } + } + return m +} diff --git a/web/router.go b/web/router.go index 3bf384d..6cf21e8 100644 --- a/web/router.go +++ b/web/router.go @@ -59,6 +59,7 @@ type router struct { lock sync.Mutex routes []route notFound Handler + machine *routeMachine } // A Pattern determines whether or not a given request matches some criteria. @@ -137,22 +138,96 @@ func httpMethod(mname string) method { return mIDK } -func (rt *router) route(c C, w http.ResponseWriter, r *http.Request) { +type routeMachine struct { + sm stateMachine + routes []route +} + +func matchRoute(route route, m method, ms *method, r *http.Request, c *C) bool { + if !route.pattern.Match(r, c, false) { + return false + } + + if route.method&m != 0 { + return true + } else { + *ms |= route.method + return false + } +} + +func (rm routeMachine) route(c *C, w http.ResponseWriter, r *http.Request) (method, bool) { m := httpMethod(r.Method) var methods method - for _, route := range rt.routes { - if !strings.HasPrefix(r.URL.Path, route.prefix) || - !route.pattern.Match(r, &c, false) { + p := r.URL.Path + + if len(rm.sm) == 0 { + return methods, false + } + var i int + for { + s := rm.sm[i] + if s.mode&smSetCursor != 0 { + p = r.URL.Path[s.i:] + i++ continue } - if route.method&m != 0 { - route.handler.ServeHTTPC(c, w, r) - return - } else if route.pattern.Match(r, &c, true) { - methods |= route.method + length := int(s.mode & smLengthMask) + match := length <= len(p) + for j := 0; match && j < length; j++ { + match = match && p[j] == s.bs[j] } + + if match { + p = p[length:] + } + + if match && s.mode&smRoute != 0 { + if matchRoute(rm.routes[s.i], m, &methods, r, c) { + rm.routes[s.i].handler.ServeHTTPC(*c, w, r) + return 0, true + } else { + i++ + } + } else if (match && 
s.mode&smJumpOnMatch != 0) || + (!match && s.mode&smJumpOnMatch == 0) { + + if s.mode&smFail != 0 { + return methods, false + } + i = int(s.i) + } else { + i++ + } + } + + return methods, false +} + +// Compile the list of routes into bytecode. This only needs to be done once +// after all the routes have been added, and will be called automatically for +// you (at some performance cost on the first request) if you do not call it +// explicitly. +func (rt *router) Compile() { + rt.lock.Lock() + defer rt.lock.Unlock() + sm := routeMachine{ + sm: compile(rt.routes), + routes: rt.routes, + } + rt.setMachine(&sm) +} + +func (rt *router) route(c C, w http.ResponseWriter, r *http.Request) { + if rt.machine == nil { + rt.Compile() + } + + methods, ok := rt.getMachine().route(&c, w, r) + if ok { + return } if methods == 0 { @@ -209,9 +284,7 @@ func (rt *router) handle(p Pattern, m method, h Handler) { } copy(newRoutes[i+1:], rt.routes[i:]) - // We're being a bit sloppy here: we assume that pointer assignment is - // atomic with respect to other agents that don't acquire the lock. We - // should really just give up and use sync/atomic for this. + rt.setMachine(nil) rt.routes = newRoutes }