capnproto

FORK: Cap'n Proto serialization/RPC system - core tools and C++ library
git clone https://git.neptards.moe/neptards/capnproto.git

table.c++ (32147B)


// Copyright (c) 2018 Kenton Varda and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "table.h"
#include "debug.h"
#include <stdlib.h>

#if KJ_DEBUG_TABLE_IMPL
#undef KJ_DASSERT
#define KJ_DASSERT KJ_ASSERT
#endif

namespace kj {
namespace _ {

static inline uint lg(uint value) {
  // Compute floor(log2(value)).
  //
  // Undefined for value = 0.
#if _MSC_VER && !defined(__clang__)
  unsigned long i;
  auto found = _BitScanReverse(&i, value);
  KJ_DASSERT(found);  // !found means value = 0
  return i;
#else
  return sizeof(uint) * 8 - 1 - __builtin_clz(value);
#endif
}
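
// For instance, lg(1) == 0, lg(2) == lg(3) == 1, and lg(16) == 4. A portable sketch of the same
// computation for platforms with neither intrinsic (illustrative only; lgPortable is not an
// upstream name, and the intrinsic versions above are what actually ship):
//
//   static inline uint lgPortable(uint value) {
//     uint result = 0;
//     while (value >>= 1) ++result;  // count how far the highest set bit is from bit 0
//     return result;
//   }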

void throwDuplicateTableRow() {
  KJ_FAIL_REQUIRE("inserted row already exists in table");
}

void logHashTableInconsistency() {
  KJ_LOG(ERROR,
      "HashIndex detected hash table inconsistency. This can happen if you create a kj::Table "
      "with a hash index and you modify the rows in the table post-indexing in a way that would "
      "change their hash. This is a serious bug which will lead to undefined behavior."
      "\nstack: ", kj::getStackTrace());
}

// List of primes where each element is roughly double the previous.  Obtained
// from:
//   http://planetmath.org/goodhashtableprimes
// Primes < 53 were added to ensure that small tables don't allocate excessive memory.
static const size_t PRIMES[] = {
           1,  // 2^ 0 = 1
           3,  // 2^ 1 = 2
           5,  // 2^ 2 = 4
          11,  // 2^ 3 = 8
          23,  // 2^ 4 = 16
          53,  // 2^ 5 = 32
          97,  // 2^ 6 = 64
         193,  // 2^ 7 = 128
         389,  // 2^ 8 = 256
         769,  // 2^ 9 = 512
        1543,  // 2^10 = 1024
        3079,  // 2^11 = 2048
        6151,  // 2^12 = 4096
       12289,  // 2^13 = 8192
       24593,  // 2^14 = 16384
       49157,  // 2^15 = 32768
       98317,  // 2^16 = 65536
      196613,  // 2^17 = 131072
      393241,  // 2^18 = 262144
      786433,  // 2^19 = 524288
     1572869,  // 2^20 = 1048576
     3145739,  // 2^21 = 2097152
     6291469,  // 2^22 = 4194304
    12582917,  // 2^23 = 8388608
    25165843,  // 2^24 = 16777216
    50331653,  // 2^25 = 33554432
   100663319,  // 2^26 = 67108864
   201326611,  // 2^27 = 134217728
   402653189,  // 2^28 = 268435456
   805306457,  // 2^29 = 536870912
  1610612741,  // 2^30 = 1073741824
};

uint chooseBucket(uint hash, uint count) {
  // Integer modulus is really, really slow. It turns out that the compiler can generate much
  // faster code if the denominator is a constant. Since we have a fixed set of possible
  // denominators, a big old switch() statement is a win.

  // TODO(perf): Consider using power-of-two bucket sizes. We can safely do so as long as we demand
  //   high-quality hash functions -- kj::hashCode() needs good diffusion even for integers, can't
  //   just be a cast. Also be sure to implement Robin Hood hashing to avoid extremely bad negative
  //   lookup time when elements have sequential hashes (otherwise, it could be necessary to scan
  //   the entire list to determine that an element isn't present).

  switch (count) {
#define HANDLE(i) case i##u: return hash % i##u
    HANDLE(         1);
    HANDLE(         3);
    HANDLE(         5);
    HANDLE(        11);
    HANDLE(        23);
    HANDLE(        53);
    HANDLE(        97);
    HANDLE(       193);
    HANDLE(       389);
    HANDLE(       769);
    HANDLE(      1543);
    HANDLE(      3079);
    HANDLE(      6151);
    HANDLE(     12289);
    HANDLE(     24593);
    HANDLE(     49157);
    HANDLE(     98317);
    HANDLE(    196613);
    HANDLE(    393241);
    HANDLE(    786433);
    HANDLE(   1572869);
    HANDLE(   3145739);
    HANDLE(   6291469);
    HANDLE(  12582917);
    HANDLE(  25165843);
    HANDLE(  50331653);
    HANDLE( 100663319);
    HANDLE( 201326611);
    HANDLE( 402653189);
    HANDLE( 805306457);
    HANDLE(1610612741);
#undef HANDLE
    default: return hash % count;
  }
}
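
// For example, chooseBucket(h, 53) lands on the HANDLE(53) case, so the compiler can strength-
// reduce `h % 53` into a multiply-by-reciprocal and a shift rather than emitting a hardware
// division. Only bucket counts outside PRIMES -- which shouldn't occur in practice, since every
// table size is chosen from that list -- fall through to the slow generic modulus.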

size_t chooseHashTableSize(uint size) {
  if (size == 0) return 0;

  // lg() computes floor(log2(size)), so add 1 to round the target up to the next power of two,
  // then look up the best prime bucket size for that target size.
  return PRIMES[lg(size) + 1];
}
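
// A worked example of the arithmetic: for size == 40, lg(40) == 5, so we use PRIMES[6] == 97
// buckets; for size == 100, lg(100) == 6 and we get PRIMES[7] == 193. The bucket count thus
// stays between roughly 1.5x and 3x the requested size.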

kj::Array<HashBucket> rehash(kj::ArrayPtr<const HashBucket> oldBuckets, size_t targetSize) {
  // Rehash the whole table.

  KJ_REQUIRE(targetSize < (1 << 30), "hash table has reached maximum size");

  size_t size = chooseHashTableSize(targetSize);

  if (size < oldBuckets.size()) {
    size = oldBuckets.size();
  }

  auto newBuckets = kj::heapArray<HashBucket>(size);
  memset(newBuckets.begin(), 0, sizeof(HashBucket) * size);

  uint entryCount = 0;
  uint collisionCount = 0;

  for (auto& oldBucket: oldBuckets) {
    if (oldBucket.isOccupied()) {
      ++entryCount;
      for (uint i = oldBucket.hash % newBuckets.size();; i = probeHash(newBuckets, i)) {
        auto& newBucket = newBuckets[i];
        if (newBucket.isEmpty()) {
          newBucket = oldBucket;
          break;
        }
        ++collisionCount;
      }
    }
  }

  if (collisionCount > 16 + entryCount * 4) {
    static bool warned = false;
    if (!warned) {
      KJ_LOG(WARNING, "detected excessive collisions in hash table; is your hash function OK?",
          entryCount, collisionCount, kj::getStackTrace());
      warned = true;
    }
  }

  return newBuckets;
}
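
// The warning threshold above is deliberately generous: at the load factors chooseHashTableSize()
// produces (the table is at most roughly two-thirds full), a well-distributed hash should average
// only a couple of collisions per entry during a rehash, well below the four-per-entry-plus-16
// that triggers the log message.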

// =======================================================================================
// BTree

#if _WIN32
#define aligned_free _aligned_free
#else
#define aligned_free ::free
#endif

BTreeImpl::BTreeImpl()
    : tree(const_cast<NodeUnion*>(&EMPTY_NODE)),
      treeCapacity(1),
      height(0),
      freelistHead(1),
      freelistSize(0),
      beginLeaf(0),
      endLeaf(0) {}

BTreeImpl::~BTreeImpl() noexcept(false) {
  if (tree != &EMPTY_NODE) {
    aligned_free(tree);
  }
}

BTreeImpl::BTreeImpl(BTreeImpl&& other)
  : BTreeImpl() {
  *this = kj::mv(other);
}

BTreeImpl& BTreeImpl::operator=(BTreeImpl&& other) {
  KJ_DASSERT(&other != this);

  if (tree != &EMPTY_NODE) {
    aligned_free(tree);
  }
  tree = other.tree;
  treeCapacity = other.treeCapacity;
  height = other.height;
  freelistHead = other.freelistHead;
  freelistSize = other.freelistSize;
  beginLeaf = other.beginLeaf;
  endLeaf = other.endLeaf;

  other.tree = const_cast<NodeUnion*>(&EMPTY_NODE);
  other.treeCapacity = 1;
  other.height = 0;
  other.freelistHead = 1;
  other.freelistSize = 0;
  other.beginLeaf = 0;
  other.endLeaf = 0;

  return *this;
}

const BTreeImpl::NodeUnion BTreeImpl::EMPTY_NODE = {{{0, {0}}}};

void BTreeImpl::verify(size_t size, FunctionParam<bool(uint, uint)> f) {
  KJ_ASSERT(verifyNode(size, f, 0, height, nullptr) == size);
}
size_t BTreeImpl::verifyNode(size_t size, FunctionParam<bool(uint, uint)>& f,
                             uint pos, uint height, MaybeUint maxRow) {
  if (height > 0) {
    auto& parent = tree[pos].parent;

    auto n = parent.keyCount();
    size_t total = 0;
    for (auto i: kj::zeroTo(n)) {
      KJ_ASSERT(*parent.keys[i] < size, n, i);
      total += verifyNode(size, f, parent.children[i], height - 1, parent.keys[i]);
      if (i + 1 < n) {
        KJ_ASSERT(f(*parent.keys[i], *parent.keys[i + 1]),
            n, i, parent.keys[i], parent.keys[i + 1]);
      }
    }
    total += verifyNode(size, f, parent.children[n], height - 1, maxRow);
    if (maxRow != nullptr) {
      KJ_ASSERT(f(*parent.keys[n-1], *maxRow), n, parent.keys[n-1], maxRow);
    }
    return total;
  } else {
    auto& leaf = tree[pos].leaf;
    auto n = leaf.size();
    for (auto i: kj::zeroTo(n)) {
      KJ_ASSERT(*leaf.rows[i] < size, n, i);
      if (i + 1 < n) {
        KJ_ASSERT(f(*leaf.rows[i], *leaf.rows[i + 1]),
            n, i, leaf.rows[i], leaf.rows[i + 1]);
      } else if (maxRow != nullptr) {
        KJ_ASSERT(leaf.rows[n-1] == maxRow);
      }
    }
    return n;
  }
}

kj::String BTreeImpl::MaybeUint::toString() const {
  return i == 0 ? kj::str("(null)") : kj::str(i - 1);
}

void BTreeImpl::logInconsistency() const {
  KJ_LOG(ERROR,
      "BTreeIndex detected tree state inconsistency. This can happen if you create a kj::Table "
      "with a b-tree index and you modify the rows in the table post-indexing in a way that would "
      "change their ordering. This is a serious bug which will lead to undefined behavior."
      "\nstack: ", kj::getStackTrace());
}

void BTreeImpl::reserve(size_t size) {
  KJ_REQUIRE(size < (1u << 31), "b-tree has reached maximum size");

  // Calculate the worst-case number of leaves to cover the size, given that a leaf is always at
  // least half-full. (Note that it's correct for this calculation to round down, not up: The
  // remainder will necessarily be distributed among the non-full leaves, rather than creating a
  // new leaf, because if it went into a new leaf, that leaf would be less than half-full.)
  uint leaves = size / (Leaf::NROWS / 2);

  // Calculate the worst-case number of parents to cover the leaves, given that a parent is always
  // at least half-full. Since the parents form a tree with branching factor B, the size of the
  // tree is N/B + N/B^2 + N/B^3 + N/B^4 + ... = N / (B - 1). Math.
  constexpr uint branchingFactor = Parent::NCHILDREN / 2;
  uint parents = leaves / (branchingFactor - 1);

  // Height is log-base-branching-factor of leaves, plus 1 for the root node.
  uint height = lg(leaves | 1) / lg(branchingFactor) + 1;

  size_t newSize = leaves +
      parents + 1 +  // + 1 for the root
      height + 2;    // minimum freelist size needed by insert()

  if (treeCapacity < newSize) {
    growTree(newSize);
  }
}
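
// A worked example of the sizing math, assuming the 64-byte node layout in table.h where
// Leaf::NROWS == 14 and Parent::NCHILDREN == 8 (so branchingFactor == 4): for size == 1000,
// leaves = 1000 / 7 = 142, parents = 142 / 3 = 47, height = lg(142 | 1) / lg(4) + 1 = 4, so we
// reserve 142 + 47 + 1 + 4 + 2 = 196 nodes, roughly 12 KiB.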

void BTreeImpl::clear() {
  if (tree != &EMPTY_NODE) {
    azero(tree, treeCapacity);
    height = 0;
    freelistHead = 1;
    freelistSize = treeCapacity;
    beginLeaf = 0;
    endLeaf = 0;
  }
}

void BTreeImpl::growTree(uint minCapacity) {
  uint newCapacity = kj::max(kj::max(minCapacity, treeCapacity * 2), 4);
  freelistSize += newCapacity - treeCapacity;

  // Allocate some aligned memory! In theory this should be as simple as calling the C11 standard
  // aligned_alloc() function. Unfortunately, many platforms don't implement it. Luckily, there
  // are usually alternatives.

#if _WIN32
  // Windows lacks aligned_alloc() but has its own _aligned_malloc() (which requires freeing using
  // _aligned_free()).
  // WATCH OUT: The argument order for _aligned_malloc() is opposite of aligned_alloc()!
  NodeUnion* newTree = reinterpret_cast<NodeUnion*>(
      _aligned_malloc(newCapacity * sizeof(BTreeImpl::NodeUnion), sizeof(BTreeImpl::NodeUnion)));
  KJ_ASSERT(newTree != nullptr, "memory allocation failed", newCapacity);
#else
  // macOS, OpenBSD, and Android lack aligned_alloc(), but have posix_memalign(). Fine.
  void* allocPtr;
  int error = posix_memalign(&allocPtr,
      sizeof(BTreeImpl::NodeUnion), newCapacity * sizeof(BTreeImpl::NodeUnion));
  if (error != 0) {
    KJ_FAIL_SYSCALL("posix_memalign", error);
  }
  NodeUnion* newTree = reinterpret_cast<NodeUnion*>(allocPtr);
#endif

  // Note: C11 introduces aligned_alloc() as a standard, but it's still missing on many platforms,
  //   so we don't use it. But if you wanted to use it, you'd do this:
//  NodeUnion* newTree = reinterpret_cast<NodeUnion*>(
//      aligned_alloc(sizeof(BTreeImpl::NodeUnion), newCapacity * sizeof(BTreeImpl::NodeUnion)));
//  KJ_ASSERT(newTree != nullptr, "memory allocation failed", newCapacity);

  acopy(newTree, tree, treeCapacity);
  azero(newTree + treeCapacity, newCapacity - treeCapacity);
  if (tree != &EMPTY_NODE) aligned_free(tree);
  tree = newTree;
  treeCapacity = newCapacity;
}

BTreeImpl::Iterator BTreeImpl::search(const SearchKey& searchKey) const {
  // Find the "first" row number (in sorted order) for which searchKey.isAfter(rowNumber) returns
  // false.

  uint pos = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    auto& parent = tree[pos].parent;
    pos = parent.children[searchKey.search(parent)];
  }

  auto& leaf = tree[pos].leaf;
  return { tree, &leaf, searchKey.search(leaf) };
}

template <typename T>
struct BTreeImpl::AllocResult {
  uint index;
  T& node;
};

template <typename T>
inline BTreeImpl::AllocResult<T> BTreeImpl::alloc() {
  // Allocate a new item from the freelist. Guaranteed to be zero'd except for the first member.
  uint i = freelistHead;
  NodeUnion* ptr = &tree[i];
  freelistHead = i + 1 + ptr->freelist.nextOffset;
  --freelistSize;
  return { i, *ptr };
}

inline void BTreeImpl::free(uint pos) {
  // Add the given node to the freelist.

  // HACK: This is typically called on a node immediately after copying its contents away, but the
  //   pointer used to copy it away may be a different pointer pointing to a different union member
  //   which the compiler may not recognize as aliasing with this object. Just to be extra-safe,
  //   insert a compiler barrier.
  compilerBarrier();

  auto& node = tree[pos];
  node.freelist.nextOffset = freelistHead - pos - 1;
  azero(node.freelist.zero, kj::size(node.freelist.zero));
  freelistHead = pos;
  ++freelistSize;
}
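
// A sketch of the freelist encoding, for illustration: nextOffset stores the distance to the
// following free node minus one, so a freshly zeroed array is implicitly one big sequential
// freelist (every nextOffset of 0 means "the very next node"). E.g. if freelistHead == 5 and
// node 5 is still zeroed, alloc() returns node 5 and advances the head to 5 + 1 + 0 == 6; a
// later free(3) sets node 3's nextOffset to 6 - 3 - 1 == 2 and makes 3 the new head, so the
// following alloc() returns node 3 and restores the head to 3 + 1 + 2 == 6.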

BTreeImpl::Iterator BTreeImpl::insert(const SearchKey& searchKey) {
  // Like search() but ensures that there is room in the leaf node to insert a new row.

  // If we split the root node it will generate two new nodes. If we split any other node in the
  // path it will generate one new node. `height` doesn't count leaf nodes, but we can equivalently
  // think of it as not counting the root node, so in the worst case we may allocate height + 2
  // new nodes.
  //
  // (Also note that if the tree is currently empty, then `tree` points to a dummy root node in
  // read-only memory. We definitely need to allocate a real tree node array in this case, and
  // we'll start out allocating space for four nodes, which will be all we need up to 28 rows.)
  if (freelistSize < height + 2) {
    if (height > 0 && !tree[0].parent.isFull() && freelistSize >= height) {
      // Slight optimization: The root node is not full, so we're definitely not going to split it.
      // That means that the maximum allocations we might do is equal to `height`, not
      // `height + 2`, and we have that much space, so no need to grow yet.
      //
      // This optimization is particularly important for small trees, e.g. when treeCapacity is 4
      // and the tree so far consists of a root and two children, we definitely don't need to grow
      // the tree yet.
    } else {
      growTree();

      if (freelistHead == 0) {
        // We have no root yet. Allocate one.
        KJ_ASSERT(alloc<Parent>().index == 0);
      }
    }
  }

  uint pos = 0;

  // Track grandparent node and child index within grandparent.
  Parent* parent = nullptr;
  uint indexInParent = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    Parent& node = insertHelper(searchKey, tree[pos].parent, parent, indexInParent, pos);

    parent = &node;
    indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];
  }

  Leaf& leaf = insertHelper(searchKey, tree[pos].leaf, parent, indexInParent, pos);

  // Fun fact: Unlike erase(), there's no need to climb back up the tree modifying keys, because
  // either the newly-inserted node will not be the last in the leaf (and thus parent keys aren't
  // modified), or the leaf is the last leaf in the tree (and thus there's no parent key to
  // modify).

  return { tree, &leaf, searchKey.search(leaf) };
}

template <typename Node>
Node& BTreeImpl::insertHelper(const SearchKey& searchKey,
    Node& node, Parent* parent, uint indexInParent, uint pos) {
  if (node.isFull()) {
    // This node is full. Need to split.
    if (parent == nullptr) {
      // This is the root node. We need to split into two nodes and create a new root.
      auto n1 = alloc<Node>();
      auto n2 = alloc<Node>();

      uint pivot = split(n2.node, n2.index, node, pos);
      move(n1.node, n1.index, node);

      // Rewrite root to have the two children.
      tree[0].parent.initRoot(pivot, n1.index, n2.index);

      // Increased height.
      ++height;

      // Decide which new branch has our search key.
      if (searchKey.isAfter(pivot)) {
        // the right one
        return n2.node;
      } else {
        // the left one
        return n1.node;
      }
    } else {
      // This is a non-root parent node. We need to split it into two and insert the new node
      // into the grandparent.
      auto n = alloc<Node>();
      uint pivot = split(n.node, n.index, node, pos);

      // Insert new child into grandparent.
      parent->insertAfter(indexInParent, pivot, n.index);

      // Decide which new branch has our search key.
      if (searchKey.isAfter(pivot)) {
        // the new one, which is right of the original
        return n.node;
      } else {
        // the original one, which is left of the new one
        return node;
      }
    }
  } else {
    // No split needed.
    return node;
  }
}

void BTreeImpl::erase(uint row, const SearchKey& searchKey) {
  // Erase the given row number from the tree. searchKey.isAfter() returns false for the given
  // row and all rows after it.

  uint pos = 0;

  // Track grandparent node and child index within grandparent.
  Parent* parent = nullptr;
  uint indexInParent = 0;

  MaybeUint* fixup = nullptr;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    Parent& node = eraseHelper(tree[pos].parent, parent, indexInParent, pos, fixup);

    parent = &node;
    indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];

    if (indexInParent < kj::size(node.keys) && node.keys[indexInParent] == row) {
      // Oh look, the row is a key in this node! We'll need to come back and fix this up later.
      // Note that any particular row can only appear as *one* key value anywhere in the tree, so
      // we only need one fixup pointer, which is nice.
      MaybeUint* newFixup = &node.keys[indexInParent];
      if (fixup == newFixup) {
        // The fixup pointer was already set while processing a parent node, and then a merge or
        // rotate caused it to be moved, but the fixup pointer was updated... so it's already set
        // to point at the slot we wanted it to point to, so nothing to see here.
      } else {
        KJ_DASSERT(fixup == nullptr);
        fixup = newFixup;
      }
    }
  }

  Leaf& leaf = eraseHelper(tree[pos].leaf, parent, indexInParent, pos, fixup);

  uint r = searchKey.search(leaf);
  if (leaf.rows[r] == row) {
    leaf.erase(r);

    if (fixup != nullptr) {
      // There's a key in a parent node that needs fixup. This is only possible if the removed
      // node is the last in its leaf.
      KJ_DASSERT(leaf.rows[r] == nullptr);
      KJ_DASSERT(r > 0);  // non-root nodes must be at least half full so this can't be item 0
      KJ_DASSERT(*fixup == row);
      *fixup = leaf.rows[r - 1];
    }
  } else {
    logInconsistency();
  }
}

template <typename Node>
Node& BTreeImpl::eraseHelper(
    Node& node, Parent* parent, uint indexInParent, uint pos, MaybeUint*& fixup) {
  if (parent != nullptr && !node.isMostlyFull()) {
    // This is not the root, but it's only half-full. Rebalance.
    KJ_DASSERT(node.isHalfFull());

    if (indexInParent > 0) {
      // There's a sibling to the left.
      uint sibPos = parent->children[indexInParent - 1];
      Node& sib = tree[sibPos];
      if (sib.isMostlyFull()) {
        // Left sibling is more than half full. Steal one member.
        rotateRight(sib, node, *parent, indexInParent - 1);
        return node;
      } else {
        // Left sibling is half full, too. Merge.
        KJ_ASSERT(sib.isHalfFull());
        merge(sib, sibPos, *parent->keys[indexInParent - 1], node);
        parent->eraseAfter(indexInParent - 1);
        free(pos);
        if (fixup == &parent->keys[indexInParent]) --fixup;

        if (parent->keys[0] == nullptr) {
          // Oh hah, the parent has no keys left. It must be the root. We can eliminate it.
          KJ_DASSERT(parent == &tree->parent);
          compilerBarrier();  // don't reorder any writes to parent below here
          move(tree[0], 0, sib);
          free(sibPos);
          --height;
          return tree[0];
        } else {
          return sib;
        }
      }
    } else if (indexInParent < Parent::NKEYS && parent->keys[indexInParent] != nullptr) {
      // There's a sibling to the right.
      uint sibPos = parent->children[indexInParent + 1];
      Node& sib = tree[sibPos];
      if (sib.isMostlyFull()) {
        // Right sibling is more than half full. Steal one member.
        rotateLeft(node, sib, *parent, indexInParent, fixup);
        return node;
      } else {
        // Right sibling is half full, too. Merge.
        KJ_ASSERT(sib.isHalfFull());
        merge(node, pos, *parent->keys[indexInParent], sib);
        parent->eraseAfter(indexInParent);
        free(sibPos);
        if (fixup == &parent->keys[indexInParent]) fixup = nullptr;

        if (parent->keys[0] == nullptr) {
          // Oh hah, the parent has no keys left. It must be the root. We can eliminate it.
          KJ_DASSERT(parent == &tree->parent);
          compilerBarrier();  // don't reorder any writes to parent below here
          move(tree[0], 0, node);
          free(pos);
          --height;
          return tree[0];
        } else {
          return node;
        }
      }
    } else {
      KJ_FAIL_ASSERT("inconsistent b-tree");
    }
  }

  return node;
}

void BTreeImpl::renumber(uint oldRow, uint newRow, const SearchKey& searchKey) {
  // Renumber the given row from oldRow to newRow. searchKey.isAfter() returns false for oldRow
  // and all rows after it. (It will not be called on newRow.)

  uint pos = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    auto& node = tree[pos].parent;
    uint indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];
    if (indexInParent < kj::size(node.keys) && node.keys[indexInParent] == oldRow) {
      node.keys[indexInParent] = newRow;
    }
    KJ_DASSERT(pos != 0);
  }

  auto& leaf = tree[pos].leaf;
  uint r = searchKey.search(leaf);
  if (leaf.rows[r] == oldRow) {
    leaf.rows[r] = newRow;
  } else {
    logInconsistency();
  }
}

uint BTreeImpl::split(Parent& dst, uint dstPos, Parent& src, uint srcPos) {
  constexpr size_t mid = Parent::NKEYS / 2;
  uint pivot = *src.keys[mid];
  acopy(dst.keys, src.keys + mid + 1, Parent::NKEYS - mid - 1);
  azero(src.keys + mid, Parent::NKEYS - mid);
  acopy(dst.children, src.children + mid + 1, Parent::NCHILDREN - mid - 1);
  azero(src.children + mid + 1, Parent::NCHILDREN - mid - 1);
  return pivot;
}

uint BTreeImpl::split(Leaf& dst, uint dstPos, Leaf& src, uint srcPos) {
  constexpr size_t mid = Leaf::NROWS / 2;
  uint pivot = *src.rows[mid - 1];
  acopy(dst.rows, src.rows + mid, Leaf::NROWS - mid);
  azero(src.rows + mid, Leaf::NROWS - mid);

  if (src.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[src.next].leaf.prev = dstPos;
  }
  dst.next = src.next;
  dst.prev = srcPos;
  src.next = dstPos;

  return pivot;
}
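
// Note the asymmetry with the Parent overload above: when a Parent splits, the middle key moves
// up into the grandparent and disappears from both halves, whereas when a Leaf splits, every row
// is kept and the pivot is merely a copy of the left half's last row -- the classic B+-tree
// arrangement, and the reason leaves maintain prev/next links for in-order iteration.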

void BTreeImpl::merge(Parent& dst, uint dstPos, uint pivot, Parent& src) {
  // merge() is only legal if both nodes are half-empty. Meanwhile, B-tree invariants
  // guarantee that the node can't be more than half-empty, or we would have merged it sooner.
  // (The root can be more than half-empty, but it is never merged with anything.)
  KJ_DASSERT(src.isHalfFull());
  KJ_DASSERT(dst.isHalfFull());

  constexpr size_t mid = Parent::NKEYS/2;
  dst.keys[mid] = pivot;
  acopy(dst.keys + mid + 1, src.keys, mid);
  acopy(dst.children + mid + 1, src.children, mid + 1);
}

void BTreeImpl::merge(Leaf& dst, uint dstPos, uint pivot, Leaf& src) {
  // merge() is only legal if both nodes are half-empty. Meanwhile, B-tree invariants
  // guarantee that the node can't be more than half-empty, or we would have merged it sooner.
  // (The root can be more than half-empty, but it is never merged with anything.)
  KJ_DASSERT(src.isHalfFull());
  KJ_DASSERT(dst.isHalfFull());

  constexpr size_t mid = Leaf::NROWS/2;
  dst.rows[mid] = pivot;
  acopy(dst.rows + mid, src.rows, mid);

  dst.next = src.next;
  if (dst.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[dst.next].leaf.prev = dstPos;
  }
}

void BTreeImpl::move(Parent& dst, uint dstPos, Parent& src) {
  dst = src;
}

void BTreeImpl::move(Leaf& dst, uint dstPos, Leaf& src) {
  dst = src;
  if (src.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[src.next].leaf.prev = dstPos;
  }
  if (src.prev == 0) {
    beginLeaf = dstPos;
  } else {
    tree[src.prev].leaf.next = dstPos;
  }
}

void BTreeImpl::rotateLeft(
    Parent& left, Parent& right, Parent& parent, uint indexInParent, MaybeUint*& fixup) {
  // Steal one item from the right node and move it to the left node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(left.isHalfFull());
  KJ_DASSERT(right.isMostlyFull());

  constexpr size_t mid = Parent::NKEYS/2;
  left.keys[mid] = parent.keys[indexInParent];
  if (fixup == &parent.keys[indexInParent]) fixup = &left.keys[mid];
  parent.keys[indexInParent] = right.keys[0];
  left.children[mid + 1] = right.children[0];
  amove(right.keys, right.keys + 1, Parent::NKEYS - 1);
  right.keys[Parent::NKEYS - 1] = nullptr;
  amove(right.children, right.children + 1, Parent::NCHILDREN - 1);
  right.children[Parent::NCHILDREN - 1] = 0;
}

void BTreeImpl::rotateLeft(
    Leaf& left, Leaf& right, Parent& parent, uint indexInParent, MaybeUint*& fixup) {
  // Steal one item from the right node and move it to the left node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(left.isHalfFull());
  KJ_DASSERT(right.isMostlyFull());

  constexpr size_t mid = Leaf::NROWS/2;
  parent.keys[indexInParent] = left.rows[mid] = right.rows[0];
  if (fixup == &parent.keys[indexInParent]) fixup = nullptr;
  amove(right.rows, right.rows + 1, Leaf::NROWS - 1);
  right.rows[Leaf::NROWS - 1] = nullptr;
}

void BTreeImpl::rotateRight(Parent& left, Parent& right, Parent& parent, uint indexInParent) {
  // Steal one item from the left node and move it to the right node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(right.isHalfFull());
  KJ_DASSERT(left.isMostlyFull());

  constexpr size_t mid = Parent::NKEYS/2;
  amove(right.keys + 1, right.keys, mid);
  amove(right.children + 1, right.children, mid + 1);

  uint back = left.keyCount() - 1;

  right.keys[0] = parent.keys[indexInParent];
  parent.keys[indexInParent] = left.keys[back];
  right.children[0] = left.children[back + 1];
  left.keys[back] = nullptr;
  left.children[back + 1] = 0;
}

void BTreeImpl::rotateRight(Leaf& left, Leaf& right, Parent& parent, uint indexInParent) {
  // Steal one item from the left node and move it to the right node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(right.isHalfFull());
  KJ_DASSERT(left.isMostlyFull());

  constexpr size_t mid = Leaf::NROWS/2;
  amove(right.rows + 1, right.rows, mid);

  uint back = left.size() - 1;

  right.rows[0] = left.rows[back];
  parent.keys[indexInParent] = left.rows[back - 1];
  left.rows[back] = nullptr;
}

void BTreeImpl::Parent::initRoot(uint key, uint leftChild, uint rightChild) {
  // HACK: This is typically called on the root node immediately after copying its contents away,
  //   but the pointer used to copy it away may be a different pointer pointing to a different
  //   union member which the compiler may not recognize as aliasing with this object. Just to
  //   be extra-safe, insert a compiler barrier.
  compilerBarrier();

  keys[0] = key;
  children[0] = leftChild;
  children[1] = rightChild;
  azero(keys + 1, Parent::NKEYS - 1);
  azero(children + 2, Parent::NCHILDREN - 2);
}

void BTreeImpl::Parent::insertAfter(uint i, uint splitKey, uint child) {
  KJ_IREQUIRE(children[Parent::NCHILDREN - 1] == 0);  // check not full

  amove(keys + i + 1, keys + i, Parent::NKEYS - (i + 1));
  keys[i] = splitKey;

  amove(children + i + 2, children + i + 1, Parent::NCHILDREN - (i + 2));
  children[i + 1] = child;
}

void BTreeImpl::Parent::eraseAfter(uint i) {
  amove(keys + i, keys + i + 1, Parent::NKEYS - (i + 1));
  keys[Parent::NKEYS - 1] = nullptr;
  amove(children + i + 1, children + i + 2, Parent::NCHILDREN - (i + 2));
  children[Parent::NCHILDREN - 1] = 0;
}

}  // namespace _

// =======================================================================================
// Insertion order

const InsertionOrderIndex::Link InsertionOrderIndex::EMPTY_LINK = { 0, 0 };

InsertionOrderIndex::InsertionOrderIndex(): capacity(0), links(const_cast<Link*>(&EMPTY_LINK)) {}
InsertionOrderIndex::InsertionOrderIndex(InsertionOrderIndex&& other)
    : capacity(other.capacity), links(other.links) {
  other.capacity = 0;
  other.links = const_cast<Link*>(&EMPTY_LINK);
}
InsertionOrderIndex& InsertionOrderIndex::operator=(InsertionOrderIndex&& other) {
  KJ_DASSERT(&other != this);
  capacity = other.capacity;
  links = other.links;
  other.capacity = 0;
  other.links = const_cast<Link*>(&EMPTY_LINK);
  return *this;
}
InsertionOrderIndex::~InsertionOrderIndex() noexcept(false) {
  if (links != &EMPTY_LINK) delete[] links;
}

void InsertionOrderIndex::reserve(size_t size) {
  KJ_ASSERT(size < (1u << 31), "Table too big for InsertionOrderIndex");

  if (size > capacity) {
    // Need to grow.
    // Note that `size` and `capacity` do not include the special link[0].

    // Round up to the next power of 2.
    size_t allocation = 1u << (_::lg(size) + 1);
    KJ_DASSERT(allocation > size);
    KJ_DASSERT(allocation <= size * 2);

    // Round first allocation up to 8.
    allocation = kj::max(allocation, 8);

    Link* newLinks = new Link[allocation];
#ifdef KJ_DEBUG
    // To catch bugs, fill unused links with 0xff.
    memset(newLinks, 0xff, allocation * sizeof(Link));
#endif
    _::acopy(newLinks, links, capacity + 1);
    if (links != &EMPTY_LINK) delete[] links;
    links = newLinks;
    capacity = allocation - 1;
  }
}

void InsertionOrderIndex::clear() {
  links[0] = Link { 0, 0 };

#ifdef KJ_DEBUG
  // To catch bugs, fill unused links with 0xff.
  memset(links + 1, 0xff, capacity * sizeof(Link));
#endif
}

kj::Maybe<size_t> InsertionOrderIndex::insertImpl(size_t pos) {
  if (pos >= capacity) {
    reserve(pos + 1);
  }

  links[pos + 1].prev = links[0].prev;
  links[pos + 1].next = 0;
  links[links[0].prev].next = pos + 1;
  links[0].prev = pos + 1;

  return nullptr;
}
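
// links[0] acts as the sentinel of a circular doubly-linked list, so insertion order is tracked
// with no special cases: appending row pos splices links[pos + 1] in just before the sentinel in
// O(1). For example, after inserting rows 0 and 1 into an empty index, links[0].next == 1,
// links[1].next == 2, and links[2].next == 0, i.e. iteration visits row 0, then row 1, then
// stops back at the sentinel.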

void InsertionOrderIndex::eraseImpl(size_t pos) {
  Link& link = links[pos + 1];
  links[link.next].prev = link.prev;
  links[link.prev].next = link.next;

#ifdef KJ_DEBUG
  memset(&link, 0xff, sizeof(Link));
#endif
}

void InsertionOrderIndex::moveImpl(size_t oldPos, size_t newPos) {
  Link& link = links[oldPos + 1];
  Link& newLink = links[newPos + 1];

  newLink = link;

  KJ_DASSERT(links[link.next].prev == oldPos + 1);
  KJ_DASSERT(links[link.prev].next == oldPos + 1);
  links[link.next].prev = newPos + 1;
  links[link.prev].next = newPos + 1;

#ifdef KJ_DEBUG
  memset(&link, 0xff, sizeof(Link));
#endif
}

}  // namespace kj