// Copyright (c) 2018 Kenton Varda and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "table.h"
#include "debug.h"
#include <stdlib.h>

#if KJ_DEBUG_TABLE_IMPL
#undef KJ_DASSERT
#define KJ_DASSERT KJ_ASSERT
#endif

namespace kj {
namespace _ {

static inline uint lg(uint value) {
  // Compute floor(log2(value)).
  //
  // Undefined for value = 0.
#if _MSC_VER && !defined(__clang__)
  unsigned long i;
  auto found = _BitScanReverse(&i, value);
  KJ_DASSERT(found);  // !found means value = 0
  return i;
#else
  return sizeof(uint) * 8 - 1 - __builtin_clz(value);
#endif
}

void throwDuplicateTableRow() {
  KJ_FAIL_REQUIRE("inserted row already exists in table");
}

void logHashTableInconsistency() {
  KJ_LOG(ERROR,
      "HashIndex detected hash table inconsistency. This can happen if you create a kj::Table "
      "with a hash index and you modify the rows in the table post-indexing in a way that would "
      "change their hash. This is a serious bug which will lead to undefined behavior."
      "\nstack: ", kj::getStackTrace());
}

// List of primes where each element is roughly double the previous. Obtained from:
//   http://planetmath.org/goodhashtableprimes
// Primes < 53 were added to ensure that small tables don't allocate excessive memory.
static const size_t PRIMES[] = {
           1,  // 2^ 0 = 1
           3,  // 2^ 1 = 2
           5,  // 2^ 2 = 4
          11,  // 2^ 3 = 8
          23,  // 2^ 4 = 16
          53,  // 2^ 5 = 32
          97,  // 2^ 6 = 64
         193,  // 2^ 7 = 128
         389,  // 2^ 8 = 256
         769,  // 2^ 9 = 512
        1543,  // 2^10 = 1024
        3079,  // 2^11 = 2048
        6151,  // 2^12 = 4096
       12289,  // 2^13 = 8192
       24593,  // 2^14 = 16384
       49157,  // 2^15 = 32768
       98317,  // 2^16 = 65536
      196613,  // 2^17 = 131072
      393241,  // 2^18 = 262144
      786433,  // 2^19 = 524288
     1572869,  // 2^20 = 1048576
     3145739,  // 2^21 = 2097152
     6291469,  // 2^22 = 4194304
    12582917,  // 2^23 = 8388608
    25165843,  // 2^24 = 16777216
    50331653,  // 2^25 = 33554432
   100663319,  // 2^26 = 67108864
   201326611,  // 2^27 = 134217728
   402653189,  // 2^28 = 268435456
   805306457,  // 2^29 = 536870912
  1610612741,  // 2^30 = 1073741824
};

uint chooseBucket(uint hash, uint count) {
  // Integer modulus is really, really slow. It turns out that the compiler can generate much
  // faster code if the denominator is a constant. Since we have a fixed set of possible
  // denominators, a big old switch() statement is a win.

  // TODO(perf): Consider using power-of-two bucket sizes. We can safely do so as long as we
  //   demand high-quality hash functions -- kj::hashCode() needs good diffusion even for
  //   integers, can't just be a cast. Also be sure to implement Robin Hood hashing to avoid
  //   extremely bad negative lookup time when elements have sequential hashes (otherwise, it
  //   could be necessary to scan the entire list to determine that an element isn't present).

  switch (count) {
#define HANDLE(i) case i##u: return hash % i##u
    HANDLE(         1);
    HANDLE(         3);
    HANDLE(         5);
    HANDLE(        11);
    HANDLE(        23);
    HANDLE(        53);
    HANDLE(        97);
    HANDLE(       193);
    HANDLE(       389);
    HANDLE(       769);
    HANDLE(      1543);
    HANDLE(      3079);
    HANDLE(      6151);
    HANDLE(     12289);
    HANDLE(     24593);
    HANDLE(     49157);
    HANDLE(     98317);
    HANDLE(    196613);
    HANDLE(    393241);
    HANDLE(    786433);
    HANDLE(   1572869);
    HANDLE(   3145739);
    HANDLE(   6291469);
    HANDLE(  12582917);
    HANDLE(  25165843);
    HANDLE(  50331653);
    HANDLE( 100663319);
    HANDLE( 201326611);
    HANDLE( 402653189);
    HANDLE( 805306457);
    HANDLE(1610612741);
#undef HANDLE
    default: return hash % count;
  }
}
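
// To see why the switch wins: when the denominator is a compile-time constant, the optimizer
// replaces the division underlying `%` with a multiply-and-shift (magic-number division),
// roughly:
//
//   // What `hash % 97u` compiles to, in spirit; M and S are compile-time constants chosen
//   // so that q == hash / 97 for all 32-bit inputs.
//   uint q = (uint)(((uint64_t)hash * M) >> S);
//   return hash - q * 97u;
//
// The generic `hash % count` in the default case must instead execute a hardware divide
// instruction, which typically costs an order of magnitude more cycles.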

size_t chooseHashTableSize(uint size) {
  if (size == 0) return 0;

  // Add 1 to compensate for the floor() in lg(), then look up the best prime bucket size for
  // that target size.
  return PRIMES[lg(size) + 1];
}
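
// Worked example: chooseHashTableSize(100) computes lg(100) == 6 (since 2^6 <= 100 < 2^7)
// and returns PRIMES[6 + 1] == 193. Because PRIMES[i] is roughly 2^i, the +1 guarantees the
// bucket count is at least the requested size (and no more than about twice it).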

kj::Array<HashBucket> rehash(kj::ArrayPtr<const HashBucket> oldBuckets, size_t targetSize) {
  // Rehash the whole table.

  KJ_REQUIRE(targetSize < (1 << 30), "hash table has reached maximum size");

  size_t size = chooseHashTableSize(targetSize);

  if (size < oldBuckets.size()) {
    size = oldBuckets.size();
  }

  auto newBuckets = kj::heapArray<HashBucket>(size);
  memset(newBuckets.begin(), 0, sizeof(HashBucket) * size);

  uint entryCount = 0;
  uint collisionCount = 0;

  for (auto& oldBucket: oldBuckets) {
    if (oldBucket.isOccupied()) {
      ++entryCount;
      for (uint i = oldBucket.hash % newBuckets.size();; i = probeHash(newBuckets, i)) {
        auto& newBucket = newBuckets[i];
        if (newBucket.isEmpty()) {
          newBucket = oldBucket;
          break;
        }
        ++collisionCount;
      }
    }
  }

  if (collisionCount > 16 + entryCount * 4) {
    static bool warned = false;
    if (!warned) {
      KJ_LOG(WARNING, "detected excessive collisions in hash table; is your hash function OK?",
          entryCount, collisionCount, kj::getStackTrace());
      warned = true;
    }
  }

  return newBuckets;
}
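
// The loop above resolves collisions by open addressing: an entry whose home bucket
// (hash % newBuckets.size()) is occupied moves on to the next bucket in the probe sequence
// defined by probeHash() (declared in table.h), bumping collisionCount once per occupied
// bucket skipped. The `16 + entryCount * 4` threshold is a heuristic: with prime table sizes
// and a well-diffused hash, the expected number of probes per entry should be far below four.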

// =======================================================================================
// BTree

#if _WIN32
#define aligned_free _aligned_free
#else
#define aligned_free ::free
#endif

BTreeImpl::BTreeImpl()
    : tree(const_cast<NodeUnion*>(&EMPTY_NODE)),
      treeCapacity(1),
      height(0),
      freelistHead(1),
      freelistSize(0),
      beginLeaf(0),
      endLeaf(0) {}

BTreeImpl::~BTreeImpl() noexcept(false) {
  if (tree != &EMPTY_NODE) {
    aligned_free(tree);
  }
}

BTreeImpl::BTreeImpl(BTreeImpl&& other)
    : BTreeImpl() {
  *this = kj::mv(other);
}

BTreeImpl& BTreeImpl::operator=(BTreeImpl&& other) {
  KJ_DASSERT(&other != this);

  if (tree != &EMPTY_NODE) {
    aligned_free(tree);
  }
  tree = other.tree;
  treeCapacity = other.treeCapacity;
  height = other.height;
  freelistHead = other.freelistHead;
  freelistSize = other.freelistSize;
  beginLeaf = other.beginLeaf;
  endLeaf = other.endLeaf;

  other.tree = const_cast<NodeUnion*>(&EMPTY_NODE);
  other.treeCapacity = 1;
  other.height = 0;
  other.freelistHead = 1;
  other.freelistSize = 0;
  other.beginLeaf = 0;
  other.endLeaf = 0;

  return *this;
}

const BTreeImpl::NodeUnion BTreeImpl::EMPTY_NODE = {{{0, {0}}}};

void BTreeImpl::verify(size_t size, FunctionParam<bool(uint, uint)> f) {
  KJ_ASSERT(verifyNode(size, f, 0, height, nullptr) == size);
}
size_t BTreeImpl::verifyNode(size_t size, FunctionParam<bool(uint, uint)>& f,
                             uint pos, uint height, MaybeUint maxRow) {
  if (height > 0) {
    auto& parent = tree[pos].parent;

    auto n = parent.keyCount();
    size_t total = 0;
    for (auto i: kj::zeroTo(n)) {
      KJ_ASSERT(*parent.keys[i] < size, n, i);
      total += verifyNode(size, f, parent.children[i], height - 1, parent.keys[i]);
      if (i + 1 < n) {
        KJ_ASSERT(f(*parent.keys[i], *parent.keys[i + 1]),
                  n, i, parent.keys[i], parent.keys[i + 1]);
      }
    }
    total += verifyNode(size, f, parent.children[n], height - 1, maxRow);
    if (maxRow != nullptr) {
      KJ_ASSERT(f(*parent.keys[n-1], *maxRow), n, parent.keys[n-1], maxRow);
    }
    return total;
  } else {
    auto& leaf = tree[pos].leaf;
    auto n = leaf.size();
    for (auto i: kj::zeroTo(n)) {
      KJ_ASSERT(*leaf.rows[i] < size, n, i);
      if (i + 1 < n) {
        KJ_ASSERT(f(*leaf.rows[i], *leaf.rows[i + 1]),
                  n, i, leaf.rows[i], leaf.rows[i + 1]);
      } else if (maxRow != nullptr) {
        KJ_ASSERT(leaf.rows[n-1] == maxRow);
      }
    }
    return n;
  }
}

kj::String BTreeImpl::MaybeUint::toString() const {
  return i == 0 ? kj::str("(null)") : kj::str(i - 1);
}

void BTreeImpl::logInconsistency() const {
  KJ_LOG(ERROR,
      "BTreeIndex detected tree state inconsistency. This can happen if you create a kj::Table "
      "with a b-tree index and you modify the rows in the table post-indexing in a way that would "
      "change their ordering. This is a serious bug which will lead to undefined behavior."
      "\nstack: ", kj::getStackTrace());
}

void BTreeImpl::reserve(size_t size) {
  KJ_REQUIRE(size < (1u << 31), "b-tree has reached maximum size");

  // Calculate the worst-case number of leaves to cover the size, given that a leaf is always at
  // least half-full. (Note that it's correct for this calculation to round down, not up: The
  // remainder will necessarily be distributed among the non-full leaves, rather than creating a
  // new leaf, because if it went into a new leaf, that leaf would be less than half-full.)
  uint leaves = size / (Leaf::NROWS / 2);

  // Calculate the worst-case number of parents to cover the leaves, given that a parent is
  // always at least half-full. Since the parents form a tree with branching factor B, the size
  // of the tree is N/B + N/B^2 + N/B^3 + N/B^4 + ... = N / (B - 1). Math.
  constexpr uint branchingFactor = Parent::NCHILDREN / 2;
  uint parents = leaves / (branchingFactor - 1);

  // Height is log-base-branching-factor of leaves, plus 1 for the root node.
  uint height = lg(leaves | 1) / lg(branchingFactor) + 1;

  size_t newSize = leaves +
      parents + 1 +  // + 1 for the root
      height + 2;    // minimum freelist size needed by insert()

  if (treeCapacity < newSize) {
    growTree(newSize);
  }
}
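
// Worked example, assuming the 64-byte node layout (Leaf::NROWS == 14, Parent::NCHILDREN == 8;
// consistent with insert()'s note below that four nodes suffice for 28 rows): reserve(1000)
// computes leaves = 1000 / 7 = 142, branchingFactor = 4, parents = 142 / 3 = 47, and
// height = lg(142) / lg(4) + 1 = 7 / 2 + 1 = 4, giving newSize = 142 + 47 + 1 + 4 + 2 = 196
// nodes.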

void BTreeImpl::clear() {
  if (tree != &EMPTY_NODE) {
    azero(tree, treeCapacity);
    height = 0;
    freelistHead = 1;
    freelistSize = treeCapacity;
    beginLeaf = 0;
    endLeaf = 0;
  }
}

void BTreeImpl::growTree(uint minCapacity) {
  uint newCapacity = kj::max(kj::max(minCapacity, treeCapacity * 2), 4);
  freelistSize += newCapacity - treeCapacity;

  // Allocate some aligned memory! In theory this should be as simple as calling the C11 standard
  // aligned_alloc() function. Unfortunately, many platforms don't implement it. Luckily, there
  // are usually alternatives.

#if _WIN32
  // Windows lacks aligned_alloc() but has its own _aligned_malloc() (which requires freeing
  // using _aligned_free()).
  // WATCH OUT: The argument order for _aligned_malloc() is the opposite of aligned_alloc()!
  NodeUnion* newTree = reinterpret_cast<NodeUnion*>(
      _aligned_malloc(newCapacity * sizeof(BTreeImpl::NodeUnion), sizeof(BTreeImpl::NodeUnion)));
  KJ_ASSERT(newTree != nullptr, "memory allocation failed", newCapacity);
#else
  // macOS, OpenBSD, and Android lack aligned_alloc(), but have posix_memalign(). Fine.
  void* allocPtr;
  int error = posix_memalign(&allocPtr,
      sizeof(BTreeImpl::NodeUnion), newCapacity * sizeof(BTreeImpl::NodeUnion));
  if (error != 0) {
    KJ_FAIL_SYSCALL("posix_memalign", error);
  }
  NodeUnion* newTree = reinterpret_cast<NodeUnion*>(allocPtr);
#endif

  // Note: C11 introduces aligned_alloc() as a standard, but it's still missing on many
  // platforms, so we don't use it. But if you wanted to use it, you'd do this:
  //   NodeUnion* newTree = reinterpret_cast<NodeUnion*>(
  //       aligned_alloc(sizeof(BTreeImpl::NodeUnion), newCapacity * sizeof(BTreeImpl::NodeUnion)));
  //   KJ_ASSERT(newTree != nullptr, "memory allocation failed", newCapacity);

  acopy(newTree, tree, treeCapacity);
  azero(newTree + treeCapacity, newCapacity - treeCapacity);
  if (tree != &EMPTY_NODE) aligned_free(tree);
  tree = newTree;
  treeCapacity = newCapacity;
}

BTreeImpl::Iterator BTreeImpl::search(const SearchKey& searchKey) const {
  // Find the "first" row number (in sorted order) for which searchKey.isAfter(rowNumber) returns
  // false.

  uint pos = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    auto& parent = tree[pos].parent;
    pos = parent.children[searchKey.search(parent)];
  }

  auto& leaf = tree[pos].leaf;
  return { tree, &leaf, searchKey.search(leaf) };
}

template <typename T>
struct BTreeImpl::AllocResult {
  uint index;
  T& node;
};

template <typename T>
inline BTreeImpl::AllocResult<T> BTreeImpl::alloc() {
  // Allocate a new item from the freelist. Guaranteed to be zero'd except for the first member.
  uint i = freelistHead;
  NodeUnion* ptr = &tree[i];
  freelistHead = i + 1 + ptr->freelist.nextOffset;
  --freelistSize;
  return { i, *ptr };
}

inline void BTreeImpl::free(uint pos) {
  // Add the given node to the freelist.

  // HACK: This is typically called on a node immediately after copying its contents away, but
  //   the pointer used to copy it away may be a different pointer pointing to a different union
  //   member which the compiler may not recognize as aliasing with this object. Just to be
  //   extra-safe, insert a compiler barrier.
  compilerBarrier();

  auto& node = tree[pos];
  node.freelist.nextOffset = freelistHead - pos - 1;
  azero(node.freelist.zero, kj::size(node.freelist.zero));
  freelistHead = pos;
  ++freelistSize;
}
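
// The freelist is threaded through the free nodes themselves, using a relative encoding: a free
// node at index `pos` stores the distance to the next free node minus one, so that a
// freshly-zeroed node (nextOffset == 0) implicitly points at the node right after it. For
// example, calling free(5) while freelistHead == 9 stores nextOffset = 9 - 5 - 1 = 3 in node 5
// and sets freelistHead = 5; a later alloc() then recovers the old head as 5 + 1 + 3 == 9.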

BTreeImpl::Iterator BTreeImpl::insert(const SearchKey& searchKey) {
  // Like search() but ensures that there is room in the leaf node to insert a new row.

  // If we split the root node it will generate two new nodes. If we split any other node in the
  // path it will generate one new node. `height` doesn't count leaf nodes, but we can
  // equivalently think of it as not counting the root node, so in the worst case we may allocate
  // height + 2 new nodes.
  //
  // (Also note that if the tree is currently empty, then `tree` points to a dummy root node in
  // read-only memory. We definitely need to allocate a real tree node array in this case, and
  // we'll start out allocating space for four nodes, which will be all we need up to 28 rows.)
  if (freelistSize < height + 2) {
    if (height > 0 && !tree[0].parent.isFull() && freelistSize >= height) {
      // Slight optimization: The root node is not full, so we're definitely not going to split
      // it. That means that the maximum allocations we might do is equal to `height`, not
      // `height + 2`, and we have that much space, so no need to grow yet.
      //
      // This optimization is particularly important for small trees, e.g. when treeCapacity is 4
      // and the tree so far consists of a root and two children, we definitely don't need to
      // grow the tree yet.
    } else {
      growTree();

      if (freelistHead == 0) {
        // We have no root yet. Allocate one.
        KJ_ASSERT(alloc<Parent>().index == 0);
      }
    }
  }

  uint pos = 0;

  // Track grandparent node and child index within grandparent.
  Parent* parent = nullptr;
  uint indexInParent = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    Parent& node = insertHelper(searchKey, tree[pos].parent, parent, indexInParent, pos);

    parent = &node;
    indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];
  }

  Leaf& leaf = insertHelper(searchKey, tree[pos].leaf, parent, indexInParent, pos);

  // Fun fact: Unlike erase(), there's no need to climb back up the tree modifying keys, because
  // either the newly-inserted node will not be the last in the leaf (and thus parent keys aren't
  // modified), or the leaf is the last leaf in the tree (and thus there's no parent key to
  // modify).

  return { tree, &leaf, searchKey.search(leaf) };
}

template <typename Node>
Node& BTreeImpl::insertHelper(const SearchKey& searchKey,
    Node& node, Parent* parent, uint indexInParent, uint pos) {
  if (node.isFull()) {
    // This node is full. Need to split.
    if (parent == nullptr) {
      // This is the root node. We need to split into two nodes and create a new root.
      auto n1 = alloc<Node>();
      auto n2 = alloc<Node>();

      uint pivot = split(n2.node, n2.index, node, pos);
      move(n1.node, n1.index, node);

      // Rewrite root to have the two children.
      tree[0].parent.initRoot(pivot, n1.index, n2.index);

      // Increased height.
      ++height;

      // Decide which new branch has our search key.
      if (searchKey.isAfter(pivot)) {
        // the right one
        return n2.node;
      } else {
        // the left one
        return n1.node;
      }
    } else {
      // This is a non-root parent node. We need to split it into two and insert the new node
      // into the grandparent.
      auto n = alloc<Node>();
      uint pivot = split(n.node, n.index, node, pos);

      // Insert new child into grandparent.
      parent->insertAfter(indexInParent, pivot, n.index);

      // Decide which new branch has our search key.
      if (searchKey.isAfter(pivot)) {
        // the new one, which is right of the original
        return n.node;
      } else {
        // the original one, which is left of the new one
        return node;
      }
    }
  } else {
    // No split needed.
    return node;
  }
}
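
// Note that insert() splits any full node encountered on the way down, rather than splitting on
// the way back up after a leaf overflows. That is what makes the `height + 2` freelist check
// above sufficient: e.g. in a tree of height 2, the worst case is a root split (two
// allocations) plus a split of the one remaining parent on the path and of the target leaf (one
// allocation each), totaling 4 == height + 2.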

void BTreeImpl::erase(uint row, const SearchKey& searchKey) {
  // Erase the given row number from the tree. `searchKey` acts as a predicate that matches the
  // given row and all rows after it.

  uint pos = 0;

  // Track grandparent node and child index within grandparent.
  Parent* parent = nullptr;
  uint indexInParent = 0;

  MaybeUint* fixup = nullptr;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    Parent& node = eraseHelper(tree[pos].parent, parent, indexInParent, pos, fixup);

    parent = &node;
    indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];

    if (indexInParent < kj::size(node.keys) && node.keys[indexInParent] == row) {
      // Oh look, the row is a key in this node! We'll need to come back and fix this up later.
      // Note that any particular row can only appear as *one* key value anywhere in the tree, so
      // we only need one fixup pointer, which is nice.
      MaybeUint* newFixup = &node.keys[indexInParent];
      if (fixup == newFixup) {
        // The fixup pointer was already set while processing a parent node, and then a merge or
        // rotate caused it to be moved, but the fixup pointer was updated... so it's already set
        // to point at the slot we wanted it to point to, so nothing to see here.
      } else {
        KJ_DASSERT(fixup == nullptr);
        fixup = newFixup;
      }
    }
  }

  Leaf& leaf = eraseHelper(tree[pos].leaf, parent, indexInParent, pos, fixup);

  uint r = searchKey.search(leaf);
  if (leaf.rows[r] == row) {
    leaf.erase(r);

    if (fixup != nullptr) {
      // There's a key in a parent node that needs fixup. This is only possible if the removed
      // node is the last in its leaf.
      KJ_DASSERT(leaf.rows[r] == nullptr);
      KJ_DASSERT(r > 0);  // non-root nodes must be at least half full so this can't be item 0
      KJ_DASSERT(*fixup == row);
      *fixup = leaf.rows[r - 1];
    }
  } else {
    logInconsistency();
  }
}

template <typename Node>
Node& BTreeImpl::eraseHelper(
    Node& node, Parent* parent, uint indexInParent, uint pos, MaybeUint*& fixup) {
  if (parent != nullptr && !node.isMostlyFull()) {
    // This is not the root, but it's only half-full. Rebalance.
    KJ_DASSERT(node.isHalfFull());

    if (indexInParent > 0) {
      // There's a sibling to the left.
      uint sibPos = parent->children[indexInParent - 1];
      Node& sib = tree[sibPos];
      if (sib.isMostlyFull()) {
        // Left sibling is more than half full. Steal one member.
        rotateRight(sib, node, *parent, indexInParent - 1);
        return node;
      } else {
        // Left sibling is half full, too. Merge.
        KJ_ASSERT(sib.isHalfFull());
        merge(sib, sibPos, *parent->keys[indexInParent - 1], node);
        parent->eraseAfter(indexInParent - 1);
        free(pos);
        if (fixup == &parent->keys[indexInParent]) --fixup;

        if (parent->keys[0] == nullptr) {
          // Oh hah, the parent has no keys left. It must be the root. We can eliminate it.
          KJ_DASSERT(parent == &tree->parent);
          compilerBarrier();  // don't reorder any writes to parent below here
          move(tree[0], 0, sib);
          free(sibPos);
          --height;
          return tree[0];
        } else {
          return sib;
        }
      }
    } else if (indexInParent < Parent::NKEYS && parent->keys[indexInParent] != nullptr) {
      // There's a sibling to the right.
      uint sibPos = parent->children[indexInParent + 1];
      Node& sib = tree[sibPos];
      if (sib.isMostlyFull()) {
        // Right sibling is more than half full. Steal one member.
        rotateLeft(node, sib, *parent, indexInParent, fixup);
        return node;
      } else {
        // Right sibling is half full, too. Merge.
        KJ_ASSERT(sib.isHalfFull());
        merge(node, pos, *parent->keys[indexInParent], sib);
        parent->eraseAfter(indexInParent);
        free(sibPos);
        if (fixup == &parent->keys[indexInParent]) fixup = nullptr;

        if (parent->keys[0] == nullptr) {
          // Oh hah, the parent has no keys left. It must be the root. We can eliminate it.
          KJ_DASSERT(parent == &tree->parent);
          compilerBarrier();  // don't reorder any writes to parent below here
          move(tree[0], 0, node);
          free(pos);
          --height;
          return tree[0];
        } else {
          return node;
        }
      }
    } else {
      KJ_FAIL_ASSERT("inconsistent b-tree");
    }
  }

  return node;
}
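
// A subtlety in the merge cases above: when merging with the *left* sibling,
// eraseAfter(indexInParent - 1) shifts the parent's keys left one slot, so a fixup pointer
// aimed at keys[indexInParent] must follow its key to keys[indexInParent - 1] -- hence
// `--fixup`. When merging with the *right* sibling, the key being erased is the pivot itself,
// so the row no longer appears as a key anywhere and the fixup pointer is dropped entirely.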

void BTreeImpl::renumber(uint oldRow, uint newRow, const SearchKey& searchKey) {
  // Renumber the given row from oldRow to newRow. `searchKey` acts as a predicate that matches
  // oldRow and all rows after it. (It is never evaluated on newRow.)

  uint pos = 0;

  for (auto i KJ_UNUSED: zeroTo(height)) {
    auto& node = tree[pos].parent;
    uint indexInParent = searchKey.search(node);
    pos = node.children[indexInParent];
    if (indexInParent < kj::size(node.keys) && node.keys[indexInParent] == oldRow) {
      node.keys[indexInParent] = newRow;
    }
    KJ_DASSERT(pos != 0);
  }

  auto& leaf = tree[pos].leaf;
  uint r = searchKey.search(leaf);
  if (leaf.rows[r] == oldRow) {
    leaf.rows[r] = newRow;
  } else {
    logInconsistency();
  }
}

uint BTreeImpl::split(Parent& dst, uint dstPos, Parent& src, uint srcPos) {
  constexpr size_t mid = Parent::NKEYS / 2;
  uint pivot = *src.keys[mid];
  acopy(dst.keys, src.keys + mid + 1, Parent::NKEYS - mid - 1);
  azero(src.keys + mid, Parent::NKEYS - mid);
  acopy(dst.children, src.children + mid + 1, Parent::NCHILDREN - mid - 1);
  azero(src.children + mid + 1, Parent::NCHILDREN - mid - 1);
  return pivot;
}

uint BTreeImpl::split(Leaf& dst, uint dstPos, Leaf& src, uint srcPos) {
  constexpr size_t mid = Leaf::NROWS / 2;
  uint pivot = *src.rows[mid - 1];
  acopy(dst.rows, src.rows + mid, Leaf::NROWS - mid);
  azero(src.rows + mid, Leaf::NROWS - mid);

  if (src.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[src.next].leaf.prev = dstPos;
  }
  dst.next = src.next;
  dst.prev = srcPos;
  src.next = dstPos;

  return pivot;
}

void BTreeImpl::merge(Parent& dst, uint dstPos, uint pivot, Parent& src) {
  // merge() is only legal if both nodes are half-empty. Meanwhile, B-tree invariants
  // guarantee that the node can't be more than half-empty, or we would have merged it sooner.
  // (The root can be more than half-empty, but it is never merged with anything.)
  KJ_DASSERT(src.isHalfFull());
  KJ_DASSERT(dst.isHalfFull());

  constexpr size_t mid = Parent::NKEYS/2;
  dst.keys[mid] = pivot;
  acopy(dst.keys + mid + 1, src.keys, mid);
  acopy(dst.children + mid + 1, src.children, mid + 1);
}

void BTreeImpl::merge(Leaf& dst, uint dstPos, uint pivot, Leaf& src) {
  // merge() is only legal if both nodes are half-empty. Meanwhile, B-tree invariants
  // guarantee that the node can't be more than half-empty, or we would have merged it sooner.
  // (The root can be more than half-empty, but it is never merged with anything.)
  KJ_DASSERT(src.isHalfFull());
  KJ_DASSERT(dst.isHalfFull());

  constexpr size_t mid = Leaf::NROWS/2;
  dst.rows[mid] = pivot;
  acopy(dst.rows + mid, src.rows, mid);

  dst.next = src.next;
  if (dst.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[dst.next].leaf.prev = dstPos;
  }
}

void BTreeImpl::move(Parent& dst, uint dstPos, Parent& src) {
  dst = src;
}

void BTreeImpl::move(Leaf& dst, uint dstPos, Leaf& src) {
  dst = src;
  if (src.next == 0) {
    endLeaf = dstPos;
  } else {
    tree[src.next].leaf.prev = dstPos;
  }
  if (src.prev == 0) {
    beginLeaf = dstPos;
  } else {
    tree[src.prev].leaf.next = dstPos;
  }
}
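
// Note the asymmetry between the two split() overloads: splitting a Parent pushes the middle
// key *up* -- it is removed from both halves and survives only as the returned pivot, which the
// caller inserts into the level above -- while splitting a Leaf keeps every row in a leaf and
// merely *copies* the left half's last row as the pivot. Parent keys thus duplicate leaf rows
// rather than own them (the classic B+-tree arrangement), which is also why the Leaf overloads
// maintain the prev/next links of the doubly-linked leaf list while the Parent overloads don't.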

void BTreeImpl::rotateLeft(
    Parent& left, Parent& right, Parent& parent, uint indexInParent, MaybeUint*& fixup) {
  // Steal one item from the right node and move it to the left node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(left.isHalfFull());
  KJ_DASSERT(right.isMostlyFull());

  constexpr size_t mid = Parent::NKEYS/2;
  left.keys[mid] = parent.keys[indexInParent];
  if (fixup == &parent.keys[indexInParent]) fixup = &left.keys[mid];
  parent.keys[indexInParent] = right.keys[0];
  left.children[mid + 1] = right.children[0];
  amove(right.keys, right.keys + 1, Parent::NKEYS - 1);
  right.keys[Parent::NKEYS - 1] = nullptr;
  amove(right.children, right.children + 1, Parent::NCHILDREN - 1);
  right.children[Parent::NCHILDREN - 1] = 0;
}

void BTreeImpl::rotateLeft(
    Leaf& left, Leaf& right, Parent& parent, uint indexInParent, MaybeUint*& fixup) {
  // Steal one item from the right node and move it to the left node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(left.isHalfFull());
  KJ_DASSERT(right.isMostlyFull());

  constexpr size_t mid = Leaf::NROWS/2;
  parent.keys[indexInParent] = left.rows[mid] = right.rows[0];
  if (fixup == &parent.keys[indexInParent]) fixup = nullptr;
  amove(right.rows, right.rows + 1, Leaf::NROWS - 1);
  right.rows[Leaf::NROWS - 1] = nullptr;
}

void BTreeImpl::rotateRight(Parent& left, Parent& right, Parent& parent, uint indexInParent) {
  // Steal one item from the left node and move it to the right node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(right.isHalfFull());
  KJ_DASSERT(left.isMostlyFull());

  constexpr size_t mid = Parent::NKEYS/2;
  amove(right.keys + 1, right.keys, mid);
  amove(right.children + 1, right.children, mid + 1);

  uint back = left.keyCount() - 1;

  right.keys[0] = parent.keys[indexInParent];
  parent.keys[indexInParent] = left.keys[back];
  right.children[0] = left.children[back + 1];
  left.keys[back] = nullptr;
  left.children[back + 1] = 0;
}

void BTreeImpl::rotateRight(Leaf& left, Leaf& right, Parent& parent, uint indexInParent) {
  // Steal one item from the left node and move it to the right node.

  // Like merge(), this is only called on an exactly-half-empty node.
  KJ_DASSERT(right.isHalfFull());
  KJ_DASSERT(left.isMostlyFull());

  constexpr size_t mid = Leaf::NROWS/2;
  amove(right.rows + 1, right.rows, mid);

  uint back = left.size() - 1;

  right.rows[0] = left.rows[back];
  parent.keys[indexInParent] = left.rows[back - 1];
  left.rows[back] = nullptr;
}
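
// Illustrative picture of rotateRight() on leaves, with the separating parent key shown in
// braces (for leaves, the key always equals the last row of the left sibling):
//
//   before:  left = [a b c d e]  {e}  right = [v w]
//   after:   left = [a b c d]    {d}  right = [e v w]
//
// rotateLeft() is the mirror image: right's first row moves to the end of left, and the parent
// key becomes that moved row.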

void BTreeImpl::Parent::initRoot(uint key, uint leftChild, uint rightChild) {
  // HACK: This is typically called on the root node immediately after copying its contents away,
  //   but the pointer used to copy it away may be a different pointer pointing to a different
  //   union member which the compiler may not recognize as aliasing with this object. Just to
  //   be extra-safe, insert a compiler barrier.
  compilerBarrier();

  keys[0] = key;
  children[0] = leftChild;
  children[1] = rightChild;
  azero(keys + 1, Parent::NKEYS - 1);
  azero(children + 2, Parent::NCHILDREN - 2);
}

void BTreeImpl::Parent::insertAfter(uint i, uint splitKey, uint child) {
  KJ_IREQUIRE(children[Parent::NCHILDREN - 1] == 0);  // check not full

  amove(keys + i + 1, keys + i, Parent::NKEYS - (i + 1));
  keys[i] = splitKey;

  amove(children + i + 2, children + i + 1, Parent::NCHILDREN - (i + 2));
  children[i + 1] = child;
}

void BTreeImpl::Parent::eraseAfter(uint i) {
  amove(keys + i, keys + i + 1, Parent::NKEYS - (i + 1));
  keys[Parent::NKEYS - 1] = nullptr;
  amove(children + i + 1, children + i + 2, Parent::NCHILDREN - (i + 2));
  children[Parent::NCHILDREN - 1] = 0;
}

}  // namespace _

// =======================================================================================
// Insertion order

const InsertionOrderIndex::Link InsertionOrderIndex::EMPTY_LINK = { 0, 0 };

InsertionOrderIndex::InsertionOrderIndex(): capacity(0), links(const_cast<Link*>(&EMPTY_LINK)) {}
InsertionOrderIndex::InsertionOrderIndex(InsertionOrderIndex&& other)
    : capacity(other.capacity), links(other.links) {
  other.capacity = 0;
  other.links = const_cast<Link*>(&EMPTY_LINK);
}
InsertionOrderIndex& InsertionOrderIndex::operator=(InsertionOrderIndex&& other) {
  KJ_DASSERT(&other != this);
  capacity = other.capacity;
  links = other.links;
  other.capacity = 0;
  other.links = const_cast<Link*>(&EMPTY_LINK);
  return *this;
}
InsertionOrderIndex::~InsertionOrderIndex() noexcept(false) {
  if (links != &EMPTY_LINK) delete[] links;
}

void InsertionOrderIndex::reserve(size_t size) {
  KJ_ASSERT(size < (1u << 31), "Table too big for InsertionOrderIndex");

  if (size > capacity) {
    // Need to grow.
    // Note that `size` and `capacity` do not include the special link[0].

    // Round up to the next power of 2.
    size_t allocation = 1u << (_::lg(size) + 1);
    KJ_DASSERT(allocation > size);
    KJ_DASSERT(allocation <= size * 2);

    // Round first allocation up to 8.
    allocation = kj::max(allocation, 8);

    Link* newLinks = new Link[allocation];
#ifdef KJ_DEBUG
    // To catch bugs, fill unused links with 0xff.
    memset(newLinks, 0xff, allocation * sizeof(Link));
#endif
    _::acopy(newLinks, links, capacity + 1);
    if (links != &EMPTY_LINK) delete[] links;
    links = newLinks;
    capacity = allocation - 1;
  }
}

void InsertionOrderIndex::clear() {
  links[0] = Link { 0, 0 };

#ifdef KJ_DEBUG
  // To catch bugs, fill unused links with 0xff.
  memset(links + 1, 0xff, capacity * sizeof(Link));
#endif
}

kj::Maybe<size_t> InsertionOrderIndex::insertImpl(size_t pos) {
  if (pos >= capacity) {
    reserve(pos + 1);
  }

  links[pos + 1].prev = links[0].prev;
  links[pos + 1].next = 0;
  links[links[0].prev].next = pos + 1;
  links[0].prev = pos + 1;

  return nullptr;
}
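
// The links array forms a circular doubly-linked list through the special links[0], which acts
// as a sentinel holding both ends of the insertion order: links[0].next is the first row and
// links[0].prev is the last, and an empty index is just { 0, 0 }. Row number N lives at
// links[N + 1]. For example, inserting rows 0 and 1 into an empty index leaves:
//
//   links[0] = { next: 1, prev: 2 }   // sentinel: first = row 0, last = row 1
//   links[1] = { next: 2, prev: 0 }   // row 0
//   links[2] = { next: 0, prev: 1 }   // row 1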

void InsertionOrderIndex::eraseImpl(size_t pos) {
  Link& link = links[pos + 1];
  links[link.next].prev = link.prev;
  links[link.prev].next = link.next;

#ifdef KJ_DEBUG
  memset(&link, 0xff, sizeof(Link));
#endif
}

void InsertionOrderIndex::moveImpl(size_t oldPos, size_t newPos) {
  Link& link = links[oldPos + 1];
  Link& newLink = links[newPos + 1];

  newLink = link;

  KJ_DASSERT(links[link.next].prev == oldPos + 1);
  KJ_DASSERT(links[link.prev].next == oldPos + 1);
  links[link.next].prev = newPos + 1;
  links[link.prev].next = newPos + 1;

#ifdef KJ_DEBUG
  memset(&link, 0xff, sizeof(Link));
#endif
}

}  // namespace kj