string.c++ (20336B)
1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors 2 // Licensed under the MIT License: 3 // 4 // Permission is hereby granted, free of charge, to any person obtaining a copy 5 // of this software and associated documentation files (the "Software"), to deal 6 // in the Software without restriction, including without limitation the rights 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 // copies of the Software, and to permit persons to whom the Software is 9 // furnished to do so, subject to the following conditions: 10 // 11 // The above copyright notice and this permission notice shall be included in 12 // all copies or substantial portions of the Software. 13 // 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 // THE SOFTWARE. 21 22 #include "string.h" 23 #include "debug.h" 24 #include <stdio.h> 25 #include <float.h> 26 #include <errno.h> 27 #include <stdlib.h> 28 #include <stdint.h> 29 30 namespace kj { 31 32 #if _MSC_VER && !defined(__clang__) 33 #pragma warning(disable: 4996) 34 // Warns that sprintf() is buffer-overrunny. We know that, it's cool. 35 #endif 36 37 namespace { 38 bool isHex(const char *s) { 39 if (*s == '-') s++; 40 return s[0] == '0' && (s[1] == 'x' || s[1] == 'X'); 41 } 42 43 long long parseSigned(const StringPtr& s, long long min, long long max) { 44 KJ_REQUIRE(s != nullptr, "String does not contain valid number", s) { return 0; } 45 char *endPtr; 46 errno = 0; 47 auto value = strtoll(s.begin(), &endPtr, isHex(s.cStr()) ? 16 : 10); 48 KJ_REQUIRE(endPtr == s.end(), "String does not contain valid number", s) { return 0; } 49 KJ_REQUIRE(errno != ERANGE, "Value out-of-range", s) { return 0; } 50 KJ_REQUIRE(value >= min && value <= max, "Value out-of-range", value, min, max) { return 0; } 51 return value; 52 } 53 54 unsigned long long parseUnsigned(const StringPtr& s, unsigned long long max) { 55 KJ_REQUIRE(s != nullptr, "String does not contain valid number", s) { return 0; } 56 char *endPtr; 57 errno = 0; 58 auto value = strtoull(s.begin(), &endPtr, isHex(s.cStr()) ? 16 : 10); 59 KJ_REQUIRE(endPtr == s.end(), "String does not contain valid number", s) { return 0; } 60 KJ_REQUIRE(errno != ERANGE, "Value out-of-range", s) { return 0; } 61 KJ_REQUIRE(value <= max, "Value out-of-range", value, max) { return 0; } 62 //strtoull("-1") does not fail with ERANGE 63 KJ_REQUIRE(s[0] != '-', "Value out-of-range", s) { return 0; } 64 return value; 65 } 66 67 template <typename T> 68 T parseInteger(const StringPtr& s) { 69 if (static_cast<T>(minValue) < 0) { 70 long long min = static_cast<T>(minValue); 71 long long max = static_cast<T>(maxValue); 72 return static_cast<T>(parseSigned(s, min, max)); 73 } else { 74 unsigned long long max = static_cast<T>(maxValue); 75 return static_cast<T>(parseUnsigned(s, max)); 76 } 77 } 78 79 } // namespace 80 81 #define PARSE_AS_INTEGER(T) \ 82 template <> T StringPtr::parseAs<T>() const { return parseInteger<T>(*this); } 83 PARSE_AS_INTEGER(char); 84 PARSE_AS_INTEGER(signed char); 85 PARSE_AS_INTEGER(unsigned char); 86 PARSE_AS_INTEGER(short); 87 PARSE_AS_INTEGER(unsigned short); 88 PARSE_AS_INTEGER(int); 89 PARSE_AS_INTEGER(unsigned int); 90 PARSE_AS_INTEGER(long); 91 PARSE_AS_INTEGER(unsigned long); 92 PARSE_AS_INTEGER(long long); 93 PARSE_AS_INTEGER(unsigned long long); 94 #undef PARSE_AS_INTEGER 95 96 String heapString(size_t size) { 97 char* buffer = _::HeapArrayDisposer::allocate<char>(size + 1); 98 buffer[size] = '\0'; 99 return String(buffer, size, _::HeapArrayDisposer::instance); 100 } 101 102 String heapString(const char* value, size_t size) { 103 char* buffer = _::HeapArrayDisposer::allocate<char>(size + 1); 104 if (size != 0u) { 105 memcpy(buffer, value, size); 106 } 107 buffer[size] = '\0'; 108 return String(buffer, size, _::HeapArrayDisposer::instance); 109 } 110 111 template <typename T> 112 static CappedArray<char, sizeof(T) * 2 + 1> hexImpl(T i) { 113 // We don't use sprintf() because it's not async-signal-safe (for strPreallocated()). 114 CappedArray<char, sizeof(T) * 2 + 1> result; 115 uint8_t reverse[sizeof(T) * 2]; 116 uint8_t* p = reverse; 117 if (i == 0) { 118 *p++ = 0; 119 } else { 120 while (i > 0) { 121 *p++ = i % 16; 122 i /= 16; 123 } 124 } 125 126 char* p2 = result.begin(); 127 while (p > reverse) { 128 *p2++ = "0123456789abcdef"[*--p]; 129 } 130 result.setSize(p2 - result.begin()); 131 return result; 132 } 133 134 #define HEXIFY_INT(type) \ 135 CappedArray<char, sizeof(type) * 2 + 1> hex(type i) { \ 136 return hexImpl<type>(i); \ 137 } 138 139 HEXIFY_INT(unsigned char); 140 HEXIFY_INT(unsigned short); 141 HEXIFY_INT(unsigned int); 142 HEXIFY_INT(unsigned long); 143 HEXIFY_INT(unsigned long long); 144 145 #undef HEXIFY_INT 146 147 namespace _ { // private 148 149 StringPtr Stringifier::operator*(decltype(nullptr)) const { 150 return "nullptr"; 151 } 152 153 StringPtr Stringifier::operator*(bool b) const { 154 return b ? StringPtr("true") : StringPtr("false"); 155 } 156 157 template <typename T, typename Unsigned> 158 static CappedArray<char, sizeof(T) * 3 + 2> stringifyImpl(T i) { 159 // We don't use sprintf() because it's not async-signal-safe (for strPreallocated()). 160 CappedArray<char, sizeof(T) * 3 + 2> result; 161 bool negative = i < 0; 162 // Note that if `i` is the most-negative value, negating it produces the same bit value. But 163 // since it's a signed integer, this is considered an overflow. We therefore must make it 164 // unsigned first, then negate it, to avoid ubsan complaining. 165 Unsigned u = i; 166 if (negative) u = -u; 167 uint8_t reverse[sizeof(T) * 3 + 1]; 168 uint8_t* p = reverse; 169 if (u == 0) { 170 *p++ = 0; 171 } else { 172 while (u > 0) { 173 *p++ = u % 10; 174 u /= 10; 175 } 176 } 177 178 char* p2 = result.begin(); 179 if (negative) *p2++ = '-'; 180 while (p > reverse) { 181 *p2++ = '0' + *--p; 182 } 183 result.setSize(p2 - result.begin()); 184 return result; 185 } 186 187 #define STRINGIFY_INT(type, unsigned) \ 188 CappedArray<char, sizeof(type) * 3 + 2> Stringifier::operator*(type i) const { \ 189 return stringifyImpl<type, unsigned>(i); \ 190 } 191 192 STRINGIFY_INT(signed char, uint); 193 STRINGIFY_INT(unsigned char, uint); 194 STRINGIFY_INT(short, uint); 195 STRINGIFY_INT(unsigned short, uint); 196 STRINGIFY_INT(int, uint); 197 STRINGIFY_INT(unsigned int, uint); 198 STRINGIFY_INT(long, unsigned long); 199 STRINGIFY_INT(unsigned long, unsigned long); 200 STRINGIFY_INT(long long, unsigned long long); 201 STRINGIFY_INT(unsigned long long, unsigned long long); 202 203 #undef STRINGIFY_INT 204 205 CappedArray<char, sizeof(const void*) * 2 + 1> Stringifier::operator*(const void* i) const { \ 206 return hexImpl<uintptr_t>(reinterpret_cast<uintptr_t>(i)); 207 } 208 209 namespace { 210 211 // ---------------------------------------------------------------------- 212 // DoubleToBuffer() 213 // FloatToBuffer() 214 // Copied from Protocol Buffers, (C) Google, BSD license. 215 // Kenton wrote this code originally. The following commentary is 216 // from the original. 217 // 218 // Description: converts a double or float to a string which, if 219 // passed to NoLocaleStrtod(), will produce the exact same original double 220 // (except in case of NaN; all NaNs are considered the same value). 221 // We try to keep the string short but it's not guaranteed to be as 222 // short as possible. 223 // 224 // DoubleToBuffer() and FloatToBuffer() write the text to the given 225 // buffer and return it. The buffer must be at least 226 // kDoubleToBufferSize bytes for doubles and kFloatToBufferSize 227 // bytes for floats. kFastToBufferSize is also guaranteed to be large 228 // enough to hold either. 229 // 230 // We want to print the value without losing precision, but we also do 231 // not want to print more digits than necessary. This turns out to be 232 // trickier than it sounds. Numbers like 0.2 cannot be represented 233 // exactly in binary. If we print 0.2 with a very large precision, 234 // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167". 235 // On the other hand, if we set the precision too low, we lose 236 // significant digits when printing numbers that actually need them. 237 // It turns out there is no precision value that does the right thing 238 // for all numbers. 239 // 240 // Our strategy is to first try printing with a precision that is never 241 // over-precise, then parse the result with strtod() to see if it 242 // matches. If not, we print again with a precision that will always 243 // give a precise result, but may use more digits than necessary. 244 // 245 // An arguably better strategy would be to use the algorithm described 246 // in "How to Print Floating-Point Numbers Accurately" by Steele & 247 // White, e.g. as implemented by David M. Gay's dtoa(). It turns out, 248 // however, that the following implementation is about as fast as 249 // DMG's code. Furthermore, DMG's code locks mutexes, which means it 250 // will not scale well on multi-core machines. DMG's code is slightly 251 // more accurate (in that it will never use more digits than 252 // necessary), but this is probably irrelevant for most users. 253 // 254 // Rob Pike and Ken Thompson also have an implementation of dtoa() in 255 // third_party/fmt/fltfmt.cc. Their implementation is similar to this 256 // one in that it makes guesses and then uses strtod() to check them. 257 // Their implementation is faster because they use their own code to 258 // generate the digits in the first place rather than use snprintf(), 259 // thus avoiding format string parsing overhead. However, this makes 260 // it considerably more complicated than the following implementation, 261 // and it is embedded in a larger library. If speed turns out to be 262 // an issue, we could re-implement this in terms of their 263 // implementation. 264 // ---------------------------------------------------------------------- 265 266 #ifdef _WIN32 267 // MSVC has only _snprintf, not snprintf. 268 // 269 // MinGW has both snprintf and _snprintf, but they appear to be different 270 // functions. The former is buggy. When invoked like so: 271 // char buffer[32]; 272 // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f); 273 // it prints "1.23000e+10". This is plainly wrong: %g should never print 274 // trailing zeros after the decimal point. For some reason this bug only 275 // occurs with some input values, not all. In any case, _snprintf does the 276 // right thing, so we use it. 277 #define snprintf _snprintf 278 #endif 279 280 inline bool IsNaN(double value) { 281 // NaN is never equal to anything, even itself. 282 return value != value; 283 } 284 285 // In practice, doubles should never need more than 24 bytes and floats 286 // should never need more than 14 (including null terminators), but we 287 // overestimate to be safe. 288 static const int kDoubleToBufferSize = 32; 289 static const int kFloatToBufferSize = 24; 290 291 static inline bool IsValidFloatChar(char c) { 292 return ('0' <= c && c <= '9') || 293 c == 'e' || c == 'E' || 294 c == '+' || c == '-'; 295 } 296 297 void DelocalizeRadix(char* buffer) { 298 // Fast check: if the buffer has a normal decimal point, assume no 299 // translation is needed. 300 if (strchr(buffer, '.') != NULL) return; 301 302 // Find the first unknown character. 303 while (IsValidFloatChar(*buffer)) ++buffer; 304 305 if (*buffer == '\0') { 306 // No radix character found. 307 return; 308 } 309 310 // We are now pointing at the locale-specific radix character. Replace it 311 // with '.'. 312 *buffer = '.'; 313 ++buffer; 314 315 if (!IsValidFloatChar(*buffer) && *buffer != '\0') { 316 // It appears the radix was a multi-byte character. We need to remove the 317 // extra bytes. 318 char* target = buffer; 319 do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0'); 320 memmove(target, buffer, strlen(buffer) + 1); 321 } 322 } 323 324 void RemovePlus(char* buffer) { 325 // Remove any + characters because they are redundant and ugly. 326 327 for (;;) { 328 buffer = strchr(buffer, '+'); 329 if (buffer == NULL) { 330 return; 331 } 332 memmove(buffer, buffer + 1, strlen(buffer + 1) + 1); 333 } 334 } 335 336 #if _WIN32 337 void RemoveE0(char* buffer) { 338 // Remove redundant leading 0's after an e, e.g. 1e012. Seems to appear on 339 // Windows. 340 341 // Find and skip 'e'. 342 char* ptr = strchr(buffer, 'e'); 343 if (ptr == nullptr) return; 344 ++ptr; 345 346 // Skip '-'. 347 if (*ptr == '-') ++ptr; 348 349 // Skip '0's. 350 char* ptr2 = ptr; 351 while (*ptr2 == '0') ++ptr2; 352 353 // If we went past the last digit, back up one. 354 if (*ptr2 < '0' || *ptr2 > '9') --ptr2; 355 356 // Move bytes backwards. 357 if (ptr2 > ptr) { 358 memmove(ptr, ptr2, strlen(ptr2) + 1); 359 } 360 } 361 #endif 362 363 char* DoubleToBuffer(double value, char* buffer) { 364 // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all 365 // platforms these days. Just in case some system exists where DBL_DIG 366 // is significantly larger -- and risks overflowing our buffer -- we have 367 // this assert. 368 static_assert(DBL_DIG < 20, "DBL_DIG is too big."); 369 370 if (value == inf()) { 371 strcpy(buffer, "inf"); 372 return buffer; 373 } else if (value == -inf()) { 374 strcpy(buffer, "-inf"); 375 return buffer; 376 } else if (IsNaN(value)) { 377 strcpy(buffer, "nan"); 378 return buffer; 379 } 380 381 int snprintf_result KJ_UNUSED = 382 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value); 383 384 // The snprintf should never overflow because the buffer is significantly 385 // larger than the precision we asked for. 386 KJ_DASSERT(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize); 387 388 // We need to make parsed_value volatile in order to force the compiler to 389 // write it out to the stack. Otherwise, it may keep the value in a 390 // register, and if it does that, it may keep it as a long double instead 391 // of a double. This long double may have extra bits that make it compare 392 // unequal to "value" even though it would be exactly equal if it were 393 // truncated to a double. 394 volatile double parsed_value = strtod(buffer, NULL); 395 if (parsed_value != value) { 396 int snprintf_result2 KJ_UNUSED = 397 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value); 398 399 // Should never overflow; see above. 400 KJ_DASSERT(snprintf_result2 > 0 && snprintf_result2 < kDoubleToBufferSize); 401 } 402 403 DelocalizeRadix(buffer); 404 RemovePlus(buffer); 405 #if _WIN32 406 RemoveE0(buffer); 407 #endif // _WIN32 408 return buffer; 409 } 410 411 bool safe_strtof(const char* str, float* value) { 412 char* endptr; 413 errno = 0; // errno only gets set on errors 414 #if defined(_WIN32) || defined (__hpux) // has no strtof() 415 *value = static_cast<float>(strtod(str, &endptr)); 416 #else 417 *value = strtof(str, &endptr); 418 #endif 419 return *str != 0 && *endptr == 0 && errno == 0; 420 } 421 422 char* FloatToBuffer(float value, char* buffer) { 423 // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all 424 // platforms these days. Just in case some system exists where FLT_DIG 425 // is significantly larger -- and risks overflowing our buffer -- we have 426 // this assert. 427 static_assert(FLT_DIG < 10, "FLT_DIG is too big"); 428 429 if (value == inf()) { 430 strcpy(buffer, "inf"); 431 return buffer; 432 } else if (value == -inf()) { 433 strcpy(buffer, "-inf"); 434 return buffer; 435 } else if (IsNaN(value)) { 436 strcpy(buffer, "nan"); 437 return buffer; 438 } 439 440 int snprintf_result KJ_UNUSED = 441 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value); 442 443 // The snprintf should never overflow because the buffer is significantly 444 // larger than the precision we asked for. 445 KJ_DASSERT(snprintf_result > 0 && snprintf_result < kFloatToBufferSize); 446 447 float parsed_value; 448 if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) { 449 int snprintf_result2 KJ_UNUSED = 450 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+2, value); 451 452 // Should never overflow; see above. 453 KJ_DASSERT(snprintf_result2 > 0 && snprintf_result2 < kFloatToBufferSize); 454 } 455 456 DelocalizeRadix(buffer); 457 RemovePlus(buffer); 458 #if _WIN32 459 RemoveE0(buffer); 460 #endif // _WIN32 461 return buffer; 462 } 463 464 // ---------------------------------------------------------------------- 465 // NoLocaleStrtod() 466 // This code will make you cry. 467 // ---------------------------------------------------------------------- 468 469 namespace { 470 471 // Returns a string identical to *input except that the character pointed to 472 // by radix_pos (which should be '.') is replaced with the locale-specific 473 // radix character. 474 kj::String LocalizeRadix(const char* input, const char* radix_pos) { 475 // Determine the locale-specific radix character by calling sprintf() to 476 // print the number 1.5, then stripping off the digits. As far as I can 477 // tell, this is the only portable, thread-safe way to get the C library 478 // to divuldge the locale's radix character. No, localeconv() is NOT 479 // thread-safe. 480 char temp[16]; 481 int size = sprintf(temp, "%.1f", 1.5); 482 KJ_ASSERT(temp[0] == '1'); 483 KJ_ASSERT(temp[size-1] == '5'); 484 KJ_ASSERT(size <= 6); 485 486 // Now replace the '.' in the input with it. 487 return kj::str( 488 kj::arrayPtr(input, radix_pos), 489 kj::arrayPtr(temp + 1, size - 2), 490 kj::StringPtr(radix_pos + 1)); 491 } 492 493 } // namespace 494 495 double NoLocaleStrtod(const char* text, char** original_endptr) { 496 // We cannot simply set the locale to "C" temporarily with setlocale() 497 // as this is not thread-safe. Instead, we try to parse in the current 498 // locale first. If parsing stops at a '.' character, then this is a 499 // pretty good hint that we're actually in some other locale in which 500 // '.' is not the radix character. 501 502 char* temp_endptr; 503 double result = strtod(text, &temp_endptr); 504 if (original_endptr != NULL) *original_endptr = temp_endptr; 505 if (*temp_endptr != '.') return result; 506 507 // Parsing halted on a '.'. Perhaps we're in a different locale? Let's 508 // try to replace the '.' with a locale-specific radix character and 509 // try again. 510 kj::String localized = LocalizeRadix(text, temp_endptr); 511 const char* localized_cstr = localized.cStr(); 512 char* localized_endptr; 513 result = strtod(localized_cstr, &localized_endptr); 514 if ((localized_endptr - localized_cstr) > 515 (temp_endptr - text)) { 516 // This attempt got further, so replacing the decimal must have helped. 517 // Update original_endptr to point at the right location. 518 if (original_endptr != NULL) { 519 // size_diff is non-zero if the localized radix has multiple bytes. 520 int size_diff = localized.size() - strlen(text); 521 // const_cast is necessary to match the strtod() interface. 522 *original_endptr = const_cast<char*>( 523 text + (localized_endptr - localized_cstr - size_diff)); 524 } 525 } 526 527 return result; 528 } 529 530 // ---------------------------------------------------------------------- 531 // End of code copied from Protobuf 532 // ---------------------------------------------------------------------- 533 534 } // namespace 535 536 CappedArray<char, kFloatToBufferSize> Stringifier::operator*(float f) const { 537 CappedArray<char, kFloatToBufferSize> result; 538 result.setSize(strlen(FloatToBuffer(f, result.begin()))); 539 return result; 540 } 541 542 CappedArray<char, kDoubleToBufferSize> Stringifier::operator*(double f) const { 543 CappedArray<char, kDoubleToBufferSize> result; 544 result.setSize(strlen(DoubleToBuffer(f, result.begin()))); 545 return result; 546 } 547 548 double parseDouble(const StringPtr& s) { 549 KJ_REQUIRE(s != nullptr, "String does not contain valid number", s) { return 0; } 550 char *endPtr; 551 errno = 0; 552 auto value = _::NoLocaleStrtod(s.begin(), &endPtr); 553 KJ_REQUIRE(endPtr == s.end(), "String does not contain valid floating number", s) { return 0; } 554 #if _WIN32 || __CYGWIN__ || __BIONIC__ 555 // When Windows' strtod() parses "nan", it returns a value with the sign bit set. But, our 556 // preferred canonical value for NaN does not have the sign bit set, and all other platforms 557 // return one without the sign bit set. So, on Windows, detect NaN and return our preferred 558 // version. 559 // 560 // Cygwin seemingly does not try to emulate Linux behavior here, but rather allows Windows' 561 // behavior to leak through. (Conversely, WINE actually produces the Linux behavior despite 562 // trying to behave like Win32...) 563 // 564 // Bionic (Android) failed the unit test and so I added it to the list without investigating 565 // further. 566 if (isNaN(value)) { 567 // NaN 568 return kj::nan(); 569 } 570 #endif 571 return value; 572 } 573 574 } // namespace _ (private) 575 576 template <> double StringPtr::parseAs<double>() const { return _::parseDouble(*this); } 577 template <> float StringPtr::parseAs<float>() const { return _::parseDouble(*this); } 578 579 } // namespace kj