duckstation

duckstation, archived from the revision just before upstream changed it to a proprietary software project; this version is the libre one.
git clone https://git.neptards.moe/u3shit/duckstation.git
Log | Files | Refs | README | LICENSE

ConvertUTF.c (19068B)


      1 /*
      2  * Copyright 2001-2004 Unicode, Inc.
      3  * 
      4  * Disclaimer
      5  * 
      6  * This source code is provided as is by Unicode, Inc. No claims are
      7  * made as to fitness for any particular purpose. No warranties of any
      8  * kind are expressed or implied. The recipient agrees to determine
      9  * applicability of information provided. If this file has been
     10  * purchased on magnetic or optical media from Unicode, Inc., the
     11  * sole remedy for any claim will be exchange of defective media
     12  * within 90 days of receipt.
     13  * 
     14  * Limitations on Rights to Redistribute This Code
     15  * 
     16  * Unicode, Inc. hereby grants the right to freely use the information
     17  * supplied in this file in the creation of products supporting the
     18  * Unicode Standard, and to make copies of this file in any form
     19  * for internal or external distribution as long as this notice
     20  * remains attached.
     21  */
     22 
     23 /* ---------------------------------------------------------------------
     24 
     25     Conversions between UTF32, UTF-16, and UTF-8. Source code file.
     26     Author: Mark E. Davis, 1994.
     27     Rev History: Rick McGowan, fixes & updates May 2001.
     28     Sept 2001: fixed const & error conditions per
     29 	mods suggested by S. Parent & A. Lillich.
     30     June 2002: Tim Dodd added detection and handling of incomplete
     31 	source sequences, enhanced error detection, added casts
     32 	to eliminate compiler warnings.
     33     July 2003: slight mods to back out aggressive FFFE detection.
     34     Jan 2004: updated switches in from-UTF8 conversions.
     35     Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
     36 
     37     See the header file "ConvertUTF.h" for complete documentation.
     38 
     39 ------------------------------------------------------------------------ */
     40 
     41 
     42 #include "ConvertUTF.h"
     43 #ifdef CVTUTF_DEBUG
     44 #include <stdio.h>
     45 #endif
     46 
     47 static const int halfShift  = 10; /* used for shifting by 10 bits */
     48 
     49 static const UTF32 halfBase = 0x0010000UL;
     50 static const UTF32 halfMask = 0x3FFUL;
     51 
     52 #define UNI_SUR_HIGH_START  (UTF32)0xD800
     53 #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
     54 #define UNI_SUR_LOW_START   (UTF32)0xDC00
     55 #define UNI_SUR_LOW_END     (UTF32)0xDFFF
     56 #define false	   0
     57 #define true	    1
     58 
     59 /* --------------------------------------------------------------------- */
     60 
     61 ConversionResult ConvertUTF32toUTF16 (
     62 	const UTF32** sourceStart, const UTF32* sourceEnd, 
     63 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
     64     ConversionResult result = conversionOK;
     65     const UTF32* source = *sourceStart;
     66     UTF16* target = *targetStart;
     67     while (source < sourceEnd) {
     68 	UTF32 ch;
     69 	if (target >= targetEnd) {
     70 	    result = targetExhausted; break;
     71 	}
     72 	ch = *source++;
     73 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
     74 	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
     75 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
     76 		if (flags == strictConversion) {
     77 		    --source; /* return to the illegal value itself */
     78 		    result = sourceIllegal;
     79 		    break;
     80 		} else {
     81 		    *target++ = UNI_REPLACEMENT_CHAR;
     82 		}
     83 	    } else {
     84 		*target++ = (UTF16)ch; /* normal case */
     85 	    }
     86 	} else if (ch > UNI_MAX_LEGAL_UTF32) {
     87 	    if (flags == strictConversion) {
     88 		result = sourceIllegal;
     89 	    } else {
     90 		*target++ = UNI_REPLACEMENT_CHAR;
     91 	    }
     92 	} else {
     93 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
     94 	    if (target + 1 >= targetEnd) {
     95 		--source; /* Back up source pointer! */
     96 		result = targetExhausted; break;
     97 	    }
     98 	    ch -= halfBase;
     99 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
    100 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
    101 	}
    102     }
    103     *sourceStart = source;
    104     *targetStart = target;
    105     return result;
    106 }
    107 
    108 /* --------------------------------------------------------------------- */
    109 
    110 ConversionResult ConvertUTF16toUTF32 (
    111 	const UTF16** sourceStart, const UTF16* sourceEnd, 
    112 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    113     ConversionResult result = conversionOK;
    114     const UTF16* source = *sourceStart;
    115     UTF32* target = *targetStart;
    116     UTF32 ch, ch2;
    117     while (source < sourceEnd) {
    118 	const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
    119 	ch = *source++;
    120 	/* If we have a surrogate pair, convert to UTF32 first. */
    121 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    122 	    /* If the 16 bits following the high surrogate are in the source buffer... */
    123 	    if (source < sourceEnd) {
    124 		ch2 = *source;
    125 		/* If it's a low surrogate, convert to UTF32. */
    126 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
    127 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    128 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
    129 		    ++source;
    130 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
    131 		    --source; /* return to the illegal value itself */
    132 		    result = sourceIllegal;
    133 		    break;
    134 		}
    135 	    } else { /* We don't have the 16 bits following the high surrogate. */
    136 		--source; /* return to the high surrogate */
    137 		result = sourceExhausted;
    138 		break;
    139 	    }
    140 	} else if (flags == strictConversion) {
    141 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    142 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
    143 		--source; /* return to the illegal value itself */
    144 		result = sourceIllegal;
    145 		break;
    146 	    }
    147 	}
    148 	if (target >= targetEnd) {
    149 	    source = oldSource; /* Back up source pointer! */
    150 	    result = targetExhausted; break;
    151 	}
    152 	*target++ = ch;
    153     }
    154     *sourceStart = source;
    155     *targetStart = target;
    156 #ifdef CVTUTF_DEBUG
    157 if (result == sourceIllegal) {
    158     fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
    159     fflush(stderr);
    160 }
    161 #endif
    162     return result;
    163 }
    164 
    165 /* --------------------------------------------------------------------- */
    166 
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 sequences can't have 4 or 5 trailing bytes (the
 * entries for lead bytes 0xF8-0xFF). The table is left as-is for anyone who
 * may want to do such conversion, which was allowed in earlier algorithms.
 * Bytes 0x80-0xBF (continuation bytes) map to 0 here; they are rejected as
 * lead bytes by isLegalUTF8, not by this table.
 */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence; it is indexed by the number of trailing bytes.
 * The decoders add up the raw bytes (shifting left by 6 between bytes),
 * so the sum still contains the UTF-8 tag bits of the lead byte and of
 * each trailing byte; subtracting the matching entry removes them
 * (e.g. for one trailing byte: (0xC0 << 6) + 0x80 == 0x3080).
 */
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
		     0x03C82080UL, 0xFA082080UL, 0x82082080UL };

/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow.  There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... etc.). Remember that sequences
 * for *legal* UTF-8 will be 4 or fewer bytes total.
 * The encoders index this table with the total number of bytes to write,
 * so entry 0 is unused padding.
 */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    201 
    202 /* --------------------------------------------------------------------- */
    203 
    204 /* The interface converts a whole buffer to avoid function-call overhead.
    205  * Constants have been gathered. Loops & conditionals have been removed as
    206  * much as possible for efficiency, in favor of drop-through switches.
    207  * (See "Note A" at the bottom of the file for equivalent code.)
    208  * If your compiler supports it, the "isLegalUTF8" call can be turned
    209  * into an inline function.
    210  */
    211 
    212 /* --------------------------------------------------------------------- */
    213 
    214 ConversionResult ConvertUTF16toUTF8 (
    215 	const UTF16** sourceStart, const UTF16* sourceEnd, 
    216 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    217     ConversionResult result = conversionOK;
    218     const UTF16* source = *sourceStart;
    219     UTF8* target = *targetStart;
    220     while (source < sourceEnd) {
    221 	UTF32 ch;
    222 	unsigned short bytesToWrite = 0;
    223 	const UTF32 byteMask = 0xBF;
    224 	const UTF32 byteMark = 0x80; 
    225 	const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
    226 	ch = *source++;
    227 	/* If we have a surrogate pair, convert to UTF32 first. */
    228 	if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
    229 	    /* If the 16 bits following the high surrogate are in the source buffer... */
    230 	    if (source < sourceEnd) {
    231 		UTF32 ch2 = *source;
    232 		/* If it's a low surrogate, convert to UTF32. */
    233 		if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
    234 		    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
    235 			+ (ch2 - UNI_SUR_LOW_START) + halfBase;
    236 		    ++source;
    237 		} else if (flags == strictConversion) { /* it's an unpaired high surrogate */
    238 		    --source; /* return to the illegal value itself */
    239 		    result = sourceIllegal;
    240 		    break;
    241 		}
    242 	    } else { /* We don't have the 16 bits following the high surrogate. */
    243 		--source; /* return to the high surrogate */
    244 		result = sourceExhausted;
    245 		break;
    246 	    }
    247 	} else if (flags == strictConversion) {
    248 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    249 	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
    250 		--source; /* return to the illegal value itself */
    251 		result = sourceIllegal;
    252 		break;
    253 	    }
    254 	}
    255 	/* Figure out how many bytes the result will require */
    256 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
    257 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    258 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    259 	} else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
    260 	} else {			    bytesToWrite = 3;
    261 					    ch = UNI_REPLACEMENT_CHAR;
    262 	}
    263 
    264 	target += bytesToWrite;
    265 	if (target > targetEnd) {
    266 	    source = oldSource; /* Back up source pointer! */
    267 	    target -= bytesToWrite; result = targetExhausted; break;
    268 	}
    269 	switch (bytesToWrite) { /* note: everything falls through. */
    270 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    271 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    272 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    273 	    case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
    274 	}
    275 	target += bytesToWrite;
    276     }
    277     *sourceStart = source;
    278     *targetStart = target;
    279     return result;
    280 }
    281 
    282 /* --------------------------------------------------------------------- */
    283 
    284 /*
    285  * Utility routine to tell whether a sequence of bytes is legal UTF-8.
    286  * This must be called with the length pre-determined by the first byte.
    287  * If not calling this from ConvertUTF8to*, then the length can be set by:
    288  *  length = trailingBytesForUTF8[*source]+1;
    289  * and the sequence is illegal right away if there aren't that many bytes
    290  * available.
    291  * If presented with a length > 4, this returns false.  The Unicode
    292  * definition of UTF-8 goes up to 4-byte sequences.
    293  */
    294 
    295 static Boolean isLegalUTF8(const UTF8 *source, int length) {
    296     UTF8 a;
    297     const UTF8 *srcptr = source+length;
    298     switch (length) {
    299     default: return false;
    300 	/* Everything else falls through when "true"... */
    301     case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    302     case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    303     case 2: if ((a = (*--srcptr)) > 0xBF) return false;
    304 
    305 	switch (*source) {
    306 	    /* no fall-through in this inner switch */
    307 	    case 0xE0: if (a < 0xA0) return false; break;
    308 	    case 0xED: if (a > 0x9F) return false; break;
    309 	    case 0xF0: if (a < 0x90) return false; break;
    310 	    case 0xF4: if (a > 0x8F) return false; break;
    311 	    default:   if (a < 0x80) return false;
    312 	}
    313 
    314     case 1: if (*source >= 0x80 && *source < 0xC2) return false;
    315     }
    316     if (*source > 0xF4) return false;
    317     return true;
    318 }
    319 
    320 /* --------------------------------------------------------------------- */
    321 
    322 /*
    323  * Exported function to return whether a UTF-8 sequence is legal or not.
    324  * This is not used here; it's just exported.
    325  */
    326 Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
    327     int length = trailingBytesForUTF8[*source]+1;
    328     if (source+length > sourceEnd) {
    329 	return false;
    330     }
    331     return isLegalUTF8(source, length);
    332 }
    333 
    334 /* --------------------------------------------------------------------- */
    335 
    336 ConversionResult ConvertUTF8toUTF16 (
    337 	const UTF8** sourceStart, const UTF8* sourceEnd, 
    338 	UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    339     ConversionResult result = conversionOK;
    340     const UTF8* source = *sourceStart;
    341     UTF16* target = *targetStart;
    342     while (source < sourceEnd) {
    343 	UTF32 ch = 0;
    344 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    345 	if (source + extraBytesToRead >= sourceEnd) {
    346 	    result = sourceExhausted; break;
    347 	}
    348 	/* Do this check whether lenient or strict */
    349 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
    350 	    result = sourceIllegal;
    351 	    break;
    352 	}
    353 	/*
    354 	 * The cases all fall through. See "Note A" below.
    355 	 */
    356 	switch (extraBytesToRead) {
    357 	    case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
    358 	    case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
    359 	    case 3: ch += *source++; ch <<= 6;
    360 	    case 2: ch += *source++; ch <<= 6;
    361 	    case 1: ch += *source++; ch <<= 6;
    362 	    case 0: ch += *source++;
    363 	}
    364 	ch -= offsetsFromUTF8[extraBytesToRead];
    365 
    366 	if (target >= targetEnd) {
    367 	    source -= (extraBytesToRead+1); /* Back up source pointer! */
    368 	    result = targetExhausted; break;
    369 	}
    370 	if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
    371 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    372 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    373 		if (flags == strictConversion) {
    374 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
    375 		    result = sourceIllegal;
    376 		    break;
    377 		} else {
    378 		    *target++ = UNI_REPLACEMENT_CHAR;
    379 		}
    380 	    } else {
    381 		*target++ = (UTF16)ch; /* normal case */
    382 	    }
    383 	} else if (ch > UNI_MAX_UTF16) {
    384 	    if (flags == strictConversion) {
    385 		result = sourceIllegal;
    386 		source -= (extraBytesToRead+1); /* return to the start */
    387 		break; /* Bail out; shouldn't continue */
    388 	    } else {
    389 		*target++ = UNI_REPLACEMENT_CHAR;
    390 	    }
    391 	} else {
    392 	    /* target is a character in range 0xFFFF - 0x10FFFF. */
    393 	    if (target + 1 >= targetEnd) {
    394 		source -= (extraBytesToRead+1); /* Back up source pointer! */
    395 		result = targetExhausted; break;
    396 	    }
    397 	    ch -= halfBase;
    398 	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
    399 	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
    400 	}
    401     }
    402     *sourceStart = source;
    403     *targetStart = target;
    404     return result;
    405 }
    406 
    407 /* --------------------------------------------------------------------- */
    408 
    409 ConversionResult ConvertUTF32toUTF8 (
    410 	const UTF32** sourceStart, const UTF32* sourceEnd, 
    411 	UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    412     ConversionResult result = conversionOK;
    413     const UTF32* source = *sourceStart;
    414     UTF8* target = *targetStart;
    415     while (source < sourceEnd) {
    416 	UTF32 ch;
    417 	unsigned short bytesToWrite = 0;
    418 	const UTF32 byteMask = 0xBF;
    419 	const UTF32 byteMark = 0x80; 
    420 	ch = *source++;
    421 	if (flags == strictConversion ) {
    422 	    /* UTF-16 surrogate values are illegal in UTF-32 */
    423 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    424 		--source; /* return to the illegal value itself */
    425 		result = sourceIllegal;
    426 		break;
    427 	    }
    428 	}
    429 	/*
    430 	 * Figure out how many bytes the result will require. Turn any
    431 	 * illegally large UTF32 things (> Plane 17) into replacement chars.
    432 	 */
    433 	if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
    434 	} else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
    435 	} else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
    436 	} else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
    437 	} else {			    bytesToWrite = 3;
    438 					    ch = UNI_REPLACEMENT_CHAR;
    439 					    result = sourceIllegal;
    440 	}
    441 	
    442 	target += bytesToWrite;
    443 	if (target > targetEnd) {
    444 	    --source; /* Back up source pointer! */
    445 	    target -= bytesToWrite; result = targetExhausted; break;
    446 	}
    447 	switch (bytesToWrite) { /* note: everything falls through. */
    448 	    case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    449 	    case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    450 	    case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
    451 	    case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
    452 	}
    453 	target += bytesToWrite;
    454     }
    455     *sourceStart = source;
    456     *targetStart = target;
    457     return result;
    458 }
    459 
    460 /* --------------------------------------------------------------------- */
    461 
    462 ConversionResult ConvertUTF8toUTF32 (
    463 	const UTF8** sourceStart, const UTF8* sourceEnd, 
    464 	UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    465     ConversionResult result = conversionOK;
    466     const UTF8* source = *sourceStart;
    467     UTF32* target = *targetStart;
    468     while (source < sourceEnd) {
    469 	UTF32 ch = 0;
    470 	unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
    471 	if (source + extraBytesToRead >= sourceEnd) {
    472 	    result = sourceExhausted; break;
    473 	}
    474 	/* Do this check whether lenient or strict */
    475 	if (! isLegalUTF8(source, extraBytesToRead+1)) {
    476 	    result = sourceIllegal;
    477 	    break;
    478 	}
    479 	/*
    480 	 * The cases all fall through. See "Note A" below.
    481 	 */
    482 	switch (extraBytesToRead) {
    483 	    case 5: ch += *source++; ch <<= 6;
    484 	    case 4: ch += *source++; ch <<= 6;
    485 	    case 3: ch += *source++; ch <<= 6;
    486 	    case 2: ch += *source++; ch <<= 6;
    487 	    case 1: ch += *source++; ch <<= 6;
    488 	    case 0: ch += *source++;
    489 	}
    490 	ch -= offsetsFromUTF8[extraBytesToRead];
    491 
    492 	if (target >= targetEnd) {
    493 	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
    494 	    result = targetExhausted; break;
    495 	}
    496 	if (ch <= UNI_MAX_LEGAL_UTF32) {
    497 	    /*
    498 	     * UTF-16 surrogate values are illegal in UTF-32, and anything
    499 	     * over Plane 17 (> 0x10FFFF) is illegal.
    500 	     */
    501 	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
    502 		if (flags == strictConversion) {
    503 		    source -= (extraBytesToRead+1); /* return to the illegal value itself */
    504 		    result = sourceIllegal;
    505 		    break;
    506 		} else {
    507 		    *target++ = UNI_REPLACEMENT_CHAR;
    508 		}
    509 	    } else {
    510 		*target++ = ch;
    511 	    }
    512 	} else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
    513 	    result = sourceIllegal;
    514 	    *target++ = UNI_REPLACEMENT_CHAR;
    515 	}
    516     }
    517     *sourceStart = source;
    518     *targetStart = target;
    519     return result;
    520 }
    521 
    522 /* ---------------------------------------------------------------------
    523 
    524     Note A.
    525     The fall-through switches in UTF-8 reading code save a
    526     temp variable, some decrements & conditionals.  The switches
    527     are equivalent to the following loop:
    528 	{
    529 	    int tmpBytesToRead = extraBytesToRead+1;
    530 	    do {
    531 		ch += *source++;
    532 		--tmpBytesToRead;
    533 		if (tmpBytesToRead) ch <<= 6;
    534 	    } while (tmpBytesToRead > 0);
    535 	}
    536     In UTF-8 writing code, the switches on "bytesToWrite" are
    537     similarly unrolled loops.
    538 
    539    --------------------------------------------------------------------- */