/*
 * Constants describing a 10-bit "half" of a code point.
 * NOTE(review): the use sites are not visible in this excerpt; these are
 * presumably the operands of the UTF-16 surrogate-pair arithmetic in the
 * UTF-32 <-> UTF-16 converters -- confirm against the full file.
 */
/* Shift count: 10 bits, the same width as halfMask below. */
47 static const int halfShift = 10;
/* 0x10000: the first code point above the 16-bit range 0x0000..0xFFFF. */
49 static const UTF32 halfBase = 0x0010000UL;
/* Mask selecting the low 10 bits of a value. */
50 static const UTF32 halfMask = 0x3FFUL;
/*
 * Bounds of the UTF-16 surrogate code unit ranges, as UTF32 values:
 * high (leading) surrogates are 0xD800..0xDBFF and low (trailing)
 * surrogates are 0xDC00..0xDFFF. Together they cover 0xD800..0xDFFF.
 */
52 #define UNI_SUR_HIGH_START (UTF32)0xD800
53 #define UNI_SUR_HIGH_END (UTF32)0xDBFF
54 #define UNI_SUR_LOW_START (UTF32)0xDC00
55 #define UNI_SUR_LOW_END (UTF32)0xDFFF
62 const UTF32** sourceStart,
const UTF32* sourceEnd,
65 const UTF32* source = *sourceStart;
66 UTF16* target = *targetStart;
67 while (source < sourceEnd) {
69 if (target >= targetEnd) {
84 *target++ = (
UTF16)ch;
94 if (target + 1 >= targetEnd) {
103 *sourceStart = source;
104 *targetStart = target;
111 const UTF16** sourceStart,
const UTF16* sourceEnd,
114 const UTF16* source = *sourceStart;
115 UTF32* target = *targetStart;
117 while (source < sourceEnd) {
118 const UTF16* oldSource = source;
123 if (source < sourceEnd) {
148 if (target >= targetEnd) {
154 *sourceStart = source;
155 *targetStart = target;
158 fprintf(stderr,
"ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence (callers index
 * it with *source). The entry is the number of bytes that follow that first
 * byte; callers add 1 to obtain the total sequence length.
 *
 *   0x00..0xBF -> 0      0xF0..0xF7 -> 3
 *   0xC0..0xDF -> 1      0xF8..0xFB -> 4
 *   0xE0..0xEF -> 2      0xFC..0xFF -> 5
 *
 * Each row below holds 32 entries. The table only classifies the first byte;
 * it does not by itself decide whether a sequence is legal (isLegalUTF8 is
 * called separately by the decoders).
 */
174 static const char trailinguint8_tsForUTF8[256] = {
175 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
176 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
177 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
178 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
179 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
180 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
181 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
182 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
/*
 * Correction values indexed by the number of trailing bytes (0..5).
 * The decoders add the raw bytes of a sequence into ch, shifting left by 6
 * between bytes, and then subtract offsetsFromUTF8[extra bytes]. Each entry
 * equals the sum of the marker bits that accumulation leaves behind, e.g.
 *   [1] = (0xC0 << 6) + 0x80                 = 0x00003080
 *   [2] = (0xE0 << 12) + (0x80 << 6) + 0x80  = 0x000E2080
 * so the subtraction leaves only the payload bits of the code point.
 */
190 static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
191 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
/*
 * Marker bits ORed into the first byte of an encoded UTF-8 sequence,
 * indexed by the total number of bytes written (bytesToWrite):
 *   1 -> 0x00, 2 -> 0xC0, 3 -> 0xE0, 4 -> 0xF0, 5 -> 0xF8, 6 -> 0xFC.
 * The encoders visible in this file only set bytesToWrite to 1..4.
 * NOTE(review): index 0 looks like an unused placeholder -- confirm.
 */
200 static const UTF8 firstuint8_tMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
215 const UTF16** sourceStart,
const UTF16* sourceEnd,
218 const UTF16* source = *sourceStart;
219 UTF8* target = *targetStart;
220 while (source < sourceEnd) {
222 unsigned short bytesToWrite = 0;
223 const UTF32 byteMask = 0xBF;
224 const UTF32 byteMark = 0x80;
225 const UTF16* oldSource = source;
230 if (source < sourceEnd) {
256 if (ch < (
UTF32)0x80) { bytesToWrite = 1;
257 }
else if (ch < (
UTF32)0x800) { bytesToWrite = 2;
258 }
else if (ch < (
UTF32)0x10000) { bytesToWrite = 3;
259 }
else if (ch < (
UTF32)0x110000) { bytesToWrite = 4;
260 }
else { bytesToWrite = 3;
264 target += bytesToWrite;
265 if (target > targetEnd) {
269 switch (bytesToWrite) {
270 case 4: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
271 case 3: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
272 case 2: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
273 case 1: *--target = (
UTF8)(ch | firstuint8_tMark[bytesToWrite]);
275 target += bytesToWrite;
277 *sourceStart = source;
278 *targetStart = target;
/*
 * Checks whether the `length` bytes starting at `source` satisfy the byte
 * range rules tested below; every failed check returns false.
 *
 * source: first byte of the candidate sequence.
 * length: number of bytes in the candidate, counting the first byte.
 *
 * The check starts at source+length and steps backwards through the bytes
 * via *--srcptr.
 *
 * NOTE(review): parts of this function are not visible here (the declaration
 * of `a`, the heads of both switch statements and the final return).
 * Presumably the outer switch is on `length`, the inner one on *source, and
 * the function returns true when no check fails -- confirm.
 */
295 static Boolean isLegalUTF8(
const UTF8 *source,
int length) {
/* One past the last byte of the candidate sequence. */
297 const UTF8 *srcptr = source+length;
/* Any value without an explicit case label is rejected outright. */
299 default:
return false;
/* The byte examined for case 4 must lie in 0x80..0xBF. */
301 case 4:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
/* Same 0x80..0xBF range check for the byte examined in case 3. */
302 case 3:
if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
return false;
/* Only the upper bound is tested here; the lower bound is applied by the
 * value-specific cases that follow. */
303 case 2:
if ((a = (*--srcptr)) > 0xBF)
return false;
/* For 0xE0 the byte held in `a` must be at least 0xA0. */
307 case 0xE0:
if (a < 0xA0)
return false;
break;
/* For 0xED the byte held in `a` must be at most 0x9F. */
308 case 0xED:
if (a > 0x9F)
return false;
break;
/* For 0xF0 the byte held in `a` must be at least 0x90. */
309 case 0xF0:
if (a < 0x90)
return false;
break;
/* For 0xF4 the byte held in `a` must be at most 0x8F. */
310 case 0xF4:
if (a > 0x8F)
return false;
break;
/* All other values: `a` only needs the generic lower bound 0x80. */
311 default:
if (a < 0x80)
return false;
/* A first byte in 0x80..0xC1 is rejected. */
314 case 1:
if (*source >= 0x80 && *source < 0xC2)
return false;
/* A first byte above 0xF4 is rejected. */
316 if (*source > 0xF4)
return false;
327 int length = trailinguint8_tsForUTF8[*source]+1;
328 if (source+length > sourceEnd) {
331 return isLegalUTF8(source, length);
337 const UTF8** sourceStart,
const UTF8* sourceEnd,
340 const UTF8* source = *sourceStart;
341 UTF16* target = *targetStart;
342 while (source < sourceEnd) {
344 unsigned short extrauint8_tsToRead = trailinguint8_tsForUTF8[*source];
345 if (source + extrauint8_tsToRead >= sourceEnd) {
349 if (! isLegalUTF8(source, extrauint8_tsToRead+1)) {
356 switch (extrauint8_tsToRead) {
357 case 5: ch += *source++; ch <<= 6;
358 case 4: ch += *source++; ch <<= 6;
359 case 3: ch += *source++; ch <<= 6;
360 case 2: ch += *source++; ch <<= 6;
361 case 1: ch += *source++; ch <<= 6;
362 case 0: ch += *source++;
364 ch -= offsetsFromUTF8[extrauint8_tsToRead];
366 if (target >= targetEnd) {
367 source -= (extrauint8_tsToRead+1);
374 source -= (extrauint8_tsToRead+1);
381 *target++ = (
UTF16)ch;
386 source -= (extrauint8_tsToRead+1);
393 if (target + 1 >= targetEnd) {
394 source -= (extrauint8_tsToRead+1);
402 *sourceStart = source;
403 *targetStart = target;
410 const UTF32** sourceStart,
const UTF32* sourceEnd,
413 const UTF32* source = *sourceStart;
414 UTF8* target = *targetStart;
415 while (source < sourceEnd) {
417 unsigned short bytesToWrite = 0;
418 const UTF32 byteMask = 0xBF;
419 const UTF32 byteMark = 0x80;
433 if (ch < (
UTF32)0x80) { bytesToWrite = 1;
434 }
else if (ch < (
UTF32)0x800) { bytesToWrite = 2;
435 }
else if (ch < (
UTF32)0x10000) { bytesToWrite = 3;
437 }
else { bytesToWrite = 3;
442 target += bytesToWrite;
443 if (target > targetEnd) {
447 switch (bytesToWrite) {
448 case 4: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
449 case 3: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
450 case 2: *--target = (
UTF8)((ch | byteMark) & byteMask); ch >>= 6;
451 case 1: *--target = (
UTF8) (ch | firstuint8_tMark[bytesToWrite]);
453 target += bytesToWrite;
455 *sourceStart = source;
456 *targetStart = target;
463 const UTF8** sourceStart,
const UTF8* sourceEnd,
466 const UTF8* source = *sourceStart;
467 UTF32* target = *targetStart;
468 while (source < sourceEnd) {
470 unsigned short extrauint8_tsToRead = trailinguint8_tsForUTF8[*source];
471 if (source + extrauint8_tsToRead >= sourceEnd) {
475 if (! isLegalUTF8(source, extrauint8_tsToRead+1)) {
482 switch (extrauint8_tsToRead) {
483 case 5: ch += *source++; ch <<= 6;
484 case 4: ch += *source++; ch <<= 6;
485 case 3: ch += *source++; ch <<= 6;
486 case 2: ch += *source++; ch <<= 6;
487 case 1: ch += *source++; ch <<= 6;
488 case 0: ch += *source++;
490 ch -= offsetsFromUTF8[extrauint8_tsToRead];
492 if (target >= targetEnd) {
493 source -= (extrauint8_tsToRead+1);
503 source -= (extrauint8_tsToRead+1);
517 *sourceStart = source;
518 *targetStart = target;
#define UNI_MAX_LEGAL_UTF32
ConversionResult ConvertUTF16toUTF8(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd)
#define UNI_SUR_HIGH_START
#define UNI_SUR_LOW_START
ConversionResult ConvertUTF16toUTF32(const UTF16 **sourceStart, const UTF16 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF32 **targetStart, UTF32 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF32toUTF8(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF8 **targetStart, UTF8 *targetEnd, ConversionFlags flags)
#define UNI_REPLACEMENT_CHAR
ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)
ConversionResult ConvertUTF32toUTF16(const UTF32 **sourceStart, const UTF32 *sourceEnd, UTF16 **targetStart, UTF16 *targetEnd, ConversionFlags flags)