From: Frank Yung-Fong Tang (ytang0648@aol.com)
Date: Wed May 19 2004 - 11:27:12 CDT
33 | UTF Conversion Code Update | 2004.06.08 |
The C language source code example for UTF conversions (ConvertUTF.c) has been updated to version 1.2 and is being released for public review and comment. This update includes fixes for several minor bugs. The code can be found at the above link. |
/* NOTE: the firstByteMark comment and table further below are also misleading ("six byte sequence" and "0xF8, 0xFC").
* Index into the table below with the first byte of a UTF-8 sequence to
* get the number of trailing bytes that are supposed to follow it.
*/
/*
 * Lookup table: value of a lead byte -> number of trailing bytes expected
 * after it. Each initializer row covers 32 consecutive byte values; the
 * range is noted at the end of the row.
 * In this version the last row still contains counts of 4 and 5 (lead
 * bytes 0xF8-0xFF), maps 0xF5-0xF7 to 3, and the 0xC0-0xDF row maps
 * 0xC0 and 0xC1 to 1.
 */
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF */
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF */
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 /* 0xE0-0xFF */
};
Although there is code that prevents 5- and 6-byte UTF-8 sequences, the array above misleads
people into thinking that 5- and 6-byte UTF-8 sequences exist. Also, F5-F7 should not map to 3,
and C0 and C1 should not map to 1.
It should be changed to:
/*
 * Lookup table: value of a lead byte -> number of trailing bytes expected
 * after it. Each initializer row covers 32 consecutive byte values; the
 * range is noted at the end of the row.
 * In this version 0xC0, 0xC1 and 0xF5-0xFF all map to 0, and no entry is
 * larger than 3 (i.e. at most four bytes per sequence).
 */
static const char trailingBytesForUTF8[256] = {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x9F */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xBF */
0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0xC0-0xDF: C0, C1 -> 0 */
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0 /* 0xE0-0xFF: F5-FF -> 0 */
};
/*
 * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
 * into the first byte, depending on how many bytes follow. There are
 * as many entries in this table as there are UTF-8 sequence types.
 * (I.e., one byte sequence, two byte... six byte sequence.)
 *
 * NOTE(review): the values suggest the table is indexed by the total
 * number of bytes in the sequence (1..6), with entry 0 unused -- confirm
 * against the code that reads it.
 */
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
/* Figure out how many bytes the result will require */
if (ch < (UTF32)0x80) { bytesToWrite = 1;
} else if (ch < (UTF32)0x800) { bytesToWrite = 2;
} else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
} else if (ch < (UTF32)0x200000) { bytesToWrite = 4;
Shouldn't the last line be
} else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
instead? Where does the 0x200000 come from?
switch (extraBytesToRead) {
case 5: ch += *source++; ch <<= 6;
case 4: ch += *source++; ch <<= 6;
This code also misleads people into thinking that 5- and 6-byte UTF-8 sequences exist.
Also, the following routine
/*
 * Check whether the `length` bytes starting at `source` pass this
 * routine's UTF-8 legality checks.
 *
 * source: pointer to the first (lead) byte of the candidate sequence.
 * length: number of bytes in the candidate sequence; only 1..4 are accepted.
 *
 * Returns false as soon as any check fails, true otherwise.
 *
 * The bytes are examined from the last one back towards the second one,
 * using deliberate fall-through in the outer switch, and the lead byte
 * is checked last.
 */
static Boolean isLegalUTF8(const UTF8 *source, int length) {
UTF8 a;
/* Start one past the final byte; each case pre-decrements before reading. */
const UTF8 *srcptr = source+length;
switch (length) {
/* Any length other than 1, 2, 3 or 4 is rejected outright. */
default: return false;
/* Everything else falls through when "true"... */
/* Fourth byte must lie in 0x80..0xBF. */
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
/* Third byte must lie in 0x80..0xBF. */
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
/* Second byte: only the upper bound is checked here; the lower bound
 * (and a tighter upper bound for 0xF4) depends on the lead byte and is
 * applied by the inner switch below. */
case 2: if ((a = (*--srcptr)) > 0xBF) return false;
switch (*source) {
/* no fall-through in this inner switch */
/* Lead 0xE0: second byte must be >= 0xA0. */
case 0xE0: if (a < 0xA0) return false; break;
/* Lead 0xF0: second byte must be >= 0x90. */
case 0xF0: if (a < 0x90) return false; break;
/* Lead 0xF4: second byte must be <= 0x8F. */
case 0xF4: if (a > 0x8F) return false; break;
/* Every other lead byte: second byte must be >= 0x80.
 * NOTE(review): there is no case for lead byte 0xED, so a second byte in
 * 0xA0..0xBF (the encoding of U+D800..U+DFFF) passes this check. */
default: if (a < 0x80) return false;
}
/* Lead byte: reject 0x80..0xC1 and anything above 0xF4. */
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
if (*source > 0xF4) return false;
}
return true;
}
does NOT match Table 3.1B as defined in Unicode 3.2
(see http://www.unicode.org/reports/tr28/#3_1_conformance)
or Table 3-6, "Well-Formed UTF-8 Byte Sequences", on page 78 of Unicode 4.0.
In particular, the function treats the following range as legal when
it should NOT:
U+D800..U+DFFF ED A0-BF 80-BF
Also, in http://www.unicode.org/Public/BETA/CVTUTF-1-2/harness.c
the following comment is misleading:
/* ---------------------------------------------------------------------
test01 - Spot check a few legal & illegal UTF-8 values only.
This is not an exhaustive test, just a brief one that was
used to develop the "isLegalUTF8" routine.
Legal UTF-8 sequences are:
1st---- 2nd---- 3rd---- 4th---- Codepoints---
00-7F 0000- 007F
C2-DF 80-BF 0080- 07FF
E0 A0-BF 80-BF 0800- 0FFF
E1-EF 80-BF 80-BF 1000- FFFF
F0 90-BF 80-BF 80-BF 10000- 3FFFF
F1-F3 80-BF 80-BF 80-BF 40000- FFFFF
F4 80-8F 80-BF 80-BF 100000-10FFFF
--------------------------------------------------------------------- */
It should be
Legal UTF-8 sequences are:
1st---- 2nd---- 3rd---- 4th---- Codepoints---
00-7F 0000- 007F
C2-DF 80-BF 0080- 07FF
E0 A0-BF 80-BF 0800- 0FFF
E1-EC 80-BF 80-BF 1000- CFFF
ED 80-9F 80-BF D000- D7FF
EE-EF 80-BF 80-BF E000- FFFF
F0 90-BF 80-BF 80-BF 10000- 3FFFF
F1-F3 80-BF 80-BF 80-BF 40000- FFFFF
F4 80-8F 80-BF 80-BF 100000-10FFFF
The following tests should be added to the utf8_testData array to catch the case of
a high or low surrogate being encoded directly in UTF-8:
{ 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* U+D7FF */
{ 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* U+E000 */
{ 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* U+D800 */
{ 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* U+DFFF */
This archive was generated by hypermail 2.1.5 : Wed May 19 2004 - 11:29:25 CDT