-----BEGIN PGP SIGNED MESSAGE-----
"Carl W. Brown" wrote:
> I am checking out my UTF-8 validation rules to see if they are correct.
>
> Check each character to be a valid UTF-8 initial character.
>
> \x00 to \x7f or \xC2 to \xF4
>
> Allow invalid forms such as \xC0 & \xC1 to decode but consider them invalid.
Unicode 3.1 says that these should not be allowed to decode (see the first
and second notes after C12 added by UAX #27).
> A first byte of \xE0 with a second byte less than \xA0, or a first byte of
> \xF0 with a second byte less than \x90, is also an invalid form.
>
> \xED followed by anything >= \xA0 is an encoded surrogate and not a valid
> character.
>
> \xEF\xBF\xBE and \xEF\xBF\xBF are invalid Unicode characters.
>
> Anything greater than \xF4\x8F\xBF\xBF is beyond the Unicode range.
It's arguably simpler to convert to a code point, and then check whether the
code point is valid, than to directly check that the UTF-8 encoding is valid
(see the pseudocode below for precisely what I mean).
Also, if you're converting to, say, UTF-16, then non-character sequences
like \xEF\xBF\xBE and \xEF\xBF\xBF should probably be converted to the
corresponding UTF-16 non-characters (\uFFFE and \uFFFF), rather than being
rejected. (Note: Unicode 3.1 and ISO/IEC 10646-1:2000 differ on this point;
10646 requires them to be rejected.)
Here is some C-like pseudocode for a validating converter from UTF-8 to
UTF-16. It is suitable for cases where a bijective mapping between valid
sequences is needed, provided the ALLOW_IRREGULAR flag is *not* set.
// Set STRICT_ISO10646 for strict ISO/IEC 10646-1:2000 Annex D compliance
// (reject U+FFFE and U+FFFF).
// Set ALLOW_IRREGULAR to tolerate irregular UTF-8 sequences (that is,
// where UTF-16 surrogates have been incorrectly treated as separate
// characters).
// Validating UTF-8 -> UTF-16 converter (C-like pseudocode, not compilable C:
// "output x;" stands for "emit x as the next UTF-16 code unit").
//
// Walks the first utf8len bytes of utf8, decoding one UTF-8 sequence per loop
// iteration, and stops at the first problem.
//
// Returns:
//   VALID     - every byte was consumed as part of a well-formed sequence.
//   TRUNCATED - the input ends before the sequence announced by a lead byte
//               is complete.
//   INVALID   - bad lead or continuation byte, non-shortest form, a surrogate
//               that is not permitted, a value above 0x10FFFF, or (only with
//               STRICT_ISO10646) U+FFFE / U+FFFF.
// NOTE(review): VALID, TRUNCATED, INVALID, ALLOW_IRREGULAR and STRICT_ISO10646
// are not defined in this listing; presumably integer constants / flags
// supplied by the surrounding program.
//
// utf8len must have a signed type because the length checks below compute
// utf8len-1 and utf8len-2, which would wrap around for small unsigned values.
int toUTF16(uint8_t * utf8, int utf8len) { // utf8len type must be signed
uint8_t b0, b1, b2, b3;
uint32_t codepoint, temp;
int i;
// No increment clause: each branch advances i by the bytes it consumes.
for (i = 0; i < utf8len; ) {
b0 = utf8[i++];
if ((b0 & 0x80) == 0) { // 0xxxxxxx
// Single byte (0x00..0x7F): passed through unchanged.
output b0;
} else if ((b0 & 0xE0) == 0xC0) { // 110xxxxx 10xxxxxx
// Two-byte sequence: one more byte is required.
if (i >= utf8len) {
return TRUNCATED;
}
b1 = utf8[i++];
// The trailing byte must have the form 10xxxxxx.
if ((b1 & 0xC0) != 0x80) {
return INVALID;
}
codepoint = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
// Values below 0x80 fit in one byte, so lead bytes 0xC0 and 0xC1 are
// rejected here.
if (codepoint < 0x80) {
return INVALID; // non-shortest form
}
output codepoint;
} else if ((b0 & 0xF0) == 0xE0) { // 1110xxxx 10xxxxxx 10xxxxxx
// Three-byte sequence: two more bytes are required. The length check runs
// before the continuation-byte check, so a sequence cut short is reported
// as TRUNCATED even if the bytes that are present are malformed.
if (i >= utf8len-1) {
return TRUNCATED;
}
b1 = utf8[i++];
b2 = utf8[i++];
if ((b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80) {
return INVALID;
}
codepoint = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
if (ALLOW_IRREGULAR && codepoint >= 0xD800 && codepoint <= 0xDBFF) {
// Irregular form: a high surrogate (0xD800..0xDBFF) encoded as its own
// three-byte sequence. It must be followed immediately by a second
// three-byte sequence that decodes to a low surrogate (0xDC00..0xDFFF).
if (i >= utf8len-2) {
return TRUNCATED;
}
// b0, b1 and b2 are reused for the second sequence; the first value is
// already held in codepoint.
b0 = utf8[i++];
b1 = utf8[i++];
b2 = utf8[i++];
if ((b0 & 0xF0) != 0xE0 || (b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80) {
return INVALID;
}
temp = ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
if (temp < 0xDC00 || temp > 0xDFFF) {
return INVALID;
}
// The two decoded values are emitted unchanged as the UTF-16 pair.
output codepoint;
output temp;
} else if (codepoint < 0x800 // non-shortest form
|| (codepoint >= 0xD800 && codepoint <= 0xDFFF)
|| (STRICT_ISO10646 && codepoint >= 0xFFFE)) {
// Also reached with ALLOW_IRREGULAR set when the value is a lone low
// surrogate, which is rejected. A three-byte sequence cannot exceed
// 0xFFFF, so the last test matches only U+FFFE and U+FFFF.
return INVALID;
} else {
output codepoint;
}
} else if ((b0 & 0xF8) == 0xF0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// Four-byte sequence: three more bytes are required.
if (i >= utf8len-2) {
return TRUNCATED;
}
b1 = utf8[i++];
b2 = utf8[i++];
b3 = utf8[i++];
if ((b1 & 0xC0) != 0x80 || (b2 & 0xC0) != 0x80 || (b3 & 0xC0) != 0x80) {
return INVALID;
}
codepoint = ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) |
((b2 & 0x3F) << 6) | (b3 & 0x3F);
// Only 0x10000..0x10FFFF is accepted; this also rejects lead bytes
// 0xF5..0xF7, whose decoded values are above 0x10FFFF.
if (codepoint < 0x10000 // non-shortest form
|| codepoint > 0x10FFFF) {
return INVALID;
}
// Split the 20-bit offset from 0x10000 into a UTF-16 surrogate pair:
// upper 10 bits -> high surrogate, lower 10 bits -> low surrogate.
temp = codepoint - 0x10000;
output (temp >> 10 ) + 0xD800;
output (temp & 0x3FF) + 0xDC00;
} else {
// Any other lead byte: a stray continuation byte (10xxxxxx) or 11111xxx.
return INVALID;
}
} /* for i */
return VALID;
}
- --
David Hopwood <david.hopwood@zetnet.co.uk>
Home page & PGP public key: http://www.users.zetnet.co.uk/hopwood/
RSA 2048-bit; fingerprint 71 8E A6 23 0E D3 4C E5 0F 69 8C D4 FA 66 15 01
Nothing in this message is intended to be legally binding. If I revoke a
public key but refuse to specify why, it is because the private key has been
seized under the Regulation of Investigatory Powers Act; see www.fipr.org/rip
-----BEGIN PGP SIGNATURE-----
Version: 2.6.3i
Charset: noconv
iQEVAwUBO5vj/zkCAxeYt5gVAQEV3gf/X/u6oXT4zk/pF6yOq6a/xKCfl1/jupWt
Xvxq2BILdpAXAD6GjQzeHDfb6z9avc3xO8thPbuSCbjN/UmqIPEbDjW9gWosAu0i
4Bd6t5Ft3uMBbA4Hp9okQbWlA/dIEasEZxjtSQTK04S0xTTcInVJOpaYGped4vEX
1aINyFGIrbGdJ091coq8dXMjyFl+HFtw37WBm/uftd6/TsM7T49IGTZuFblmuC9a
w3FZ5V+ANO+zEXueZ8JjkgLqMdePVD+qkhcPqXlt2OHRe3YDLPeRqLV0QP59CF7d
LlXjuJKlmztwp84NH3uQGSl8YZv2pcP547+hzDuaY/V4KHoJ24PtAA==
=b6Ht
-----END PGP SIGNATURE-----
This archive was generated by hypermail 2.1.2 : Mon Sep 10 2001 - 21:30:54 EDT