From: Dan Kogai (dankogai@dan.co.jp)
Date: Sat Mar 17 2007 - 14:17:53 CST
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
Folks,
I am really surprised to find that EUC and UTF-8 can be mashed up
easily.
The secret is \xFF. This byte NEVER appears in EUC or UTF-8, so you
can define the combo character as follows:
EUC_UTF8_CHAR = EUC_CHAR | \xFF + UTF8_CHAR
What's that good for? You need fewer bytes to represent text which
consists mostly of EUC-mappable characters plus a few characters which
require full Unicode, which is the case for most documents that are
locally used in SE Asia.
EUC-UTF8 is so simple I already came up with a crude implementation
in Perl 5. The module provides transcoding for euc-jp-utf8,
euc-cn-utf8, and euc-kr-utf8 all at once. The source code follows
right after my signature.
Since UTF-8 is already popular enough, I am not sure how seriously we
should take EUC-UTF8. I just wanted to tell you such an encoding is
possible.
Dan the Maintainer of Encode.pm
====
package Encode::EUCUTF8;
use 5.008001;
use strict;
use warnings;
use Encode 2.12;
# Module version, built from the two numbers in the revision keyword
# ("0.1" is formatted as "0.01").
our $VERSION = sprintf "%d.%02d", q$Revision: 0.1 $ =~ /(\d+)/g;

# Pattern matching exactly ONE EUC-UTF8 character; the whole character is
# captured in $1.  A character is either
#   * a plain EUC sequence (ASCII, 2-byte, 3-byte with \x8f, kana with \x8e), or
#   * the escape byte \xFF followed by one well-formed multi-byte UTF-8
#     sequence (ASCII is never escaped, so the UTF-8 branch starts at U+0080).
# The UTF-8 branch excludes overlong forms, the surrogate range
# U+D800-U+DFFF, and the two code points U+FFFE and U+FFFF.
our $EU_CHAR = qr{(
    [\x00-\x7F]                           # ASCII
  | [\xa1-\xfe][\xa1-\xfe]                # 2byte EUC
  | \x8f[\xa1-\xfe][\xa1-\xfe]            # 3byte EUC
  | \x8e[\xa1-\xdf]                       # EUC Kana
  | \xFF(?:                               # escaped UTF-8, non-ASCII only
        [\xC2-\xDF][\x80-\xBF]            # U+0080   - U+07FF
      | \xE0[\xA0-\xBF][\x80-\xBF]        # U+0800   - U+0FFF
      | [\xE1-\xEC][\x80-\xBF]{2}         # U+1000   - U+CFFF
      | \xED[\x80-\x9F][\x80-\xBF]        # U+D000   - U+D7FF
      | \xEE[\x80-\xBF]{2}                # U+E000   - U+EFFF
      | \xEF[\x80-\xBE][\x80-\xBF]        # U+F000   - U+FFBF
      | \xEF\xBF[\x80-\xBD]               # U+FFC0   - U+FFFD
      | \xF0[\x90-\xBF][\x80-\xBF]{2}     # U+10000  - U+3FFFF
      | [\xF1-\xF3][\x80-\xBF]{3}         # U+40000  - U+FFFFF
      | \xF4[\x80-\x8F][\x80-\xBF]{2}     # U+100000 - U+10FFFF
    )
)}x;
# Build the decode() method for one EUC-UTF8 flavour.
#
# Parameter:
#   $euc_what - name of the base EUC encoding handed to Encode::decode
#               (callers in this file pass 'euc-jp', 'euc-cn', 'euc-kr').
# Returns:
#   A code ref called as ($obj, $bytes, $chk).  It returns a copy of $bytes
#   in which every sequence matched by $Encode::EUCUTF8::EU_CHAR has been
#   decoded: sequences starting with \xFF are decoded as UTF-8 (escape byte
#   stripped), everything else is decoded with $euc_what.  Bytes the pattern
#   does not match are left in place unchanged.  When $chk is true the
#   caller's source buffer ($_[1]) is emptied.
sub gen_decode($) {
    my $euc_what = shift;
    return sub ($$;$) {
        my ( $obj, $bytes, $chk ) = @_;

        # An undefined $bytes is tolerated silently.
        no warnings 'uninitialized';

        $bytes =~ s{
            $Encode::EUCUTF8::EU_CHAR
        }{
            # Copy the capture first: the nested decode calls must not be
            # able to disturb the match variable we are still reading.
            my $char = $1;
            substr( $char, 0, 1 ) eq "\xFF"
              ? Encode::decode( 'utf8', substr( $char, 1 ) )
              : Encode::decode( $euc_what, $char );
        }egx;

        $_[1] = '' if $chk;
        return $bytes;
    };
}
# Build the encode() method for one EUC-UTF8 flavour.
#
# Parameter:
#   the name of the base EUC encoding passed on to Encode::encode.
# Returns:
#   A code ref called as ($obj, $str, $chk).  It encodes $str with the base
#   encoding; every character the base encoding cannot represent is emitted
#   as the escape byte \xFF followed by that character's UTF-8 bytes.  When
#   $chk is true the caller's source buffer ($_[1]) is emptied.
sub gen_encode {
    my ($base_encoding) = @_;

    # Fallback handed to Encode::encode as its CHECK argument: it is given a
    # numeric code point and returns the escaped UTF-8 form of it.
    my $escape_as_utf8 = sub {
        my ($code_point) = @_;
        my $utf8_octets = Encode::encode_utf8( pack "U", $code_point );
        return "\xFF" . $utf8_octets;
    };

    return sub ($$;$) {
        my ( $self, $text, $check ) = @_;
        my $octets = Encode::encode( $base_encoding, $text, $escape_as_utf8 );
        $_[1] = '' if $check;
        return $octets;
    };
}
# Encoding class registered under the name "euc-jp-utf8": EUC-JP as the base
# encoding, with \xFF-escaped UTF-8 for everything else.
package Encode::EUCUTF8::JP;
use base 'Encode::Encoding';

# Both transcoding methods are closures produced by the shared factories,
# specialised for the 'euc-jp' base encoding.
*encode = Encode::EUCUTF8::gen_encode('euc-jp');
*decode = Encode::EUCUTF8::gen_decode('euc-jp');

# Overrides the Encode::Encoding hook of the same name with a true value
# (per the Encode::Encoding documentation: line buffering under PerlIO).
sub needs_lines { return 1 }

__PACKAGE__->Define('euc-jp-utf8');
# Encoding class registered under the name "euc-cn-utf8": EUC-CN as the base
# encoding, with \xFF-escaped UTF-8 for everything else.
package Encode::EUCUTF8::CN;
use base 'Encode::Encoding';

# Both transcoding methods are closures produced by the shared factories,
# specialised for the 'euc-cn' base encoding.
*encode = Encode::EUCUTF8::gen_encode('euc-cn');
*decode = Encode::EUCUTF8::gen_decode('euc-cn');

# Overrides the Encode::Encoding hook of the same name with a true value
# (per the Encode::Encoding documentation: line buffering under PerlIO).
sub needs_lines { return 1 }

__PACKAGE__->Define('euc-cn-utf8');
# Encoding class registered under the name "euc-kr-utf8": EUC-KR as the base
# encoding, with \xFF-escaped UTF-8 for everything else.
package Encode::EUCUTF8::KR;
use base 'Encode::Encoding';

# Both transcoding methods are closures produced by the shared factories,
# specialised for the 'euc-kr' base encoding.
*encode = Encode::EUCUTF8::gen_encode('euc-kr');
*decode = Encode::EUCUTF8::gen_decode('euc-kr');

# Overrides the Encode::Encoding hook of the same name with a true value
# (per the Encode::Encoding documentation: line buffering under PerlIO).
sub needs_lines { return 1 }

__PACKAGE__->Define('euc-kr-utf8');
1;
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.3 (Darwin)
iD8DBQFF/Ez1ErJia/WXtBsRAvT4AJ9igfpe/aqdwp+3RiOMHAGGGTjLRQCgkbmM
b8gM9+6IooXUH32zvqOAuQo=
=PsQO
-----END PGP SIGNATURE-----
This archive was generated by hypermail 2.1.5 : Sat Mar 17 2007 - 14:19:29 CST