2006-06-05
Unicode::Normalize で遊ぶ
Unicode の規格では，文字の合字（リガチャ等）等を統一的に扱えるように，「正規化」という処理が仕様として定まっています。この正規化処理のうち「互換性分解」という処理を行うと副作用として半角カナを全角カナに変換できます（逆に全角カナ→半角カナはできません）。
#!/usr/bin/perl
# Demonstrates that NFKC normalization turns half-width katakana into
# full-width katakana, composing the separate voiced / semi-voiced sound
# marks with their base characters along the way.
use strict;
use warnings;
use utf8;    # the string literal below is UTF-8 source text
use Encode;
use Unicode::Normalize;

# Half-width katakana input; the sound marks are separate characters here.
my $src = 'ﾎﾟﾝｼﾞｭｰｽ';

# NFKC: compatibility decomposition followed by canonical composition.
my $dst = Unicode::Normalize::NFKC($src);

# Encode explicitly: this script sets no encoding layer on STDOUT.
print Encode::encode('UTF-8', "${src} => ${dst}\n");
# OUTPUT is: ﾎﾟﾝｼﾞｭｰｽ => ポンジュース
Unicode の正規化についてはperl5.8のUnicodeサポート および Unicode正規化 が詳しいです。特に後者に
UAX #15によれば、NFKDおよびNFKCは、互換分解によって多くの書式上の差異をなくしてしまうため、任意のテキストに対して無闇に用いてはいけない (Normalization forms KC and KD must not be blindly applied to arbitrary text.) が、大文字・小文字を揃える処理のように考えるのが良く、例えば、特定の文脈においてテキストの核心の意味を特定したり、必ずしも適切ではないテキストを修正したりするのに有用とされています。
と書いてあります通り，本来はたとえばデータベース等で同じようなフィールド値（ⅢとIIIとか）を検索したい場合に用いるものですので，むやみやたらと用いるべきではないそうです。ウェブアプリでのフォーム入力は適用範囲としてまぁまぁ当てはまる気もしますが。
さて。
普通リガチャというと欧文文字でのアクセント付き文字やfiなどを指しますが，日本語でも複数の文字を組み合わせた記号を使ったりします。たとえば，「㈱」とか。
こういった文字たちを，互換分解するとどうなるのか，ためしてみました。
#!/usr/bin/perl
# For a hand-picked list of symbol characters, prints one tab-separated row
# per character: the character itself, its NFKC form, and a description.
# Characters that NFKC leaves unchanged produce no row.
use strict;
use warnings;
use utf8;    # the description literals below are UTF-8 source text
use Encode;
use Unicode::Normalize ();

# Flat list of (character => description) pairs. An array rather than a hash
# so that rows are printed in the order written here.
my @chars = (
    "\x{00a9}" => 'copyright',
    "\x{00be}" => '4分の3',
    "\x{00ab}" => '<<',
    "\x{2103}" => '℃',
    "\x{2121}" => 'TEL',
    "\x{2122}" => 'トレードマーク',
    "\x{2166}" => 'ローマ数字 大7',
    "\x{2177}" => 'ローマ数字 小8',
    "\x{2460}" => '丸付き数字 1',
    "\x{2475}" => '括弧付き数字 2',
    "\x{248a}" => '3.',
    "\x{249f}" => '(d)',
    "\x{24ba}" => '丸付き英字 E',
    "\x{24d5}" => '丸付き英字 f',
    "\x{266d}" => '音楽記号 フラット',
    "\x{266f}" => '音楽記号 シャープ',
    "\x{2749}" => 'balloon spoked asterisk',
    "\x{277c}" => '黒丸数字 7',
    "\x{3004}" => 'JISマーク',
    "\x{3020}" => 'ポストンくん',
    "\x{3036}" => 'マル〒',
    "\x{3037}" => 'ダブルエックス',
    "\x{3227}" => '(八)',
    "\x{3240}" => '(祭)',
    "\x{3299}" => 'マル秘',
    "\x{32c5}" => '6月',
    "\x{33e4}" => '5日',
    "\x{32da}" => 'マルサ',
    "\x{334c}" => 'メガトン',
    "\x{3370}" => '24点',
    "\x{337b}" => '平成',
    "\x{337f}" => '株式会社',
    "\x{3393}" => 'ギガヘルツ',
    "\x{33a0}" => '平方センチメートル',
);

# Consume the list two elements at a time; the list assignment evaluates to
# 0 in scalar context once @chars is empty, which ends the loop.
while ( my ( $chr, $desc ) = splice @chars, 0, 2 ) {
    my $nfkc = Unicode::Normalize::NFKC($chr);

    # Skip characters that NFKC returns unchanged.
    next if $chr eq $nfkc;

    # Encode explicitly: this script sets no encoding layer on STDOUT.
    print Encode::encode( 'UTF-8', "${chr}\t${nfkc}\t${desc}\n" );
}
こんなにも使わなさそうな（失礼）記号がたくさん定義されているんだなとも思いますが，これの実行例は
¾	3⁄4	4分の3
℃	°C	℃
℡	TEL	TEL
™	TM	トレードマーク
Ⅶ	VII	ローマ数字 大7
ⅷ	viii	ローマ数字 小8
①	1	丸付き数字 1
⑵	(2)	括弧付き数字 2
⒊	3.	3.
⒟	(d)	(d)
Ⓔ	E	丸付き英字 E
ⓕ	f	丸付き英字 f
〶	〒	マル〒
㈧	(八)	(八)
㉀	(祭)	(祭)
㊙	秘	マル秘
㋅	6月	6月
㏤	5日	5日
㋚	サ	マルサ
㍌	メガトン	メガトン
㍰	24点	24点
㍻	平成	平成
㍿	株式会社	株式会社
㎓	GHz	ギガヘルツ
㎠	cm2	平方センチメートル
ブラウザですべてを正しく表示できない可能性も高いので，UTF-8 が使えるターミナルをお持ちの方はご自分で試して頂けるとより楽しめると思います。
「㈱」とか「Ⅲ」とか入力で使うなよ！とお怒りの（昔ながらの？）業務アプリ制作者の方や，検索システムを構築してるという方は，気にとめておくとみんなハッピーになれるかもしれません。
