Front page | perl.perl5.porters |
Postings from February 2012
Unicode cheatsheet for Perl
Thread Next
From:
Tom Christiansen
Date:
February 20, 2012 09:28
Subject:
Unicode cheatsheet for Perl
Message ID:
3786.1329758853@chthon
Inspired by how scandalously Unicode-deficient the
otherwise fine 4-way polyglot table comparing PHP, Perl,
Python, and Ruby is at
http://hyperpolyglot.org/scripting
I created a quick Unicode cheatsheet for Perl, mostly by
mining the examples in the new 4th edition of the Camel.
Gee, I foresee a *whole* lot of "impossibles" in the
other three languages' columns, don't you? :)
Hm, have I left anything out that Perl is especially cool with?
I almost wonder whether this sort of thing oughtn't be a manpage,
something like perluni{ref,cheat,quick}?
--tom
=Characters and their numbers
# ASCII
ord("A")
chr(65)
# BMP
ord("Σ")
chr(0x3A3)
# beyond the BMP
ord("𝑛")
chr(0x1D45B)
# beyond Unicode (up to MAXINT)
ord("\x{20_0000}")
chr(0x20_0000)
=Unicode literals by character number
String: "\x{3a3}"
Regex: /\x{3a3}/
String: "\x{1d45b}"
Regex: /\x{1d45b}/
# even non-BMP ranges in regex work fine
/[\x{1D434}-\x{1D467}]/
=Get character name by number
use charnames ();
my $name = charnames::viacode(0x03A3);
=Get character number by name
use charnames ();
my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");
=Unicode named characters
use charnames qw(:full :short greek);
"\N{MATHEMATICAL ITALIC SMALL N}"
"\N{GREEK CAPITAL LETTER SIGMA}"
"\N{Greek:Sigma}"
"\N{epsilon}"
=Unicode named sequences
use charnames qw(:full);
my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
printf "U+%v04X\n", $seq;
U+0100.0300
=Custom named characters
use charnames ":full", ":alias" => {
ecute => "LATIN SMALL LETTER E WITH ACUTE",
"APPLE LOGO" => 0xF8FF, # private use character
};
"\N{ecute}"
"\N{APPLE LOGO}"
=Declare source in utf8 for identifiers and literals
use utf8;
my $measure = "Ångström";
my @μsoft = qw( cp852 cp1251 cp1252 );
my @ὑπέρμεγας = qw( ὑπέρ μεγας );
my @鯉 = qw( koi8-f koi8-u koi8-r );
=Unicode casing
uc("henry ⅷ") # "HENRY Ⅷ"
uc("tschüß") # "TSCHÜSS"
# both are true:
"tschüß" =~ /TSCHÜSS/i
"Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i
=Unicode case-insensitive comparisons
use utf8;
use feature "fc"; # fc() function is from v5.16
# sort case-insensitively
my @sorted = sort { fc($a) cmp fc($b) } @list;
# both are true:
fc("tschüß") eq fc("TSCHÜSS")
fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")
=Match Unicode linebreak sequence in regex
\R
s/\R/\n/g; # normalize all linebreaks to \n
=Match Unicode properties in regex with \p, \P
\pL, \pN, \pS, \pP, \pM, \pZ, \pC
\p{Sk}, \p{Ps}, \p{Lt}
\p{alpha}, \p{upper}, \p{lower}
\p{Latin}, \p{Greek}
\p{script=Latin}, \p{script=Greek}
\p{East_Asian_Width=Wide}, \p{EA=W}
\p{Line_Break=Hyphen}, \p{LB=HY}
\p{Numeric_Value=4}, \p{NV=4}
=Custom character properties
# using private-use characters
sub In_Tengwar { "E000\tE07F\n" }
if (/\p{In_Tengwar}/) { ... }
# blending existing properties
sub Is_GraecoRoman_Title {<<'END_OF_SET'}
+utf8::IsLatin
+utf8::IsGreek
&utf8::IsTitle
END_OF_SET
if (/\p{Is_GraecoRoman_Title}/) { ... }
=Get character category
use Unicode::UCD qw(charinfo);
my $cat = charinfo(0x3A3)->{category}; # "Lu"
=Convert non-ASCII Unicode numerics
# from v5.12
use Unicode::UCD qw(num);
if (/(\d+|\N)/) { # not just ASCII!
$nv = num($1);
}
use charnames qw(:full);
my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
=Match Unicode grapheme cluster in regex
\X
# match and grab five first graphemes
my($first_five) = /^(\X{5})/;
# Find vowel plus any diacritics
use Unicode::Normalize;
my $nfd = NFD($orig);
$nfd =~ /(?=[aeiou])\X/i
=Reverse string by grapheme
$str = join("", reverse $str =~ /\X/g);
# OR: cpan -i Unicode::GCString
use Unicode::GCString;
$str = reverse Unicode::GCString->new($str);
=String length in graphemes
my $count = 0;
while ($str =~ /\X/g) { $count++ }
# OR: cpan -i Unicode::GCString
use Unicode::GCString;
$gcs = Unicode::GCString->new($str);
my $count = $gcs->length;
=Substring by grapheme
# cpan -i Unicode::GCString
use Unicode::GCString;
$gcs = Unicode::GCString->new($str);
my $piece = $gcs->substr(5, 5);
=Unicode column-width for printing
# cpan -i Unicode::GCString
use Unicode::GCString;
$gcs = Unicode::GCString->new($str);
my $cols = $gcs->columns;
printf "%*s\n", $cols, $str;
=Unicode normalization
use Unicode::Normalize;
my $nfd = NFD($orig);
my $nfc = NFC($orig);
my $nfkd = NFKD($orig);
my $nfkc = NFKC($orig);
=Unicode collation
use Unicode::Collate;
my $col = Unicode::Collate->new();
my @list = $col->sort(@old_list);
=Case- *and* accent-insensitive Unicode sort
use Unicode::Collate;
my $col = Unicode::Collate->new(level => 1);
my @list = $col->sort(@old_list);
=Unicode locale collation
# either use v5.12, OR: cpan -i Unicode::Collate::Locale
use Unicode::Collate::Locale;
my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
my @list = $col->sort(@old_list);
=Case- *and* accent-insensitive comparisons
use utf8;
use Unicode::Collate;
my $coll = Unicode::Collate->new(
level => 1,
normalization => undef
);
# now both are true:
$coll->eq("García", "GARCIA" );
$coll->eq("Márquez", "MARQUEZ");
=Unicode linebreaking
# cpan -i Unicode::LineBreak
use Unicode::LineBreak;
use charnames qw(:full);
my $para = "This is a super\N{HYPHEN}long string. " x 20;
my $fmt = Unicode::LineBreak->new;
print $fmt->break($para), "\n";
=Declare std streams to be utf8
$ perl -CS ...
or
$ export PERL_UNICODE=S
or
use open qw(:std :utf8);
or
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
=Make I/O default to utf8
$ perl -CSD ...
or
$ export PERL_UNICODE=SD
or
use open qw(:std :utf8);
=Open file with implicit encode/decode
# input file
open(my $in_file, "< :encoding(UTF-16)", "wintext");
OR
open(my $in_file, "<", "wintext");
binmode($in_file, ":encoding(UTF-16)");
THEN
my $line = <$in_file>;
# output file
open(my $out_file, "> :encoding(cp1252)", "wintext");
OR
open(my $out_file, ">", "wintext");
binmode($out_file, ":encoding(cp1252)");
THEN
print $out_file "some text\n";
=Explicit encode/decode [rarely needed, see previous]
use Encode qw(encode decode);
my $chars = decode("shiftjis", $bytes);
OR
my $bytes = encode("MIME-Header-ISO_2022_JP", $chars);
Thread Next
-
Unicode cheatsheet for Perl
by Tom Christiansen