Unicode cheatsheet for Perl

Front page | perl.perl5.porters | Postings from February 2012

Unicode cheatsheet for Perl

Thread Next

From:

Tom Christiansen

Date:

February 20, 2012 09:28

Subject:

Unicode cheatsheet for Perl

Message ID:

3786.1329758853@chthon

Inspired by how scandalously Unicode-deficient the
otherwise fine 4-way polyglot table comparing PHP, Perl,
Python, and Ruby is at 

    http://hyperpolyglot.org/scripting

I created a quick Unicode cheatsheet for Perl, mostly by 
mining the examples in the new 4th edition of the Camel.

Gee, I foresee a *whole* lot of "impossibles" in the 
other three languages' columns, don't you? :)

Hm, have I left anything out that Perl is especially cool with?

I almost wonder whether this sort of thing oughtn't be a manpage,
something like perluni{ref,cheat,quick}?

--tom

=Characters and their numbers

    # ASCII
    ord("A")
    chr(65)

    # BMP
    ord("Σ")
    chr(0x3A3)

    # beyond the BMP
    ord("𝑛")
    chr(0x1D45B)

    # beyond Unicode (up to MAXINT)
    ord("\x{20_0000}")
    chr(0x20_0000)

=Unicode literals by character number

    String: "\x{3a3}"    
    Regex:  /\x{3a3}/

    String: "\x{1d45b}"  
    Regex:  /\x{1d45b}/  

    # even non-BMP ranges in regex work fine
    /[\x{1D434}-\x{1D467}]/ 

=Get character name by number

    use charnames ();
    my $name = charnames::viacode(0x03A3); 

=Get character number by name

    use charnames ();
    my $number = charnames::vianame("GREEK CAPITAL LETTER SIGMA");

=Unicode named characters

    use charnames qw(:full :short greek);

    "\N{MATHEMATICAL ITALIC SMALL N}"
    "\N{GREEK CAPITAL LETTER SIGMA}"
    "\N{Greek:Sigma}"
    "\N{epsilon}"

=Unicode named sequences

    use charnames qw(:full);
    my $seq = "\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}";
    printf "U+%v04X\n", $seq;
    U+0100.0300

=Custom named characters

    use charnames ":full", ":alias" => {
        ecute => "LATIN SMALL LETTER E WITH ACUTE",
        "APPLE LOGO" => 0xF8FF, # private use character
    };

    "\N{ecute}"
    "\N{APPLE LOGO}"

=Declare source in utf8 for identifiers and literals

    use utf8;

    my $measure   = "Ångström";
    my @μsoft     = qw( cp852 cp1251 cp1252 );
    my @ὑπέρμεγας = qw( ὑπέρ μεγας );
    my @鯉        = qw( koi8-f koi8-u koi8-r );

=Unicode casing

    uc("henry ⅷ")  # "HENRY Ⅷ"
    uc("tschüß")   # "TSCHÜSS"  (note ß => SS)

    # both are true:
    "tschüß"  =~ /TSCHÜSS/i    # ß matches SS under case folding
    "Σίσυφος" =~ /ΣΊΣΥΦΟΣ/i    # Σ, σ and ς all fold together

=Unicode case-insensitive comparisons

    use utf8;
    use feature "fc"; # fc() function is from v5.16

    # sort case-insensitively 
    my @sorted = sort { fc($a) cmp fc($b) } @list;

    # both are true:
    fc("tschüß")  eq fc("TSCHÜSS")
    fc("Σίσυφος") eq fc("ΣΊΣΥΦΟΣ")

=Match Unicode linebreak sequence in regex

    \R

    s/\R/\n/g;  # normalize all linebreaks to \n

=Match Unicode properties in regex with \p, \P

    \pL, \pN, \pS, \pP, \pM, \pZ, \pC
    \p{Sk}, \p{Ps}, \p{Lt}
    \p{alpha}, \p{upper}, \p{lower} 
    \p{Latin}, \p{Greek}
    \p{script=Latin}, \p{script=Greek}
    \p{East_Asian_Width=Wide}, \p{EA=W}
    \p{Line_Break=Hyphen}, \p{LB=HY}
    \p{Numeric_Value=4}, \p{NV=4}

=Custom character properties

    # using private-use characters
    sub In_Tengwar { "E000\tE07F\n" } 

    if (/\p{In_Tengwar}/) { ... }

    # blending existing properties
    sub Is_GraecoRoman_Title {<<'END_OF_SET'}
    +utf8::IsLatin
    +utf8::IsGreek
    &utf8::IsTitle
    END_OF_SET

    if (/\p{Is_GraecoRoman_Title}/) { ... }

=Get character category

    use Unicode::UCD qw(charinfo);
    my $cat = charinfo(0x3A3)->{category};  # "Lu"

=Convert non-ASCII Unicode numerics

    # from v5.12
    use Unicode::UCD qw(num);
    if (/(\d+|\N)/) {  # not just ASCII!
        $nv = num($1);
    }

    use charnames qw(:full);
    my $nv = num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");

=Match Unicode grapheme cluster in regex

    \X

    # match and grab five first graphemes
    my($first_five) = /^(\X{5})/;

    # Find vowel plus any diacritics
    use Unicode::Normalize;
    my $nfd = NFD($orig);
    $nfd =~ /(?=[aeiou])\X/i

=Reverse string by grapheme

    $str = join("", reverse $str =~ /\X/g);

    # OR: cpan -i Unicode::GCString
    use Unicode::GCString;
    $str = reverse Unicode::GCString->new($str);

=String length in graphemes

    my $count = 0;
    while ($str =~ /\X/g) { $count++ }

  # OR: cpan -i Unicode::GCString
    use Unicode::GCString;
    $gcs = Unicode::GCString->new($str);
    my $count = $gcs->length;

=Substring by grapheme

  # cpan -i Unicode::GCString
    use Unicode::GCString;
    $gcs = Unicode::GCString->new($str);
    my $piece = $gcs->substr(5, 5);

=Unicode column-width for printing

  # cpan -i Unicode::GCString
    use Unicode::GCString;
    $gcs = Unicode::GCString->new($str);
    my $cols = $gcs->columns;
    printf "%*s\n", $cols, $str;

=Unicode normalization

    use Unicode::Normalize;
    my $nfd  = NFD($orig);
    my $nfc  = NFC($orig);
    my $nfkd = NFKD($orig);
    my $nfkc = NFKC($orig);

=Unicode collation

    use Unicode::Collate;
    my $col = Unicode::Collate->new();
    my @list = $col->sort(@old_list);

=Case- *and* accent-insensitive Unicode sort

    use Unicode::Collate;
    my $col = Unicode::Collate->new(level => 1);
    my @list = $col->sort(@old_list);

=Unicode locale collation

    # either use v5.12, OR: cpan -i Unicode::Collate::Locale
    use Unicode::Collate::Locale;
    my $col = Unicode::Collate::Locale->new(locale => "de__phonebook");
    my @list = $col->sort(@old_list);

=Case- *and* accent-insensitive comparisons

    use utf8;
    use Unicode::Collate;
    my $coll = Unicode::Collate->new(
                level => 1, 
                normalization => undef
    );

  # now both are true:
    $coll->eq("García",  "GARCIA" );
    $coll->eq("Márquez", "MARQUEZ");

=Unicode linebreaking

    # cpan -i Unicode::LineBreak 
    use Unicode::LineBreak;
    use charnames qw(:full);

    my $para = "This is a super\N{HYPHEN}long string. " x 20;
    my $fmt = Unicode::LineBreak->new;
    print $fmt->break($para), "\n";

=Declare std streams to be utf8

        $ perl -CS ...
    or
        $ export PERL_UNICODE=S
    or 
        use open qw(:std :utf8);
    or
        binmode(STDIN,  ":utf8");
        binmode(STDOUT, ":utf8");
        binmode(STDERR, ":utf8");

=Make I/O default to utf8

        $ perl -CSD ...
    or
        $ export PERL_UNICODE=SD
    or 
        use open qw(:std :utf8);

=Open file with implicit encode/decode 

    # input file
        open(my $in_file, "< :encoding(UTF-16)", "wintext");
    OR
        open(my $in_file, "<", "wintext");
        binmode($in_file, ":encoding(UTF-16)");
    THEN 
        my $line = <$in_file>;

    # output file
        open(my $out_file, "> :encoding(cp1252)", "wintext");
    OR
        open(my $out_file, ">", "wintext");
        binmode($out_file, ":encoding(cp1252)");
    THEN
        print $out_file "some text\n";

=Explicit encode/decode  [rarely needed, see previous]

    use Encode qw(encode decode);

        my $chars = decode("shiftjis", $bytes);
    OR
        my $bytes = encode("MIME-Header-ISO_2022_JP", $chars);

Thread Next