Character Encodings

Peter Karman

https://svn01.publicradio.org/sw/tech-talks/character-encodings

0. Intro

1. Just Enough History

2. Just Enough History, cont

2. Defense against the Dark Arts

3. Defense: Perl

#!/usr/bin/env perl
#
# test the UTF8-ness of a file

use strict;
use warnings;
use Search::Tools::UTF8;
use Search::Tools;

# At least one file name is required on the command line.
die "usage: $0 file\n" unless @ARGV;

# Each entry pairs a printf format with the check whose result fills it in.
# NOTE(review): the trailing 1 handed to is_sane_utf8 presumably switches on
# its diagnostics -- confirm against the Search::Tools::UTF8 documentation.
my @checks = (
    [ "is_flagged_utf8=%d\n",     sub { is_flagged_utf8( $_[0] ) } ],
    [ "is_valid_utf8=%d\n",       sub { is_valid_utf8( $_[0] ) } ],
    [ "is_perl_utf8_string=%d\n", sub { is_perl_utf8_string( $_[0] ) } ],
    [ "is_sane_utf8=%d\n",        sub { is_sane_utf8( $_[0], 1 ) } ],
);

# Print one "name=value" line per check for the given text, in table order.
my $report = sub {
    my ($text) = @_;
    foreach my $check (@checks) {
        my ( $format, $probe ) = @{$check};

        # the probe is called in list context, directly as a printf argument
        printf( $format, $probe->($text) );
    }
    return;
};

# For every file: report on the raw contents, convert with to_utf8(), then
# report again on the converted contents.
foreach my $path (@ARGV) {
    my $content = Search::Tools->slurp($path);
    $report->($content);

    print " ... running through to_utf8(), testing again.\n";
    $content = to_utf8($content);
    $report->($content);
}

4. Defense: Perl, cont

#!/usr/bin/env perl
#
# print chart of chars with their decimal and hex code point values
# printable ASCII (33-126) is always shown, followed by latin1 (161-255) by default
# otherwise, specify start/stop numerals at cmd line
#

use strict;
use warnings;
# Print a chart of characters next to their decimal and hexadecimal code
# point numbers, $NUM_COLS entries per row.
#
# Printable ASCII (33 .. 126) is always listed first, followed by the range
# $start .. $stop. That range defaults to 161 .. 255 and can be overridden
# with the first two command-line arguments.
my $NUM_COLS = 3;

# the :utf8 layer makes code points above 127 come out UTF-8 encoded
binmode STDOUT, ':utf8';

# Take the next command-line argument, or $default when the argument is
# absent or empty. An explicit 0 is kept: a plain `shift @ARGV || $default`
# treats 0 as "not given" and silently substitutes the default.
sub _next_arg_or {
    my ($default) = @_;
    my $arg = shift @ARGV;
    return $default if !defined $arg || $arg eq '';
    return $arg;
}

my $start = _next_arg_or(161);
my $stop  = _next_arg_or(255);

# leading pad so the first row lines up with the rows that follow
print '   ';

my $c = 0;    # entries printed so far on the current row
for my $code_point ( 33 .. 126, $start .. $stop ) {
    printf( "%05d  %c  0x%05x     ", $code_point, $code_point, $code_point );

    # row is full: break the line and indent the next one
    if ( ++$c == $NUM_COLS ) {
        print "\n   ";
        $c = 0;
    }
}
print "\n";

5. pin.org example

    # clean up some encoding problems
    # Both patterns match a fixed two-byte sequence: \xc2\xa0 is the UTF-8
    # encoding of U+00A0 (no-break space) and \xc2\xbb is the UTF-8 encoding
    # of U+00BB (right-pointing double angle quotation mark).
    # NOTE(review): this presumably expects $buf to hold undecoded bytes
    # rather than decoded characters -- confirm against where $buf is read.
    $buf =~ s/\xc2\xa0/ /g;
    # NOTE(review): what the literal in the replacement turns into depends on
    # the encoding this source file is saved in and on whether `use utf8` is
    # in effect -- confirm the intended result.
    $buf =~ s/\xc2\xbb/»/g;