#! /usr/bin/perl use strict; my $USAGE = <<__EOF__; usage: mkutable [-n] [-f#] type... [--] [<] UnicodeData.txt -n = take non-matching types -f = zero-based type field (default 2) __EOF__ use vars qw( $opt_f $opt_n ); use Getopt::Std; my $type_field = 2; # Override Unicode tables for certain control chars # that are expected to be found in normal text files. my %force_space = ( 0x08 => 1, # backspace 0x09 => 1, # tab 0x0a => 1, # newline 0x0c => 1, # form feed 0x0d => 1, # carriage return ); exit (main() ? 0 : 1); sub main { my $date = `date`; chomp $date; my $args = join ' ', @ARGV; my $header = "/* Generated by \"$0 $args\" on $date */\n"; die $USAGE if not getopts('f:n'); $type_field = $opt_f if $opt_f; my %types; my $arg; while ($arg = shift @ARGV) { last if $arg eq '--'; $types{$arg} = 1; } my %out = ( 'types' => \%types ); print $header; my $last_code = 0; while (<>) { chomp; s/#.*//; my @fields = split /;/; next if not @fields; my ($lo_code, $hi_code); my $codes = $fields[0]; if ($codes =~ /(\w+)\.\.(\w+)/) { $lo_code = hex $1; $hi_code = hex $2; } else { $lo_code = $hi_code = hex $fields[0]; } my $type = $fields[$type_field]; $type =~ s/\s//g; for ($last_code = $lo_code; $last_code <= $hi_code; ++$last_code) { $type = 'Zs' if $force_space{$last_code}; output(\%out, $last_code, $type); } } output(\%out, $last_code); return 1; } sub output { my ($out, $code, $type) = @_; my $type_ok = ($type and ${${$out}{types}}{$type}); $type_ok = not $type_ok if $opt_n; my $prev_code = $$out{prev_code}; if (not $type_ok) { end_run($out, $prev_code); } elsif (not $$out{in_run} or $type ne $$out{run_type} or $code != $prev_code+1) { end_run($out, $prev_code); start_run($out, $code, $type); } $$out{prev_code} = $code; } sub start_run { my ($out, $code, $type) = @_; $$out{start_code} = $code; $$out{prev_code} = $code; $$out{run_type} = $type; $$out{in_run} = 1; } sub end_run { my ($out, $code) = @_; return if not $$out{in_run}; printf "\t{ 0x%04x, 0x%04x }, /* %s */\n", $$out{start_code}, $code, $$out{run_type}; $$out{in_run} = 0; }