tools/tools/locale/tools/utf8-rollup.pl

   1 #!/usr/local/bin/perl -wC
   2
   3 # SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   4 #
   5 # Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
   6 # Copyright 2015 John Marino <draco@marino.st>
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions
  10 # are met:
  11 # 1. Redistributions of source code must retain the above copyright
  12 #    notice, this list of conditions and the following disclaimer.
  13 # 2. Redistributions in binary form must reproduce the above copyright
  14 #    notice, this list of conditions and the following disclaimer in the
  15 #    documentation and/or other materials provided with the distribution.
  16 #
  17 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20 # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27 # SUCH DAMAGE.
  28 #
  29 # $FreeBSD$
  30
  31 use strict;
  32 use Getopt::Long;
  33 use Encode qw(encode decode);
  34
  35 if ($#ARGV != 0) {
  36         print "Usage: $0 --unidir=<unidir>\n";
  37         exit(1);
  38 }
  39
  40 my $UNIDIR = undef;
  41
  42 my $result = GetOptions (
  43                 "unidir=s"      => \$UNIDIR
  44             );
  45
  46 my %utf8map = ();
  47 my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
  48
  49 get_utf8map("$UNIDIR/posix/UTF-8.cm");
  50 generate_header ();
  51 parse_unidata ("$UNIDIR/UnicodeData.txt");
  52 generate_footer ();
  53
  54 ############################
  55
  56 sub utf8to32 {
  57         my @kl = split /\\x/, $_[0];
  58
  59         shift @kl if ($kl[0] eq '');
  60         my $k = pack('H2' x scalar @kl, @kl);
  61         my $ux = encode('UTF-32BE', decode('UTF-8', $k));
  62         my $u = uc(unpack('H*', $ux));
  63         # Remove BOM
  64         $u =~ s/^0000FEFF//;
  65         # Remove heading bytes of 0
  66         while ($u =~ m/^0/ and length($u) > 4) {
  67                 $u =~ s/^0//;
  68         }
  69
  70         return $u;
  71 }
  72
  73 sub get_utf8map {
  74         my $file = shift;
  75
  76         open(FIN, $file);
  77         my @lines = <FIN>;
  78         close(FIN);
  79         chomp(@lines);
  80
  81         my $incharmap = 0;
  82         foreach my $l (@lines) {
  83                 $l =~ s/\r//;
  84                 next if ($l =~ /^\#/);
  85                 next if ($l eq "");
  86
  87                 if ($l eq "CHARMAP") {
  88                         $incharmap = 1;
  89                         next;
  90                 }
  91
  92                 next if (!$incharmap);
  93                 last if ($l eq "END CHARMAP");
  94
  95                 $l =~ /^(<[^\s]+>)\s+(.*)/;
  96                 my $k = utf8to32($2);   # UTF-8 char code
  97                 my $v = $1;
  98
  99 #               print STDERR "register: $k - $v\n";
 100                 $utf8map{$k} = $v;
 101         }
 102 }
 103
 104 sub generate_header {
 105         open(FOUT, ">", "$outfilename")
 106                 or die ("can't write to $outfilename\n");
 107         print FOUT <<EOF;
 108 # Warning: Do not edit. This file is automatically generated from the
 109 # tools in /usr/src/tools/tools/locale. The data is obtained from the
 110 # CLDR project, obtained from http://cldr.unicode.org/
 111 # -----------------------------------------------------------------------------
 112
 113 comment_char *
 114 escape_char /
 115
 116 LC_CTYPE
 117 EOF
 118 }
 119
 120 sub generate_footer {
 121         print FOUT "\nEND LC_CTYPE\n";
 122         close (FOUT);
 123 }
 124
 125 sub wctomb {
 126         my $wc = hex(shift);
 127         my $lead;
 128         my $len;
 129         my $ret = "";
 130         my $i;
 131
 132         if (($wc & ~0x7f) == 0) {
 133                 return sprintf "%02X", $wc;
 134         } elsif (($wc & ~0x7ff) == 0) {
 135                 $lead = 0xc0;
 136                 $len = 2;
 137         } elsif (($wc & ~0xffff) == 0) {
 138                 $lead = 0xe0;
 139                 $len = 3;
 140         } elsif ($wc >= 0 && $wc <= 0x10ffff) {
 141                 $lead = 0xf0;
 142                 $len = 4;
 143         }
 144
 145         for ($i = $len - 1; $i > 0; $i--) {
 146                 $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
 147                 $wc >>= 6;
 148         }
 149         $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
 150
 151         return $ret;
 152 }
 153
 154 sub parse_unidata {
 155         my $file = shift;
 156         my %data = ();
 157
 158         open(FIN, $file);
 159         my @lines = <FIN>;
 160         close(FIN);
 161         chomp(@lines);
 162
 163         foreach my $l (@lines) {
 164                 my @d = split(/;/, $l, -1);
 165                 my $mb = $d[0];
 166                 my $cat;
 167
 168                 # XXX There are code points present in UnicodeData.txt
 169                 # and missing from UTF-8.cm
 170                 next if !defined $utf8map{$mb};
 171
 172                 # Define the category
 173                 if ($d[2] =~ /^Lu/) {
 174                         $cat = "upper";
 175                 } elsif ($d[2] =~ /^Ll/) {
 176                         $cat = "lower";
 177                 } elsif ($d[2] =~ /^Nd/) {
 178                         $cat = "digit";
 179                 } elsif ($d[2] =~ /^L/) {
 180                         $cat = "alpha";
 181                 } elsif ($d[2] =~ /^P/) {
 182                         $cat = "punct";
 183                 } elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ ||
 184                     $d[2] =~ /^S/) {
 185                         $cat = "graph";
 186                 } elsif ($d[2] =~ /^C/) {
 187                         $cat = "cntrl";
 188                 } elsif ($d[2] =~ /^Z/) {
 189                         $cat = "space";
 190                 }
 191                 $data{$cat}{$mb}{'wc'} = $d[0];
 192
 193                 # Check if it's a start or end of range
 194                 if ($d[1] =~ /First>$/) {
 195                         $data{$cat}{$mb}{'start'} = 1;
 196                 } elsif ($d[1] =~ /Last>$/) {
 197                         $data{$cat}{$mb}{'end'} = 1;
 198                 }
 199
 200                 # Check if there's upper/lower mapping
 201                 if ($d[12] ne "") {
 202                         $data{'toupper'}{$mb} = $d[12];
 203                 } elsif ($d[13] ne "") {
 204                         $data{'tolower'}{$mb} = $d[13];
 205                 }
 206         }
 207
 208         my $first;
 209         my $inrange = 0;
 210
 211         # Now write out the categories
 212         foreach my $cat (sort keys (%data)) {
 213                 print FOUT "$cat\t";
 214                 $first = 1;
 215         foreach my $mb (sort {hex($a) <=> hex($b)} keys (%{$data{$cat}})) {
 216                 if ($first == 1) {
 217                         $first = 0;
 218                 } elsif ($inrange == 1) {
 219                         # Safety belt
 220                         die "broken range end wc=$data{$cat}{$mb}{'wc'}"
 221                             if !defined $data{$cat}{$mb}{'end'};
 222                         print FOUT ";...;";
 223                         $inrange = 0;
 224                 } else {
 225                         print FOUT ";/\n\t";
 226                 }
 227
 228                 if ($cat eq "tolower" || $cat eq "toupper") {
 229                         print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
 230                 } else {
 231                         if (defined($data{$cat}{$mb}{'start'})) {
 232                                 $inrange = 1;
 233                         }
 234                         print FOUT "$utf8map{$mb}";
 235                 }
 236         }
 237                 print FOUT "\n";
 238         }
 239 }