tools/tools/locale/tools/cldr2def.pl

   1 #!/usr/local/bin/perl -wC
   2
   3 # SPDX-License-Identifier: BSD-2-Clause-FreeBSD
   4 #
   5 # Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
   6 # Copyright 2015 John Marino <draco@marino.st>
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions
  10 # are met:
  11 # 1. Redistributions of source code must retain the above copyright
  12 #    notice, this list of conditions and the following disclaimer.
  13 # 2. Redistributions in binary form must reproduce the above copyright
  14 #    notice, this list of conditions and the following disclaimer in the
  15 #    documentation and/or other materials provided with the distribution.
  16 #
  17 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20 # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27 # SUCH DAMAGE.
  28 #
  29 # $FreeBSD$
  30
  31 use strict;
  32 use File::Copy;
  33 use XML::Parser;
  34 use Tie::IxHash;
  35 use Text::Iconv;
  36 #use Data::Dumper;
  37 use Getopt::Long;
  38 use Digest::SHA qw(sha1_hex);
  39 require "charmaps.pm";
  40
  41
  42 if ($#ARGV < 2) {
  43         print "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";
  44         exit(1);
  45 }
  46
  47 my $DEFENCODING = "UTF-8";
  48
  49 my $UNIDIR = undef;
  50 my $ETCDIR = undef;
  51 my $TYPE = undef;
  52
  53 my $result = GetOptions (
  54                 "unidir=s"      => \$UNIDIR,
  55                 "etc=s"         => \$ETCDIR,
  56                 "type=s"        => \$TYPE,
  57             );
  58
  59 my %convertors = ();
  60
  61 my %ucd = ();
  62 my %values = ();
  63 my %hashtable = ();
  64 my %languages = ();
  65 my %translations = ();
  66 my %encodings = ();
  67 my %alternativemonths = ();
  68 get_languages();
  69
  70 my %utf8map = ();
  71 my %utf8aliases = ();
  72 get_unidata($UNIDIR);
  73 get_utf8map("$UNIDIR/posix/$DEFENCODING.cm");
  74 get_encodings("$ETCDIR/charmaps");
  75
  76 my %keys = ();
  77 tie(%keys, "Tie::IxHash");
  78 tie(%hashtable, "Tie::IxHash");
  79
  80 my %FILESNAMES = (
  81         "monetdef"      => "LC_MONETARY",
  82         "timedef"       => "LC_TIME",
  83         "msgdef"        => "LC_MESSAGES",
  84         "numericdef"    => "LC_NUMERIC",
  85         "colldef"       => "LC_COLLATE",
  86         "ctypedef"      => "LC_CTYPE"
  87 );
  88
  89 my %callback = (
  90         mdorder => \&callback_mdorder,
  91         altmon => \&callback_altmon,
  92         cformat => \&callback_cformat,
  93         dformat => \&callback_dformat,
  94         dtformat => \&callback_dtformat,
  95         cbabmon => \&callback_abmon,
  96         cbampm => \&callback_ampm,
  97         data => undef,
  98 );
  99
 100 my %DESC = (
 101
 102         # numericdef
 103         "decimal_point" => "decimal_point",
 104         "thousands_sep" => "thousands_sep",
 105         "grouping"      => "grouping",
 106
 107         # monetdef
 108         "int_curr_symbol"       => "int_curr_symbol (last character always " .
 109                                    "SPACE)",
 110         "currency_symbol"       => "currency_symbol",
 111         "mon_decimal_point"     => "mon_decimal_point",
 112         "mon_thousands_sep"     => "mon_thousands_sep",
 113         "mon_grouping"          => "mon_grouping",
 114         "positive_sign"         => "positive_sign",
 115         "negative_sign"         => "negative_sign",
 116         "int_frac_digits"       => "int_frac_digits",
 117         "frac_digits"           => "frac_digits",
 118         "p_cs_precedes"         => "p_cs_precedes",
 119         "p_sep_by_space"        => "p_sep_by_space",
 120         "n_cs_precedes"         => "n_cs_precedes",
 121         "n_sep_by_space"        => "n_sep_by_space",
 122         "p_sign_posn"           => "p_sign_posn",
 123         "n_sign_posn"           => "n_sign_posn",
 124
 125         # msgdef
 126         "yesexpr"       => "yesexpr",
 127         "noexpr"        => "noexpr",
 128         "yesstr"        => "yesstr",
 129         "nostr"         => "nostr",
 130
 131         # timedef
 132         "abmon"         => "Short month names",
 133         "mon"           => "Long month names (as in a date)",
 134         "abday"         => "Short weekday names",
 135         "day"           => "Long weekday names",
 136         "t_fmt"         => "X_fmt",
 137         "d_fmt"         => "x_fmt",
 138         "c_fmt"         => "c_fmt",
 139         "am_pm"         => "AM/PM",
 140         "d_t_fmt"       => "date_fmt",
 141         "altmon"        => "Long month names (without case ending)",
 142         "md_order"      => "md_order",
 143         "t_fmt_ampm"    => "ampm_fmt",
 144 );
 145
 146 if ($TYPE eq "colldef") {
 147         transform_collation();
 148         make_makefile();
 149 }
 150
 151 if ($TYPE eq "ctypedef") {
 152         transform_ctypes();
 153         make_makefile();
 154 }
 155
 156 if ($TYPE eq "numericdef") {
 157         %keys = (
 158             "decimal_point"     => "s",
 159             "thousands_sep"     => "s",
 160             "grouping"          => "ai",
 161         );
 162         get_fields();
 163         print_fields();
 164         make_makefile();
 165 }
 166
 167 if ($TYPE eq "monetdef") {
 168         %keys = (
 169             "int_curr_symbol"   => "s",
 170             "currency_symbol"   => "s",
 171             "mon_decimal_point" => "s",
 172             "mon_thousands_sep" => "s",
 173             "mon_grouping"      => "ai",
 174             "positive_sign"     => "s",
 175             "negative_sign"     => "s",
 176             "int_frac_digits"   => "i",
 177             "frac_digits"       => "i",
 178             "p_cs_precedes"     => "i",
 179             "p_sep_by_space"    => "i",
 180             "n_cs_precedes"     => "i",
 181             "n_sep_by_space"    => "i",
 182             "p_sign_posn"       => "i",
 183             "n_sign_posn"       => "i"
 184         );
 185         get_fields();
 186         print_fields();
 187         make_makefile();
 188 }
 189
 190 if ($TYPE eq "msgdef") {
 191         %keys = (
 192             "yesexpr"           => "s",
 193             "noexpr"            => "s",
 194             "yesstr"            => "s",
 195             "nostr"             => "s"
 196         );
 197         get_fields();
 198         print_fields();
 199         make_makefile();
 200 }
 201
 202 if ($TYPE eq "timedef") {
 203         %keys = (
 204             "abmon"             => "<cbabmon<abmon<as",
 205             "mon"               => "as",
 206             "abday"             => "as",
 207             "day"               => "as",
 208             "t_fmt"             => "s",
 209             "d_fmt"             => "<dformat<d_fmt<s",
 210             "c_fmt"             => "<cformat<d_t_fmt<s",
 211             "am_pm"             => "<cbampm<am_pm<as",
 212             "d_t_fmt"           => "<dtformat<d_t_fmt<s",
 213             "altmon"            => "<altmon<mon<as",
 214             "md_order"          => "<mdorder<d_fmt<s",
 215             "t_fmt_ampm"        => "s",
 216         );
 217         get_fields();
 218         print_fields();
 219         make_makefile();
 220 }
 221
 222 sub callback_ampm {
 223         my $s = shift;
 224         my $nl = $callback{data}{l} . "_" . $callback{data}{c};
 225         my $enc = $callback{data}{e};
 226
 227         if ($nl eq 'ru_RU') {
 228                 if ($enc eq 'UTF-8') {
 229                         $s = 'дп;пп';
 230                 } else {
 231                         my  $converter = Text::Iconv->new("utf-8", "$enc");
 232                         $s = $converter->convert("дп;пп");
 233                 }
 234         }
 235         return $s;
 236 }
 237
 238 sub callback_cformat {
 239         my $s = shift;
 240         my $nl = $callback{data}{l} . "_" . $callback{data}{c};
 241
 242         if ($nl eq 'ko_KR') {
 243                 $s =~ s/(> )(%p)/$1%A $2/;
 244         }
 245         $s =~ s/\.,/\./;
 246         $s =~ s/ %Z//;
 247         $s =~ s/ %z//;
 248         $s =~ s/^"%e\./%A %e/;
 249         $s =~ s/^"(%B %e, )/"%A, $1/;
 250         $s =~ s/^"(%e %B )/"%A $1/;
 251         return $s;
 252 };
 253
 254 sub callback_dformat {
 255         my $s = shift;
 256
 257         $s =~ s/(%m(<SOLIDUS>|[-.]))%e/$1%d/;
 258         $s =~ s/%e((<SOLIDUS>|[-.])%m)/%d$1/;
 259         return $s;
 260 };
 261
 262 sub callback_dtformat {
 263         my $s = shift;
 264         my $nl = $callback{data}{l} . "_" . $callback{data}{c};
 265
 266         if ($nl eq 'ja_JP') {
 267                 $s =~ s/(> )(%H)/$1%A $2/;
 268         } elsif ($nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_TW') {
 269                 if ($nl ne 'ko_KR') {
 270                         $s =~ s/%m/%_m/;
 271                 }
 272                 $s =~ s/(> )(%p)/$1%A $2/;
 273         }
 274         $s =~ s/\.,/\./;
 275         $s =~ s/^"%e\./%A %e/;
 276         $s =~ s/^"(%B %e, )/"%A, $1/;
 277         $s =~ s/^"(%e %B )/"%A $1/;
 278         return $s;
 279 };
 280
 281 sub callback_mdorder {
 282         my $s = shift;
 283         return undef if (!defined $s);
 284         $s =~ s/[^dem]//g;
 285         $s =~ s/e/d/g;
 286         return $s;
 287 };
 288
 289 sub callback_altmon {
 290         # if the language/country is known in %alternative months then
 291         # return that, otherwise repeat mon
 292         my $s = shift;
 293
 294         if (defined $alternativemonths{$callback{data}{l}}{$callback{data}{c}}) {
 295                 my @altnames = split(";",$alternativemonths{$callback{data}{l}}{$callback{data}{c}});
 296                 my @cleaned;
 297                 foreach (@altnames)
 298                 {
 299                         $_ =~ s/^\s+//;
 300                         $_ =~ s/\s+$//;
 301                         push @cleaned, $_;
 302                 }
 303                 return join(";",@cleaned);
 304         }
 305
 306         return $s;
 307 }
 308
 309 sub callback_abmon {
 310         # for specified CJK locales, pad result with a space to enable
 311         # columns to line up (style established in FreeBSD in 2001)
 312         my $s = shift;
 313         my $nl = $callback{data}{l} . "_" . $callback{data}{c};
 314
 315         if ($nl eq 'ja_JP' || $nl eq 'ko_KR' || $nl eq 'zh_CN' ||
 316             $nl eq 'zh_HK' || $nl eq 'zh_TW') {
 317                 my @monthnames = split(";", $s);
 318                 my @cleaned;
 319                 foreach (@monthnames)
 320                 {
 321                         if ($_ =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ ||
 322                            ($_ =~ /^"<one>/ && $_ !~ /^"<one>(<zero>|<one>|<two>)/))
 323                         {
 324                                 $_ =~ s/^"/"<space>/;
 325                         }
 326                         push @cleaned, $_;
 327                 }
 328                 return join(";",@cleaned);
 329         }
 330         return $s;
 331 }
 332
 333 ############################
 334
 335 sub get_unidata {
 336         my $directory = shift;
 337
 338         open(FIN, "$directory/UnicodeData.txt")
 339             or die("Cannot open $directory/UnicodeData.txt");;
 340         my @lines = <FIN>;
 341         chomp(@lines);
 342         close(FIN);
 343
 344         foreach my $l (@lines) {
 345                 my @a = split(/;/, $l);
 346
 347                 $ucd{code2name}{"$a[0]"} = $a[1];       # Unicode name
 348                 $ucd{name2code}{"$a[1]"} = $a[0];       # Unicode code
 349         }
 350 }
 351
 352 sub get_utf8map {
 353         my $file = shift;
 354
 355         open(FIN, $file);
 356         my @lines = <FIN>;
 357         close(FIN);
 358         chomp(@lines);
 359
 360         my $prev_k = undef;
 361         my $prev_v = "";
 362         my $incharmap = 0;
 363         foreach my $l (@lines) {
 364                 $l =~ s/\r//;
 365                 next if ($l =~ /^\#/);
 366                 next if ($l eq "");
 367
 368                 if ($l eq "CHARMAP") {
 369                         $incharmap = 1;
 370                         next;
 371                 }
 372
 373                 next if (!$incharmap);
 374                 last if ($l eq "END CHARMAP");
 375
 376                 $l =~ /^<([^\s]+)>\s+(.*)/;
 377                 my $k = $1;
 378                 my $v = $2;
 379                 $k =~ s/_/ /g;          # unicode char string
 380                 $v =~ s/\\x//g;         # UTF-8 char code
 381                 $utf8map{$k} = $v;
 382
 383                 $utf8aliases{$k} = $prev_k if ($prev_v eq $v);
 384
 385                 $prev_v = $v;
 386                 $prev_k = $k;
 387         }
 388 }
 389
 390 sub get_encodings {
 391         my $dir = shift;
 392         foreach my $e (sort(keys(%encodings))) {
 393                 if (!open(FIN, "$dir/$e.TXT")) {
 394                         print "Cannot open charmap for $e\n";
 395                         next;
 396
 397                 }
 398                 $encodings{$e} = 1;
 399                 my @lines = <FIN>;
 400                 close(FIN);
 401                 chomp(@lines);
 402                 foreach my $l (@lines) {
 403                         $l =~ s/\r//;
 404                         next if ($l =~ /^\#/);
 405                         next if ($l eq "");
 406
 407                         my @a = split(" ", $l);
 408                         next if ($#a < 1);
 409                         $a[0] =~ s/^0[xX]//;    # local char code
 410                         $a[1] =~ s/^0[xX]//;    # unicode char code
 411                         $convertors{$e}{uc($a[1])} = uc($a[0]);
 412                 }
 413         }
 414 }
 415
 416 sub get_languages {
 417         my %data = get_xmldata($ETCDIR);
 418         %languages = %{$data{L}};
 419         %translations = %{$data{T}};
 420         %alternativemonths = %{$data{AM}};
 421         %encodings = %{$data{E}};
 422 }
 423
 424 sub transform_ctypes {
 425         # Add the C.UTF-8
 426         $languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;
 427
 428         foreach my $l (sort keys(%languages)) {
 429         foreach my $f (sort keys(%{$languages{$l}})) {
 430         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
 431                 next if (defined $languages{$l}{$f}{definitions}
 432                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
 433                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread
 434                 my $file = $l;
 435                 $file .= "_" . $f if ($f ne "x");
 436                 $file .= "_" . $c if ($c ne "x");
 437                 my $actfile = $file;
 438
 439                 my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
 440                 if (! -f $filename) {
 441                         print STDERR "Cannot open $filename\n";
 442                         next;
 443                 }
 444                 open(FIN, "$filename");
 445                 print "Reading from $filename for ${l}_${f}_${c}\n";
 446                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read
 447                 my @lines;
 448                 my $shex;
 449                 my $uhex;
 450                 while (<FIN>) {
 451                         push @lines, $_;
 452                 }
 453                 close(FIN);
 454                 $shex = sha1_hex(join("\n", @lines));
 455                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
 456                 $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;
 457                 open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src");
 458                 print FOUT @lines;
 459                 close(FOUT);
 460                 foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
 461                         next if ($enc eq $DEFENCODING);
 462                         $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
 463                         if ($file eq 'ja_JP') {
 464                                 # Override $filename for ja_JP because
 465                                 # its CTYPE is not compatible with UTF-8.
 466                                 $filename = "$UNIDIR/posix/$file.eucJP.src";
 467                         }
 468                         if (! -f $filename) {
 469                                 print STDERR "Cannot open $filename\n";
 470                                 next;
 471                         }
 472                         @lines = ();
 473                         open(FIN, "$filename");
 474                         while (<FIN>) {
 475                                 if ((/^comment_char\s/) || (/^escape_char\s/)){
 476                                         push @lines, $_;
 477                                 }
 478                                 if (/^LC_CTYPE/../^END LC_CTYPE/) {
 479                                         push @lines, $_;
 480                                 }
 481                         }
 482                         close(FIN);
 483                         $uhex = sha1_hex(join("\n", @lines) . $enc);
 484                         $languages{$l}{$f}{data}{$c}{$enc} = $uhex;
 485                         $hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;
 486                         open(FOUT, ">$TYPE.draft/$actfile.$enc.src");
 487                         print FOUT <<EOF;
 488 # Warning: Do not edit. This file is automatically extracted from the
 489 # tools in /usr/src/tools/tools/locale. The data is obtained from the
 490 # CLDR project, obtained from http://cldr.unicode.org/
 491 # -----------------------------------------------------------------------------
 492 EOF
 493                         print FOUT @lines;
 494                         close(FOUT);
 495                 }
 496         }
 497         }
 498         }
 499 }
 500
 501
 502 sub transform_collation {
 503         foreach my $l (sort keys(%languages)) {
 504         foreach my $f (sort keys(%{$languages{$l}})) {
 505         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
 506                 next if (defined $languages{$l}{$f}{definitions}
 507                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
 508                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread
 509                 my $file;
 510                 $file = $l . "_";
 511                 $file .= $f . "_" if ($f ne "x");
 512                 $file .= $c;
 513                 my $actfile = $file;
 514
 515                 my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
 516                 $filename = "$ETCDIR/$file.$DEFENCODING.src"
 517                     if (! -f $filename);
 518                 if (! -f $filename
 519                  && defined $languages{$l}{$f}{fallback}) {
 520                         $file = $languages{$l}{$f}{fallback};
 521                         $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
 522                 }
 523                 $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
 524                     if (! -f $filename);
 525                 if (! -f $filename) {
 526                         print STDERR
 527                             "Cannot open $file.$DEFENCODING.src or fallback\n";
 528                         next;
 529                 }
 530                 open(FIN, "$filename");
 531                 print "Reading from $filename for ${l}_${f}_${c}\n";
 532                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read
 533                 my @lines;
 534                 my $shex;
 535                 while (<FIN>) {
 536                         if ((/^comment_char\s/) || (/^escape_char\s/)){
 537                                 push @lines, $_;
 538                         }
 539                         if (/^LC_COLLATE/../^END LC_COLLATE/) {
 540                                 $_ =~ s/[ ]+/ /g;
 541                                 push @lines, $_;
 542                         }
 543                 }
 544                 close(FIN);
 545                 $shex = sha1_hex(join("\n", @lines));
 546                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
 547                 $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;
 548                 open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src");
 549                 print FOUT <<EOF;
 550 # Warning: Do not edit. This file is automatically extracted from the
 551 # tools in /usr/src/tools/tools/locale. The data is obtained from the
 552 # CLDR project, obtained from http://cldr.unicode.org/
 553 # -----------------------------------------------------------------------------
 554 EOF
 555                 print FOUT @lines;
 556                 close(FOUT);
 557
 558                 foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
 559                         next if ($enc eq $DEFENCODING);
 560                         copy ("$TYPE.draft/$actfile.$DEFENCODING.src",
 561                               "$TYPE.draft/$actfile.$enc.src");
 562                         $languages{$l}{$f}{data}{$c}{$enc} = $shex;
 563                         $hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
 564                 }
 565         }
 566         }
 567         }
 568 }
 569
 570 sub get_fields {
 571         foreach my $l (sort keys(%languages)) {
 572         foreach my $f (sort keys(%{$languages{$l}})) {
 573         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
 574                 next if (defined $languages{$l}{$f}{definitions}
 575                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
 576
 577                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread
 578                 my $file;
 579                 $file = $l . "_";
 580                 $file .= $f . "_" if ($f ne "x");
 581                 $file .= $c;
 582
 583                 my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
 584                 $filename = "$ETCDIR/$file.$DEFENCODING.src"
 585                     if (! -f $filename);
 586                 if (! -f $filename
 587                  && defined $languages{$l}{$f}{fallback}) {
 588                         $file = $languages{$l}{$f}{fallback};
 589                         $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
 590                 }
 591                 $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
 592                     if (! -f $filename);
 593                 if (! -f $filename) {
 594                         print STDERR
 595                             "Cannot open $file.$DEFENCODING.src or fallback\n";
 596                         next;
 597                 }
 598                 open(FIN, "$filename");
 599                 print "Reading from $filename for ${l}_${f}_${c}\n";
 600                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read
 601                 my @lines = <FIN>;
 602                 chomp(@lines);
 603                 close(FIN);
 604                 my $continue = 0;
 605                 foreach my $k (keys(%keys)) {
 606                         foreach my $line (@lines) {
 607                                 $line =~ s/\r//;
 608                                 next if (!$continue && $line !~ /^$k\s/);
 609                                 if ($continue) {
 610                                         $line =~ s/^\s+//;
 611                                 } else {
 612                                         $line =~ s/^$k\s+//;
 613                                 }
 614
 615                                 $values{$l}{$f}{$c}{$k} = ""
 616                                         if (!defined $values{$l}{$f}{$c}{$k});
 617
 618                                 $continue = ($line =~ /\/$/);
 619                                 $line =~ s/\/$// if ($continue);
 620
 621                                 while ($line =~ /_/) {
 622                                         $line =~
 623                                             s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
 624                                 }
 625                                 die "_ in data - $line" if ($line =~ /_/);
 626                                 $values{$l}{$f}{$c}{$k} .= $line;
 627
 628                                 last if (!$continue);
 629                         }
 630                 }
 631         }
 632         }
 633         }
 634 }
 635
 636 sub decodecldr {
 637         my $e = shift;
 638         my $s = shift;
 639
 640         my $v = undef;
 641
 642         if ($e eq "UTF-8") {
 643                 #
 644                 # Conversion to UTF-8 can be done from the Unicode name to
 645                 # the UTF-8 character code.
 646                 #
 647                 $v = $utf8map{$s};
 648                 die "Cannot convert $s in $e (charmap)" if (!defined $v);
 649         } else {
 650                 #
 651                 # Conversion to these encodings can be done from the Unicode
 652                 # name to Unicode code to the encodings code.
 653                 #
 654                 my $ucc = undef;
 655                 $ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s});
 656                 $ucc = $ucd{name2code}{$utf8aliases{$s}}
 657                         if (!defined $ucc
 658                          && $utf8aliases{$s}
 659                          && defined $ucd{name2code}{$utf8aliases{$s}});
 660
 661                 if (!defined $ucc) {
 662                         if (defined $translations{$e}{$s}{hex}) {
 663                                 $v = $translations{$e}{$s}{hex};
 664                                 $ucc = 0;
 665                         } elsif (defined $translations{$e}{$s}{ucc}) {
 666                                 $ucc = $translations{$e}{$s}{ucc};
 667                         }
 668                 }
 669
 670                 die "Cannot convert $s in $e (ucd string)" if (!defined $ucc);
 671                 $v = $convertors{$e}{$ucc} if (!defined $v);
 672
 673                 $v = $translations{$e}{$s}{hex}
 674                         if (!defined $v && defined $translations{$e}{$s}{hex});
 675
 676                 if (!defined $v && defined $translations{$e}{$s}{unicode}) {
 677                         my $ucn = $translations{$e}{$s}{unicode};
 678                         $ucc = $ucd{name2code}{$ucn}
 679                                 if (defined $ucd{name2code}{$ucn});
 680                         $ucc = $ucd{name2code}{$utf8aliases{$ucn}}
 681                                 if (!defined $ucc
 682                                  && defined $ucd{name2code}{$utf8aliases{$ucn}});
 683                         $v = $convertors{$e}{$ucc};
 684                 }
 685
 686                 die "Cannot convert $s in $e (charmap)" if (!defined $v);
 687         }
 688
 689         return pack("C", hex($v)) if (length($v) == 2);
 690         return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
 691                 if (length($v) == 4);
 692         return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
 693             hex(substr($v, 4, 2))) if (length($v) == 6);
 694         print STDERR "Cannot convert $e $s\n";
 695         return "length = " . length($v);
 696
 697 }
 698
 699 sub translate {
 700         my $enc = shift;
 701         my $v = shift;
 702
 703         return $translations{$enc}{$v} if (defined $translations{$enc}{$v});
 704         return undef;
 705 }
 706
 707 sub print_fields {
 708         foreach my $l (sort keys(%languages)) {
 709         foreach my $f (sort keys(%{$languages{$l}})) {
 710         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
 711                 next if (defined $languages{$l}{$f}{definitions}
 712                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
 713                 foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
 714                         if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
 715                                 print "Skipping ${l}_" .
 716                                     ($f eq "x" ? "" : "${f}_") .
 717                                     "${c} - not read\n";
 718                                 next;
 719                         }
 720                         my $file = $l;
 721                         $file .= "_" . $f if ($f ne "x");
 722                         $file .= "_" . $c;
 723                         print "Writing to $file in $enc\n";
 724
 725                         if ($enc ne $DEFENCODING &&
 726                             !defined $convertors{$enc}) {
 727                                 print "Failed! Cannot convert to $enc.\n";
 728                                 next;
 729                         };
 730
 731                         open(FOUT, ">$TYPE.draft/$file.$enc.new");
 732                         my $okay = 1;
 733                         my $output = "";
 734                         print FOUT <<EOF;
 735 # Warning: Do not edit. This file is automatically generated from the
 736 # tools in /usr/src/tools/tools/locale. The data is obtained from the
 737 # CLDR project, obtained from http://cldr.unicode.org/
 738 # -----------------------------------------------------------------------------
 739 EOF
 740                         foreach my $k (keys(%keys)) {
 741                                 my $g = $keys{$k};
 742
 743                                 die("Unknown $k in \%DESC")
 744                                         if (!defined $DESC{$k});
 745
 746                                 $output .= "#\n# $DESC{$k}\n";
 747
 748                                 # Replace one row with another
 749                                 if ($g =~ /^>/) {
 750                                         $k = substr($g, 1);
 751                                         $g = $keys{$k};
 752                                 }
 753
 754                                 # Callback function
 755                                 if ($g =~ /^\</) {
 756                                         $callback{data}{c} = $c;
 757                                         $callback{data}{k} = $k;
 758                                         $callback{data}{f} = $f;
 759                                         $callback{data}{l} = $l;
 760                                         $callback{data}{e} = $enc;
 761                                         my @a = split(/\</, substr($g, 1));
 762                                         my $rv =
 763                                             &{$callback{$a[0]}}($values{$l}{$f}{$c}{$a[1]});
 764                                         $values{$l}{$f}{$c}{$k} = $rv;
 765                                         $g = $a[2];
 766                                         $callback{data} = ();
 767                                 }
 768
 769                                 my $v = $values{$l}{$f}{$c}{$k};
 770                                 $v = "undef" if (!defined $v);
 771
 772                                 if ($g eq "i") {
 773                                         $output .= "$v\n";
 774                                         next;
 775                                 }
 776                                 if ($g eq "ai") {
 777                                         $output .= "$v\n";
 778                                         next;
 779                                 }
 780                                 if ($g eq "s") {
 781                                         $v =~ s/^"//;
 782                                         $v =~ s/"$//;
 783                                         my $cm = "";
 784                                         while ($v =~ /^(.*?)<(.*?)>(.*)/) {
 785                                                 my $p1 = $1;
 786                                                 $cm = $2;
 787                                                 my $p3 = $3;
 788
 789                                                 my $rv = decodecldr($enc, $cm);
 790 #                                               $rv = translate($enc, $cm)
 791 #                                                       if (!defined $rv);
 792                                                 if (!defined $rv) {
 793                                                         print STDERR
 794 "Could not convert $k ($cm) from $DEFENCODING to $enc\n";
 795                                                         $okay = 0;
 796                                                         next;
 797                                                 }
 798
 799                                                 $v = $p1 . $rv . $p3;
 800                                         }
 801                                         $output .= "$v\n";
 802                                         next;
 803                                 }
 804                                 if ($g eq "as") {
 805                                         foreach my $v (split(/;/, $v)) {
 806                                                 $v =~ s/^"//;
 807                                                 $v =~ s/"$//;
 808                                                 my $cm = "";
 809                                                 while ($v =~ /^(.*?)<(.*?)>(.*)/) {
 810                                                         my $p1 = $1;
 811                                                         $cm = $2;
 812                                                         my $p3 = $3;
 813
 814                                                         my $rv =
 815                                                             decodecldr($enc,
 816                                                                 $cm);
 817 #                                                       $rv = translate($enc,
 818 #                                                           $cm)
 819 #                                                           if (!defined $rv);
 820                                                         if (!defined $rv) {
 821                                                                 print STDERR
 822 "Could not convert $k ($cm) from $DEFENCODING to $enc\n";
 823                                                                 $okay = 0;
 824                                                                 next;
 825                                                         }
 826
 827                                                         $v = $1 . $rv . $3;
 828                                                 }
 829                                                 $output .= "$v\n";
 830                                         }
 831                                         next;
 832                                 }
 833
 834                                 die("$k is '$g'");
 835
 836                         }
 837
 838                         $languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output);
 839                         $hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1;
 840                         print FOUT "$output# EOF\n";
 841                         close(FOUT);
 842
 843                         if ($okay) {
 844                                 rename("$TYPE.draft/$file.$enc.new",
 845                                     "$TYPE.draft/$file.$enc.src");
 846                         } else {
 847                                 rename("$TYPE.draft/$file.$enc.new",
 848                                     "$TYPE.draft/$file.$enc.failed");
 849                         }
 850                 }
 851         }
 852         }
 853         }
 854 }
 855
 856 sub make_makefile {
 857         print "Creating Makefile for $TYPE\n";
 858         my $SRCOUT;
 859         my $SRCOUT2;
 860         my $SRCOUT3 = "";
 861         my $SRCOUT4 = "";
 862         my $MAPLOC;
 863         if ($TYPE eq "colldef") {
 864                 $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
 865                         "-i \${.IMPSRC} \\\n" .
 866                         "\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
 867                         "\${.OBJDIR}/\${.IMPSRC:T:R}";
 868                 $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
 869                                 "locale/etc/final-maps\n";
 870                 $SRCOUT2 = "LC_COLLATE";
 871                 $SRCOUT3 = "" .
 872                         ".for f t in \${LOCALES_MAPPED}\n" .
 873                         "FILES+=\t\$t.LC_COLLATE\n" .
 874                         "FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
 875                         "\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
 876                         "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
 877                         "-i \${.ALLSRC} \\\n" .
 878                         "\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
 879                         "\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
 880                         ".endfor\n\n";
 881                 $SRCOUT4 = "## LOCALES_MAPPED\n";
 882         }
 883         elsif ($TYPE eq "ctypedef") {
 884                 $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
 885                         "-w \${MAPLOC}/widths.txt \\\n" .
 886                         "\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
 887                         "\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
 888                         " || true";
 889                 $SRCOUT2 = "LC_CTYPE";
 890                 $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
 891                                 "locale/etc/final-maps\n";
 892                 $SRCOUT3 = "## SYMPAIRS\n\n" .
 893                         ".for s t in \${SYMPAIRS}\n" .
 894                         "\${t:S/src\$/LC_CTYPE/}: " .
 895                         "\$s\n" .
 896                         "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
 897                         "-w \${MAPLOC}/widths.txt \\\n" .
 898                         "\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
 899                         "\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
 900                         " || true\n" .
 901                         ".endfor\n\n";
 902         }
 903         else {
 904                 $SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
 905                 $SRCOUT2 = "out";
 906                 $MAPLOC = "";
 907         }
 908         open(FOUT, ">$TYPE.draft/Makefile");
 909         print FOUT <<EOF;
 910 # \$FreeBSD\$
 911 # Warning: Do not edit. This file is automatically generated from the
 912 # tools in /usr/src/tools/tools/locale.
 913
 914 LOCALEDIR=      \${SHAREDIR}/locale
 915 FILESNAME=      $FILESNAMES{$TYPE}
 916 .SUFFIXES:      .src .${SRCOUT2}
 917 ${MAPLOC}
 918 EOF
 919
 920         if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
 921                 print FOUT <<EOF;
 922 .include <bsd.endian.mk>
 923
 924 EOF
 925         }
 926
 927         print FOUT <<EOF;
 928 .src.${SRCOUT2}:
 929         $SRCOUT
 930
 931 ## PLACEHOLDER
 932
 933 ${SRCOUT4}
 934
 935 EOF
 936
 937         foreach my $hash (keys(%hashtable)) {
 938                 # For colldef, weight LOCALES to UTF-8
 939                 #     Sort as upper-case and reverse to achieve it
 940                 #     Make en_US, ru_RU, and ca_AD preferred
 941                 my @files;
 942                 if ($TYPE eq "colldef") {
 943                         @files = sort {
 944                                 if ($a eq 'en_x_US.UTF-8' ||
 945                                     $a eq 'ru_x_RU.UTF-8' ||
 946                                     $a eq 'ca_x_AD.UTF-8') { return -1; }
 947                                 elsif ($b eq 'en_x_US.UTF-8' ||
 948                                        $b eq 'ru_x_RU.UTF-8' ||
 949                                        $b eq 'ca_x_AD.UTF-8') { return 1; }
 950                                 else { return uc($b) cmp uc($a); }
 951                                 } keys(%{$hashtable{$hash}});
 952                 } elsif ($TYPE eq "ctypedef") {
 953                         @files = sort {
 954                                 if ($a eq 'C_x_x.UTF-8') { return -1; }
 955                                 elsif ($b eq 'C_x_x.UTF-8') { return 1; }
 956                                 if ($a =~ /^en_x_US/) { return -1; }
 957                                 elsif ($b =~ /^en_x_US/) { return 1; }
 958
 959                                 if ($a =~ /^en_x_GB.ISO8859-15/ ||
 960                                     $a =~ /^ru_x_RU/) { return -1; }
 961                                 elsif ($b =~ /^en_x_GB.ISO8859-15/ ||
 962                                        $b =~ /ru_x_RU/) { return 1; }
 963                                 else { return uc($b) cmp uc($a); }
 964
 965                                 } keys(%{$hashtable{$hash}});
 966                 } else {
 967                         @files = sort {
 968                                 if ($a =~ /_Comm_/ ||
 969                                     $b eq 'en_x_US.UTF-8') { return 1; }
 970                                 elsif ($b =~ /_Comm_/ ||
 971                                        $a eq 'en_x_US.UTF-8') { return -1; }
 972                                 else { return uc($b) cmp uc($a); }
 973                                 } keys(%{$hashtable{$hash}});
 974                 }
 975                 if ($#files > 0) {
 976                         my $link = shift(@files);
 977                         $link =~ s/_x_x//;      # special case for C
 978                         $link =~ s/_x_/_/;      # strip family if none there
 979                         foreach my $file (@files) {
 980                                 my @a = split(/_/, $file);
 981                                 my @b = split(/\./, $a[-1]);
 982                                 $file =~ s/_x_/_/;
 983                                 print FOUT "SAME+=\t\t$link $file\n";
 984                                 undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
 985                         }
 986                 }
 987         }
 988
 989         foreach my $l (sort keys(%languages)) {
 990         foreach my $f (sort keys(%{$languages{$l}})) {
 991         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
 992                 next if (defined $languages{$l}{$f}{definitions}
 993                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
 994                 if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
 995                  && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
 996                         print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
 997                             "${c} - not read\n";
 998                         next;
 999                 }
1000                 foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
1001                         my $file = $l;
1002                         $file .= "_" . $f if ($f ne "x");
1003                         $file .= "_" . $c if ($c ne "x");
1004                         next if (!defined $languages{$l}{$f}{data}{$c}{$e});
1005                         print FOUT "LOCALES+=\t$file.$e\n";
1006                 }
1007
1008                 if (defined $languages{$l}{$f}{nc_link}) {
1009                         foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
1010                                 my $file = $l . "_";
1011                                 $file .= $f . "_" if ($f ne "x");
1012                                 $file .= $c;
1013                                 print FOUT "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
1014                         }
1015                 }
1016
1017                 if (defined $languages{$l}{$f}{e_link}) {
1018                         foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
1019                                 my @a = split(/:/, $el);
1020                                 my $file = $l . "_";
1021                                 $file .= $f . "_" if ($f ne "x");
1022                                 $file .= $c;
1023                                 print FOUT "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
1024                         }
1025                 }
1026
1027         }
1028         }
1029         }
1030
1031         print FOUT <<EOF;
1032
1033 FILES=          \${LOCALES:S/\$/.${SRCOUT2}/}
1034 CLEANFILES=     \${FILES}
1035
1036 .for f t in \${SAME}
1037 SYMLINKS+=      ../\$f/\${FILESNAME} \\
1038     \${LOCALEDIR}/\$t/\${FILESNAME}
1039 .endfor
1040
1041 .for f in \${LOCALES}
1042 FILESDIR_\${f}.${SRCOUT2}= \${LOCALEDIR}/\${f}
1043 .endfor
1044
1045 ${SRCOUT3}.include <bsd.prog.mk>
1046 EOF
1047
1048         close(FOUT);
1049 }