]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - tools/tools/locale/tools/cldr2def.pl
Update nvi to 2.2.0
[FreeBSD/FreeBSD.git] / tools / tools / locale / tools / cldr2def.pl
1 #!/usr/local/bin/perl -wC
2
3 # SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 #
5 # Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6 # Copyright 2015 John Marino <draco@marino.st>
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
10 # are met:
11 # 1. Redistributions of source code must retain the above copyright
12 #    notice, this list of conditions and the following disclaimer.
13 # 2. Redistributions in binary form must reproduce the above copyright
14 #    notice, this list of conditions and the following disclaimer in the
15 #    documentation and/or other materials provided with the distribution.
16 #
17 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 # SUCH DAMAGE.
28 #
29 # $FreeBSD$
30
31 use strict;
32 use File::Copy;
33 use XML::Parser;
34 use Tie::IxHash;
35 use Text::Iconv;
36 #use Data::Dumper;
37 use Getopt::Long;
38 use Digest::SHA qw(sha1_hex);
39 require "charmaps.pm";
40
41
42 if ($#ARGV < 2) {
43         print "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";
44         exit(1);
45 }
46
47 my $DEFENCODING = "UTF-8";
48
49 my $UNIDIR = undef;
50 my $ETCDIR = undef;
51 my $TYPE = undef;
52
53 my $result = GetOptions (
54                 "unidir=s"      => \$UNIDIR,
55                 "etc=s"         => \$ETCDIR,
56                 "type=s"        => \$TYPE,
57             );
58
59 my %convertors = ();
60
61 my %ucd = ();
62 my %values = ();
63 my %hashtable = ();
64 my %languages = ();
65 my %translations = ();
66 my %encodings = ();
67 my %alternativemonths = ();
68 get_languages();
69
70 my %utf8map = ();
71 my %utf8aliases = ();
72 get_unidata($UNIDIR);
73 get_utf8map("$UNIDIR/posix/$DEFENCODING.cm");
74 get_encodings("$ETCDIR/charmaps");
75
76 my %keys = ();
77 tie(%keys, "Tie::IxHash");
78 tie(%hashtable, "Tie::IxHash");
79
80 my %FILESNAMES = (
81         "monetdef"      => "LC_MONETARY",
82         "timedef"       => "LC_TIME",
83         "msgdef"        => "LC_MESSAGES",
84         "numericdef"    => "LC_NUMERIC",
85         "colldef"       => "LC_COLLATE",
86         "ctypedef"      => "LC_CTYPE"
87 );
88
89 my %callback = (
90         mdorder => \&callback_mdorder,
91         altmon => \&callback_altmon,
92         cformat => \&callback_cformat,
93         dformat => \&callback_dformat,
94         dtformat => \&callback_dtformat,
95         cbabmon => \&callback_abmon,
96         cbampm => \&callback_ampm,
97         data => undef,
98 );
99
100 my %DESC = (
101
102         # numericdef
103         "decimal_point" => "decimal_point",
104         "thousands_sep" => "thousands_sep",
105         "grouping"      => "grouping",
106
107         # monetdef
108         "int_curr_symbol"       => "int_curr_symbol (last character always " .
109                                    "SPACE)",
110         "currency_symbol"       => "currency_symbol",
111         "mon_decimal_point"     => "mon_decimal_point",
112         "mon_thousands_sep"     => "mon_thousands_sep",
113         "mon_grouping"          => "mon_grouping",
114         "positive_sign"         => "positive_sign",
115         "negative_sign"         => "negative_sign",
116         "int_frac_digits"       => "int_frac_digits",
117         "frac_digits"           => "frac_digits",
118         "p_cs_precedes"         => "p_cs_precedes",
119         "p_sep_by_space"        => "p_sep_by_space",
120         "n_cs_precedes"         => "n_cs_precedes",
121         "n_sep_by_space"        => "n_sep_by_space",
122         "p_sign_posn"           => "p_sign_posn",
123         "n_sign_posn"           => "n_sign_posn",
124
125         # msgdef
126         "yesexpr"       => "yesexpr",
127         "noexpr"        => "noexpr",
128         "yesstr"        => "yesstr",
129         "nostr"         => "nostr",
130
131         # timedef
132         "abmon"         => "Short month names",
133         "mon"           => "Long month names (as in a date)",
134         "abday"         => "Short weekday names",
135         "day"           => "Long weekday names",
136         "t_fmt"         => "X_fmt",
137         "d_fmt"         => "x_fmt",
138         "c_fmt"         => "c_fmt",
139         "am_pm"         => "AM/PM",
140         "d_t_fmt"       => "date_fmt",
141         "altmon"        => "Long month names (without case ending)",
142         "md_order"      => "md_order",
143         "t_fmt_ampm"    => "ampm_fmt",
144 );
145
146 if ($TYPE eq "colldef") {
147         transform_collation();
148         make_makefile();
149 }
150
151 if ($TYPE eq "ctypedef") {
152         transform_ctypes();
153         make_makefile();
154 }
155
156 if ($TYPE eq "numericdef") {
157         %keys = (
158             "decimal_point"     => "s",
159             "thousands_sep"     => "s",
160             "grouping"          => "ai",
161         );
162         get_fields();
163         print_fields();
164         make_makefile();
165 }
166
167 if ($TYPE eq "monetdef") {
168         %keys = (
169             "int_curr_symbol"   => "s",
170             "currency_symbol"   => "s",
171             "mon_decimal_point" => "s",
172             "mon_thousands_sep" => "s",
173             "mon_grouping"      => "ai",
174             "positive_sign"     => "s",
175             "negative_sign"     => "s",
176             "int_frac_digits"   => "i",
177             "frac_digits"       => "i",
178             "p_cs_precedes"     => "i",
179             "p_sep_by_space"    => "i",
180             "n_cs_precedes"     => "i",
181             "n_sep_by_space"    => "i",
182             "p_sign_posn"       => "i",
183             "n_sign_posn"       => "i"
184         );
185         get_fields();
186         print_fields();
187         make_makefile();
188 }
189
190 if ($TYPE eq "msgdef") {
191         %keys = (
192             "yesexpr"           => "s",
193             "noexpr"            => "s",
194             "yesstr"            => "s",
195             "nostr"             => "s"
196         );
197         get_fields();
198         print_fields();
199         make_makefile();
200 }
201
202 if ($TYPE eq "timedef") {
203         %keys = (
204             "abmon"             => "<cbabmon<abmon<as",
205             "mon"               => "as",
206             "abday"             => "as",
207             "day"               => "as",
208             "t_fmt"             => "s",
209             "d_fmt"             => "<dformat<d_fmt<s",
210             "c_fmt"             => "<cformat<d_t_fmt<s",
211             "am_pm"             => "<cbampm<am_pm<as",
212             "d_t_fmt"           => "<dtformat<d_t_fmt<s",
213             "altmon"            => "<altmon<mon<as",
214             "md_order"          => "<mdorder<d_fmt<s",
215             "t_fmt_ampm"        => "s",
216         );
217         get_fields();
218         print_fields();
219         make_makefile();
220 }
221
222 sub callback_ampm {
223         my $s = shift;
224         my $nl = $callback{data}{l} . "_" . $callback{data}{c};
225         my $enc = $callback{data}{e};
226
227         if ($nl eq 'ru_RU') {
228                 if ($enc eq 'UTF-8') {
229                         $s = 'дп;пп';
230                 } else {
231                         my  $converter = Text::Iconv->new("utf-8", "$enc");
232                         $s = $converter->convert("дп;пп");
233                 }
234         }
235         return $s;
236 }
237
238 sub callback_cformat {
239         my $s = shift;
240         my $nl = $callback{data}{l} . "_" . $callback{data}{c};
241
242         if ($nl eq 'ko_KR') {
243                 $s =~ s/(> )(%p)/$1%A $2/;
244         }
245         $s =~ s/\.,/\./;
246         $s =~ s/ %Z//;
247         $s =~ s/ %z//;
248         $s =~ s/^"%e\./%A %e/;
249         $s =~ s/^"(%B %e, )/"%A, $1/;
250         $s =~ s/^"(%e %B )/"%A $1/;
251         return $s;
252 };
253
254 sub callback_dformat {
255         my $s = shift;
256
257         $s =~ s/(%m(<SOLIDUS>|[-.]))%e/$1%d/;
258         $s =~ s/%e((<SOLIDUS>|[-.])%m)/%d$1/;
259         return $s;
260 };
261
262 sub callback_dtformat {
263         my $s = shift;
264         my $nl = $callback{data}{l} . "_" . $callback{data}{c};
265
266         if ($nl eq 'ja_JP') {
267                 $s =~ s/(> )(%H)/$1%A $2/;
268         } elsif ($nl eq 'ko_KR' || $nl eq 'zh_CN' || $nl eq 'zh_TW') {
269                 if ($nl ne 'ko_KR') {
270                         $s =~ s/%m/%_m/;
271                 }
272                 $s =~ s/(> )(%p)/$1%A $2/;
273         }
274         $s =~ s/\.,/\./;
275         $s =~ s/^"%e\./%A %e/;
276         $s =~ s/^"(%B %e, )/"%A, $1/;
277         $s =~ s/^"(%e %B )/"%A $1/;
278         return $s;
279 };
280
281 sub callback_mdorder {
282         my $s = shift;
283         return undef if (!defined $s);
284         $s =~ s/[^dem]//g;
285         $s =~ s/e/d/g;
286         return $s;
287 };
288
289 sub callback_altmon {
290         # if the language/country is known in %alternative months then
291         # return that, otherwise repeat mon
292         my $s = shift;
293
294         if (defined $alternativemonths{$callback{data}{l}}{$callback{data}{c}}) {
295                 my @altnames = split(";",$alternativemonths{$callback{data}{l}}{$callback{data}{c}});
296                 my @cleaned;
297                 foreach (@altnames)
298                 {
299                         $_ =~ s/^\s+//;
300                         $_ =~ s/\s+$//;
301                         push @cleaned, $_;
302                 }
303                 return join(";",@cleaned);
304         }
305
306         return $s;
307 }
308
309 sub callback_abmon {
310         # for specified CJK locales, pad result with a space to enable
311         # columns to line up (style established in FreeBSD in 2001)
312         my $s = shift;
313         my $nl = $callback{data}{l} . "_" . $callback{data}{c};
314
315         if ($nl eq 'ja_JP' || $nl eq 'ko_KR' || $nl eq 'zh_CN' ||
316             $nl eq 'zh_HK' || $nl eq 'zh_TW') {
317                 my @monthnames = split(";", $s);
318                 my @cleaned;
319                 foreach (@monthnames)
320                 {
321                         if ($_ =~ /^"<(two|three|four|five|six|seven|eight|nine)>/ ||
322                            ($_ =~ /^"<one>/ && $_ !~ /^"<one>(<zero>|<one>|<two>)/))
323                         {
324                                 $_ =~ s/^"/"<space>/;
325                         }
326                         push @cleaned, $_;
327                 }
328                 return join(";",@cleaned);
329         }
330         return $s;
331 }
332
333 ############################
334
335 sub get_unidata {
336         my $directory = shift;
337
338         open(FIN, "$directory/UnicodeData.txt")
339             or die("Cannot open $directory/UnicodeData.txt");;
340         my @lines = <FIN>;
341         chomp(@lines);
342         close(FIN);
343
344         foreach my $l (@lines) {
345                 my @a = split(/;/, $l);
346
347                 $ucd{code2name}{"$a[0]"} = $a[1];       # Unicode name
348                 $ucd{name2code}{"$a[1]"} = $a[0];       # Unicode code
349         }
350 }
351
352 sub get_utf8map {
353         my $file = shift;
354
355         open(FIN, $file);
356         my @lines = <FIN>;
357         close(FIN);
358         chomp(@lines);
359
360         my $prev_k = undef;
361         my $prev_v = "";
362         my $incharmap = 0;
363         foreach my $l (@lines) {
364                 $l =~ s/\r//;
365                 next if ($l =~ /^\#/);
366                 next if ($l eq "");
367
368                 if ($l eq "CHARMAP") {
369                         $incharmap = 1;
370                         next;
371                 }
372
373                 next if (!$incharmap);
374                 last if ($l eq "END CHARMAP");
375
376                 $l =~ /^<([^\s]+)>\s+(.*)/;
377                 my $k = $1;
378                 my $v = $2;
379                 $k =~ s/_/ /g;          # unicode char string
380                 $v =~ s/\\x//g;         # UTF-8 char code
381                 $utf8map{$k} = $v;
382
383                 $utf8aliases{$k} = $prev_k if ($prev_v eq $v);
384
385                 $prev_v = $v;
386                 $prev_k = $k;
387         }
388 }
389
390 sub get_encodings {
391         my $dir = shift;
392         foreach my $e (sort(keys(%encodings))) {
393                 if (!open(FIN, "$dir/$e.TXT")) {
394                         print "Cannot open charmap for $e\n";
395                         next;
396
397                 }
398                 $encodings{$e} = 1;
399                 my @lines = <FIN>;
400                 close(FIN);
401                 chomp(@lines);
402                 foreach my $l (@lines) {
403                         $l =~ s/\r//;
404                         next if ($l =~ /^\#/);
405                         next if ($l eq "");
406
407                         my @a = split(" ", $l);
408                         next if ($#a < 1);
409                         $a[0] =~ s/^0[xX]//;    # local char code
410                         $a[1] =~ s/^0[xX]//;    # unicode char code
411                         $convertors{$e}{uc($a[1])} = uc($a[0]);
412                 }
413         }
414 }
415
416 sub get_languages {
417         my %data = get_xmldata($ETCDIR);
418         %languages = %{$data{L}}; 
419         %translations = %{$data{T}}; 
420         %alternativemonths = %{$data{AM}}; 
421         %encodings = %{$data{E}}; 
422 }
423
424 sub transform_ctypes {
425         # Add the C.UTF-8
426         $languages{"C"}{"x"}{data}{"x"}{$DEFENCODING} = undef;
427
428         foreach my $l (sort keys(%languages)) {
429         foreach my $f (sort keys(%{$languages{$l}})) {
430         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
431                 next if (defined $languages{$l}{$f}{definitions}
432                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
433                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread
434                 my $file = $l;
435                 $file .= "_" . $f if ($f ne "x");
436                 $file .= "_" . $c if ($c ne "x");
437                 my $actfile = $file;
438
439                 my $filename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
440                 if (! -f $filename) {
441                         print STDERR "Cannot open $filename\n";
442                         next;
443                 }
444                 open(FIN, "$filename");
445                 print "Reading from $filename for ${l}_${f}_${c}\n";
446                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read
447                 my @lines;
448                 my $shex;
449                 my $uhex;
450                 while (<FIN>) {
451                         push @lines, $_;
452                 }
453                 close(FIN);
454                 $shex = sha1_hex(join("\n", @lines));
455                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
456                 $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;
457                 open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src");
458                 print FOUT @lines;
459                 close(FOUT);
460                 foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
461                         next if ($enc eq $DEFENCODING);
462                         $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
463                         if ($file eq 'ja_JP') {
464                                 # Override $filename for ja_JP because
465                                 # its CTYPE is not compatible with UTF-8.
466                                 $filename = "$UNIDIR/posix/$file.eucJP.src";
467                         }
468                         if (! -f $filename) {
469                                 print STDERR "Cannot open $filename\n";
470                                 next;
471                         }
472                         @lines = ();
473                         open(FIN, "$filename");
474                         while (<FIN>) {
475                                 if ((/^comment_char\s/) || (/^escape_char\s/)){
476                                         push @lines, $_;
477                                 }
478                                 if (/^LC_CTYPE/../^END LC_CTYPE/) {
479                                         push @lines, $_;
480                                 }
481                         }
482                         close(FIN);
483                         $uhex = sha1_hex(join("\n", @lines) . $enc);
484                         $languages{$l}{$f}{data}{$c}{$enc} = $uhex;
485                         $hashtable{$uhex}{"${l}_${f}_${c}.$enc"} = 1;
486                         open(FOUT, ">$TYPE.draft/$actfile.$enc.src");
487                         print FOUT <<EOF;
488 # Warning: Do not edit. This file is automatically extracted from the
489 # tools in /usr/src/tools/tools/locale. The data is obtained from the
490 # CLDR project, obtained from http://cldr.unicode.org/
491 # -----------------------------------------------------------------------------
492 EOF
493                         print FOUT @lines;
494                         close(FOUT);
495                 }
496         }
497         }
498         }
499 }
500
501
502 sub transform_collation {
503         foreach my $l (sort keys(%languages)) {
504         foreach my $f (sort keys(%{$languages{$l}})) {
505         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
506                 next if (defined $languages{$l}{$f}{definitions}
507                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
508                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread
509                 my $file;
510                 $file = $l . "_";
511                 $file .= $f . "_" if ($f ne "x");
512                 $file .= $c;
513                 my $actfile = $file;
514
515                 my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
516                 $filename = "$ETCDIR/$file.$DEFENCODING.src"
517                     if (! -f $filename);
518                 if (! -f $filename
519                  && defined $languages{$l}{$f}{fallback}) {
520                         $file = $languages{$l}{$f}{fallback};
521                         $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
522                 }
523                 $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
524                     if (! -f $filename);
525                 if (! -f $filename) {
526                         print STDERR
527                             "Cannot open $file.$DEFENCODING.src or fallback\n";
528                         next;
529                 }
530                 open(FIN, "$filename");
531                 print "Reading from $filename for ${l}_${f}_${c}\n";
532                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read
533                 my @lines;
534                 my $shex;
535                 while (<FIN>) {
536                         if ((/^comment_char\s/) || (/^escape_char\s/)){
537                                 push @lines, $_;
538                         }
539                         if (/^LC_COLLATE/../^END LC_COLLATE/) {
540                                 $_ =~ s/[ ]+/ /g;
541                                 push @lines, $_;
542                         }
543                 }
544                 close(FIN);
545                 $shex = sha1_hex(join("\n", @lines));
546                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = $shex;
547                 $hashtable{$shex}{"${l}_${f}_${c}.$DEFENCODING"} = 1;
548                 open(FOUT, ">$TYPE.draft/$actfile.$DEFENCODING.src");
549                 print FOUT <<EOF;
550 # Warning: Do not edit. This file is automatically extracted from the
551 # tools in /usr/src/tools/tools/locale. The data is obtained from the
552 # CLDR project, obtained from http://cldr.unicode.org/
553 # -----------------------------------------------------------------------------
554 EOF
555                 print FOUT @lines;
556                 close(FOUT);
557
558                 foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
559                         next if ($enc eq $DEFENCODING);
560                         copy ("$TYPE.draft/$actfile.$DEFENCODING.src",
561                               "$TYPE.draft/$actfile.$enc.src");
562                         $languages{$l}{$f}{data}{$c}{$enc} = $shex;
563                         $hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
564                 }
565         }
566         }
567         }
568 }
569
570 sub get_fields {
571         foreach my $l (sort keys(%languages)) {
572         foreach my $f (sort keys(%{$languages{$l}})) {
573         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
574                 next if (defined $languages{$l}{$f}{definitions}
575                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
576
577                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 0; # unread
578                 my $file;
579                 $file = $l . "_";
580                 $file .= $f . "_" if ($f ne "x");
581                 $file .= $c;
582
583                 my $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
584                 $filename = "$ETCDIR/$file.$DEFENCODING.src"
585                     if (! -f $filename);
586                 if (! -f $filename
587                  && defined $languages{$l}{$f}{fallback}) {
588                         $file = $languages{$l}{$f}{fallback};
589                         $filename = "$UNIDIR/posix/$file.$DEFENCODING.src";
590                 }
591                 $filename = "$UNIDIR/posix/$file.$DEFENCODING.src"
592                     if (! -f $filename);
593                 if (! -f $filename) {
594                         print STDERR
595                             "Cannot open $file.$DEFENCODING.src or fallback\n";
596                         next;
597                 }
598                 open(FIN, "$filename");
599                 print "Reading from $filename for ${l}_${f}_${c}\n";
600                 $languages{$l}{$f}{data}{$c}{$DEFENCODING} = 1; # read
601                 my @lines = <FIN>;
602                 chomp(@lines);
603                 close(FIN);
604                 my $continue = 0;
605                 foreach my $k (keys(%keys)) {
606                         foreach my $line (@lines) {
607                                 $line =~ s/\r//;
608                                 next if (!$continue && $line !~ /^$k\s/);
609                                 if ($continue) {
610                                         $line =~ s/^\s+//;
611                                 } else {
612                                         $line =~ s/^$k\s+//;
613                                 }
614
615                                 $values{$l}{$f}{$c}{$k} = ""
616                                         if (!defined $values{$l}{$f}{$c}{$k});
617
618                                 $continue = ($line =~ /\/$/);
619                                 $line =~ s/\/$// if ($continue);
620
621                                 while ($line =~ /_/) {
622                                         $line =~
623                                             s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
624                                 }
625                                 die "_ in data - $line" if ($line =~ /_/);
626                                 $values{$l}{$f}{$c}{$k} .= $line;
627
628                                 last if (!$continue);
629                         }
630                 }
631         }
632         }
633         }
634 }
635
636 sub decodecldr {
637         my $e = shift;
638         my $s = shift;
639
640         my $v = undef;
641
642         if ($e eq "UTF-8") {
643                 #
644                 # Conversion to UTF-8 can be done from the Unicode name to
645                 # the UTF-8 character code.
646                 #
647                 $v = $utf8map{$s};
648                 die "Cannot convert $s in $e (charmap)" if (!defined $v);
649         } else {
650                 #
651                 # Conversion to these encodings can be done from the Unicode
652                 # name to Unicode code to the encodings code.
653                 #
654                 my $ucc = undef;
655                 $ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s});
656                 $ucc = $ucd{name2code}{$utf8aliases{$s}}
657                         if (!defined $ucc
658                          && $utf8aliases{$s}
659                          && defined $ucd{name2code}{$utf8aliases{$s}});
660
661                 if (!defined $ucc) {
662                         if (defined $translations{$e}{$s}{hex}) {
663                                 $v = $translations{$e}{$s}{hex};
664                                 $ucc = 0;
665                         } elsif (defined $translations{$e}{$s}{ucc}) {
666                                 $ucc = $translations{$e}{$s}{ucc};
667                         }
668                 }
669
670                 die "Cannot convert $s in $e (ucd string)" if (!defined $ucc);
671                 $v = $convertors{$e}{$ucc} if (!defined $v);
672
673                 $v = $translations{$e}{$s}{hex}
674                         if (!defined $v && defined $translations{$e}{$s}{hex});
675
676                 if (!defined $v && defined $translations{$e}{$s}{unicode}) {
677                         my $ucn = $translations{$e}{$s}{unicode};
678                         $ucc = $ucd{name2code}{$ucn}
679                                 if (defined $ucd{name2code}{$ucn});
680                         $ucc = $ucd{name2code}{$utf8aliases{$ucn}}
681                                 if (!defined $ucc
682                                  && defined $ucd{name2code}{$utf8aliases{$ucn}});
683                         $v = $convertors{$e}{$ucc};
684                 }
685
686                 die "Cannot convert $s in $e (charmap)" if (!defined $v);
687         }
688
689         return pack("C", hex($v)) if (length($v) == 2);
690         return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
691                 if (length($v) == 4);
692         return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
693             hex(substr($v, 4, 2))) if (length($v) == 6);
694         print STDERR "Cannot convert $e $s\n";
695         return "length = " . length($v);
696
697 }
698
699 sub translate {
700         my $enc = shift;
701         my $v = shift;
702
703         return $translations{$enc}{$v} if (defined $translations{$enc}{$v});
704         return undef;
705 }
706
707 sub print_fields {
708         foreach my $l (sort keys(%languages)) {
709         foreach my $f (sort keys(%{$languages{$l}})) {
710         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
711                 next if (defined $languages{$l}{$f}{definitions}
712                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
713                 foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
714                         if ($languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
715                                 print "Skipping ${l}_" .
716                                     ($f eq "x" ? "" : "${f}_") .
717                                     "${c} - not read\n";
718                                 next;
719                         }
720                         my $file = $l;
721                         $file .= "_" . $f if ($f ne "x");
722                         $file .= "_" . $c;
723                         print "Writing to $file in $enc\n";
724
725                         if ($enc ne $DEFENCODING &&
726                             !defined $convertors{$enc}) {
727                                 print "Failed! Cannot convert to $enc.\n";
728                                 next;
729                         };
730
731                         open(FOUT, ">$TYPE.draft/$file.$enc.new");
732                         my $okay = 1;
733                         my $output = "";
734                         print FOUT <<EOF;
735 # Warning: Do not edit. This file is automatically generated from the
736 # tools in /usr/src/tools/tools/locale. The data is obtained from the
737 # CLDR project, obtained from http://cldr.unicode.org/
738 # -----------------------------------------------------------------------------
739 EOF
740                         foreach my $k (keys(%keys)) {
741                                 my $g = $keys{$k};
742
743                                 die("Unknown $k in \%DESC")
744                                         if (!defined $DESC{$k});
745
746                                 $output .= "#\n# $DESC{$k}\n";
747
748                                 # Replace one row with another
749                                 if ($g =~ /^>/) {
750                                         $k = substr($g, 1);
751                                         $g = $keys{$k};
752                                 }
753
754                                 # Callback function
755                                 if ($g =~ /^\</) {
756                                         $callback{data}{c} = $c;
757                                         $callback{data}{k} = $k;
758                                         $callback{data}{f} = $f;
759                                         $callback{data}{l} = $l;
760                                         $callback{data}{e} = $enc;
761                                         my @a = split(/\</, substr($g, 1));
762                                         my $rv =
763                                             &{$callback{$a[0]}}($values{$l}{$f}{$c}{$a[1]});
764                                         $values{$l}{$f}{$c}{$k} = $rv;
765                                         $g = $a[2];
766                                         $callback{data} = ();
767                                 }
768
769                                 my $v = $values{$l}{$f}{$c}{$k};
770                                 $v = "undef" if (!defined $v);
771
772                                 if ($g eq "i") {
773                                         $output .= "$v\n";
774                                         next;
775                                 }
776                                 if ($g eq "ai") {
777                                         $output .= "$v\n";
778                                         next;
779                                 }
780                                 if ($g eq "s") {
781                                         $v =~ s/^"//;
782                                         $v =~ s/"$//;
783                                         my $cm = "";
784                                         while ($v =~ /^(.*?)<(.*?)>(.*)/) {
785                                                 my $p1 = $1;
786                                                 $cm = $2;
787                                                 my $p3 = $3;
788
789                                                 my $rv = decodecldr($enc, $cm);
790 #                                               $rv = translate($enc, $cm)
791 #                                                       if (!defined $rv);
792                                                 if (!defined $rv) {
793                                                         print STDERR 
794 "Could not convert $k ($cm) from $DEFENCODING to $enc\n";
795                                                         $okay = 0;
796                                                         next;
797                                                 }
798
799                                                 $v = $p1 . $rv . $p3;
800                                         }
801                                         $output .= "$v\n";
802                                         next;
803                                 }
804                                 if ($g eq "as") {
805                                         foreach my $v (split(/;/, $v)) {
806                                                 $v =~ s/^"//;
807                                                 $v =~ s/"$//;
808                                                 my $cm = "";
809                                                 while ($v =~ /^(.*?)<(.*?)>(.*)/) {
810                                                         my $p1 = $1;
811                                                         $cm = $2;
812                                                         my $p3 = $3;
813
814                                                         my $rv =
815                                                             decodecldr($enc,
816                                                                 $cm);
817 #                                                       $rv = translate($enc,
818 #                                                           $cm)
819 #                                                           if (!defined $rv);
820                                                         if (!defined $rv) {
821                                                                 print STDERR 
822 "Could not convert $k ($cm) from $DEFENCODING to $enc\n";
823                                                                 $okay = 0;
824                                                                 next;
825                                                         }
826
827                                                         $v = $1 . $rv . $3;
828                                                 }
829                                                 $output .= "$v\n";
830                                         }
831                                         next;
832                                 }
833
834                                 die("$k is '$g'");
835
836                         }
837
838                         $languages{$l}{$f}{data}{$c}{$enc} = sha1_hex($output);
839                         $hashtable{sha1_hex($output)}{"${l}_${f}_${c}.$enc"} = 1;
840                         print FOUT "$output# EOF\n";
841                         close(FOUT);
842
843                         if ($okay) {
844                                 rename("$TYPE.draft/$file.$enc.new",
845                                     "$TYPE.draft/$file.$enc.src");
846                         } else {
847                                 rename("$TYPE.draft/$file.$enc.new",
848                                     "$TYPE.draft/$file.$enc.failed");
849                         }
850                 }
851         }
852         }
853         }
854 }
855
856 sub make_makefile {
857         print "Creating Makefile for $TYPE\n";
858         my $SRCOUT;
859         my $SRCOUT2;
860         my $SRCOUT3 = "";
861         my $SRCOUT4 = "";
862         my $MAPLOC;
863         if ($TYPE eq "colldef") {
864                 $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U " .
865                         "-i \${.IMPSRC} \\\n" .
866                         "\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} " .
867                         "\${.OBJDIR}/\${.IMPSRC:T:R}";
868                 $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
869                                 "locale/etc/final-maps\n";
870                 $SRCOUT2 = "LC_COLLATE";
871                 $SRCOUT3 = "" .
872                         ".for f t in \${LOCALES_MAPPED}\n" .
873                         "FILES+=\t\$t.LC_COLLATE\n" .
874                         "FILESDIR_\$t.LC_COLLATE=\t\${LOCALEDIR}/\$t\n" .
875                         "\$t.LC_COLLATE: \${.CURDIR}/\$f.src\n" .
876                         "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U " .
877                         "-i \${.ALLSRC} \\\n" .
878                         "\t\t-f \${MAPLOC}/map.\${.TARGET:T:R:E:C/@.*//} \\\n" .
879                         "\t\t\${.OBJDIR}/\${.TARGET:T:R}\n" .
880                         ".endfor\n\n";
881                 $SRCOUT4 = "## LOCALES_MAPPED\n";
882         }
883         elsif ($TYPE eq "ctypedef") {
884                 $SRCOUT = "localedef \${LOCALEDEF_ENDIAN} -D -U -c " .
885                         "-w \${MAPLOC}/widths.txt \\\n" .
886                         "\t-f \${MAPLOC}/map.\${.IMPSRC:T:R:E} " .
887                         "\\\n\t-i \${.IMPSRC} \${.OBJDIR}/\${.IMPSRC:T:R} " .
888                         " || true";
889                 $SRCOUT2 = "LC_CTYPE";
890                 $MAPLOC = "MAPLOC=\t\t\${.CURDIR}/../../tools/tools/" .
891                                 "locale/etc/final-maps\n";
892                 $SRCOUT3 = "## SYMPAIRS\n\n" .
893                         ".for s t in \${SYMPAIRS}\n" .
894                         "\${t:S/src\$/LC_CTYPE/}: " .
895                         "\$s\n" .
896                         "\tlocaledef \${LOCALEDEF_ENDIAN} -D -U -c " .
897                         "-w \${MAPLOC}/widths.txt \\\n" .
898                         "\t-f \${MAPLOC}/map.\${.TARGET:T:R:C/^.*\\.//} " .
899                         "\\\n\t-i \${.ALLSRC} \${.OBJDIR}/\${.TARGET:T:R} " .
900                         " || true\n" .
901                         ".endfor\n\n";
902         }
903         else {
904                 $SRCOUT = "grep -v -E '^(\#\$\$|\#[ ])' < \${.IMPSRC} > \${.TARGET}";
905                 $SRCOUT2 = "out";
906                 $MAPLOC = "";
907         }
908         open(FOUT, ">$TYPE.draft/Makefile");
909         print FOUT <<EOF;
910 # \$FreeBSD\$
911 # Warning: Do not edit. This file is automatically generated from the
912 # tools in /usr/src/tools/tools/locale.
913
914 LOCALEDIR=      \${SHAREDIR}/locale
915 FILESNAME=      $FILESNAMES{$TYPE}
916 .SUFFIXES:      .src .${SRCOUT2}
917 ${MAPLOC}
918 EOF
919
920         if ($TYPE eq "colldef" || $TYPE eq "ctypedef") {
921                 print FOUT <<EOF;
922 .include <bsd.endian.mk>
923
924 EOF
925         }
926
927         print FOUT <<EOF;
928 .src.${SRCOUT2}:
929         $SRCOUT
930
931 ## PLACEHOLDER
932
933 ${SRCOUT4}
934
935 EOF
936
937         foreach my $hash (keys(%hashtable)) {
938                 # For colldef, weight LOCALES to UTF-8
939                 #     Sort as upper-case and reverse to achieve it
940                 #     Make en_US, ru_RU, and ca_AD preferred
941                 my @files;
942                 if ($TYPE eq "colldef") {
943                         @files = sort {
944                                 if ($a eq 'en_x_US.UTF-8' ||
945                                     $a eq 'ru_x_RU.UTF-8' ||
946                                     $a eq 'ca_x_AD.UTF-8') { return -1; }
947                                 elsif ($b eq 'en_x_US.UTF-8' ||
948                                        $b eq 'ru_x_RU.UTF-8' ||
949                                        $b eq 'ca_x_AD.UTF-8') { return 1; }
950                                 else { return uc($b) cmp uc($a); }
951                                 } keys(%{$hashtable{$hash}});
952                 } elsif ($TYPE eq "ctypedef") {
953                         @files = sort {
954                                 if ($a eq 'C_x_x.UTF-8') { return -1; }
955                                 elsif ($b eq 'C_x_x.UTF-8') { return 1; }
956                                 if ($a =~ /^en_x_US/) { return -1; }
957                                 elsif ($b =~ /^en_x_US/) { return 1; }
958
959                                 if ($a =~ /^en_x_GB.ISO8859-15/ ||
960                                     $a =~ /^ru_x_RU/) { return -1; }
961                                 elsif ($b =~ /^en_x_GB.ISO8859-15/ ||
962                                        $b =~ /ru_x_RU/) { return 1; }
963                                 else { return uc($b) cmp uc($a); }
964
965                                 } keys(%{$hashtable{$hash}});
966                 } else {
967                         @files = sort {
968                                 if ($a =~ /_Comm_/ ||
969                                     $b eq 'en_x_US.UTF-8') { return 1; }
970                                 elsif ($b =~ /_Comm_/ ||
971                                        $a eq 'en_x_US.UTF-8') { return -1; }
972                                 else { return uc($b) cmp uc($a); }
973                                 } keys(%{$hashtable{$hash}});
974                 }
975                 if ($#files > 0) {
976                         my $link = shift(@files);
977                         $link =~ s/_x_x//;      # special case for C
978                         $link =~ s/_x_/_/;      # strip family if none there
979                         foreach my $file (@files) {
980                                 my @a = split(/_/, $file);
981                                 my @b = split(/\./, $a[-1]);
982                                 $file =~ s/_x_/_/;
983                                 print FOUT "SAME+=\t\t$link $file\n";
984                                 undef($languages{$a[0]}{$a[1]}{data}{$b[0]}{$b[1]});
985                         }
986                 }
987         }
988
989         foreach my $l (sort keys(%languages)) {
990         foreach my $f (sort keys(%{$languages{$l}})) {
991         foreach my $c (sort keys(%{$languages{$l}{$f}{data}})) {
992                 next if (defined $languages{$l}{$f}{definitions}
993                     && $languages{$l}{$f}{definitions} !~ /$TYPE/);
994                 if (defined $languages{$l}{$f}{data}{$c}{$DEFENCODING}
995                  && $languages{$l}{$f}{data}{$c}{$DEFENCODING} eq "0") {
996                         print "Skipping ${l}_" . ($f eq "x" ? "" : "${f}_") .
997                             "${c} - not read\n";
998                         next;
999                 }
1000                 foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
1001                         my $file = $l;
1002                         $file .= "_" . $f if ($f ne "x");
1003                         $file .= "_" . $c if ($c ne "x");
1004                         next if (!defined $languages{$l}{$f}{data}{$c}{$e});
1005                         print FOUT "LOCALES+=\t$file.$e\n";
1006                 }
1007
1008                 if (defined $languages{$l}{$f}{nc_link}) {
1009                         foreach my $e (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
1010                                 my $file = $l . "_";
1011                                 $file .= $f . "_" if ($f ne "x");
1012                                 $file .= $c;
1013                                 print FOUT "SAME+=\t\t$file.$e $languages{$l}{$f}{nc_link}.$e\t# legacy (lang/country change)\n";
1014                         }
1015                 }
1016
1017                 if (defined $languages{$l}{$f}{e_link}) {
1018                         foreach my $el (split(" ", $languages{$l}{$f}{e_link})) {
1019                                 my @a = split(/:/, $el);
1020                                 my $file = $l . "_";
1021                                 $file .= $f . "_" if ($f ne "x");
1022                                 $file .= $c;
1023                                 print FOUT "SAME+=\t\t$file.$a[0] $file.$a[1]\t# legacy (same charset)\n";
1024                         }
1025                 }
1026
1027         }
1028         }
1029         }
1030
1031         print FOUT <<EOF;
1032
1033 FILES=          \${LOCALES:S/\$/.${SRCOUT2}/}
1034 CLEANFILES=     \${FILES}
1035
1036 .for f t in \${SAME}
1037 SYMLINKS+=      ../\$f/\${FILESNAME} \\
1038     \${LOCALEDIR}/\$t/\${FILESNAME}
1039 .endfor
1040
1041 .for f in \${LOCALES}
1042 FILESDIR_\${f}.${SRCOUT2}= \${LOCALEDIR}/\${f}
1043 .endfor
1044
1045 ${SRCOUT3}.include <bsd.prog.mk>
1046 EOF
1047
1048         close(FOUT);
1049 }