1 #!/usr/local/bin/perl -wC
3 # SPDX-License-Identifier: BSD-2-Clause-FreeBSD
5 # Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
6 # Copyright 2015 John Marino <draco@marino.st>
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
11 # 1. Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # 2. Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
17 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 use Encode qw(encode decode);
36 print "Usage: $0 --unidir=<unidir>\n";
42 my $result = GetOptions (
43 "unidir=s" => \$UNIDIR
47 my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
49 get_utf8map("$UNIDIR/posix/UTF-8.cm");
51 parse_unidata ("$UNIDIR/UnicodeData.txt");
54 ############################
57 my @kl = split /\\x/, $_[0];
59 shift @kl if ($kl[0] eq '');
60 my $k = pack('H2' x scalar @kl, @kl);
61 my $ux = encode('UTF-32BE', decode('UTF-8', $k));
62 my $u = uc(unpack('H*', $ux));
65 # Remove leading bytes of 0
66 while ($u =~ m/^0/ and length($u) > 4) {
82 foreach my $l (@lines) {
84 next if ($l =~ /^\#/);
87 if ($l eq "CHARMAP") {
92 next if (!$incharmap);
93 last if ($l eq "END CHARMAP");
95 $l =~ /^(<[^\s]+>)\s+(.*)/;
96 my $k = utf8to32($2); # UTF-8 char code
99 # print STDERR "register: $k - $v\n";
104 sub generate_header {
105 open(FOUT, ">", "$outfilename")
106 or die ("can't write to $outfilename\n");
108 # Warning: Do not edit. This file is automatically generated from the
109 # tools in /usr/src/tools/tools/locale. The data is obtained from the
110 # CLDR project, obtained from http://cldr.unicode.org/
111 # -----------------------------------------------------------------------------
120 sub generate_footer {
121 print FOUT "\nEND LC_CTYPE\n";
132 if (($wc & ~0x7f) == 0) {
133 return sprintf "%02X", $wc;
134 } elsif (($wc & ~0x7ff) == 0) {
137 } elsif (($wc & ~0xffff) == 0) {
140 } elsif ($wc >= 0 && $wc <= 0x10ffff) {
145 for ($i = $len - 1; $i > 0; $i--) {
146 $ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
149 $ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
163 foreach my $l (@lines) {
164 my @d = split(/;/, $l, -1);
168 # XXX There are code points present in UnicodeData.txt
169 # and missing from UTF-8.cm
170 next if !defined $utf8map{$mb};
172 # Define the category
173 if ($d[2] =~ /^Lu/) {
175 } elsif ($d[2] =~ /^Ll/) {
177 } elsif ($d[2] =~ /^Nd/) {
179 } elsif ($d[2] =~ /^L/) {
181 } elsif ($d[2] =~ /^P/) {
183 } elsif ($d[2] =~ /^Co/ || $d[2] =~ /^M/ || $d[2] =~ /^N/ ||
186 } elsif ($d[2] =~ /^C/) {
188 } elsif ($d[2] =~ /^Z/) {
191 $data{$cat}{$mb}{'wc'} = $d[0];
193 # Check if it's a start or end of range
194 if ($d[1] =~ /First>$/) {
195 $data{$cat}{$mb}{'start'} = 1;
196 } elsif ($d[1] =~ /Last>$/) {
197 $data{$cat}{$mb}{'end'} = 1;
200 # Check if there's upper/lower mapping
202 $data{'toupper'}{$mb} = $d[12];
203 } elsif ($d[13] ne "") {
204 $data{'tolower'}{$mb} = $d[13];
211 # Now write out the categories
212 foreach my $cat (sort keys (%data)) {
215 foreach my $mb (sort {hex($a) <=> hex($b)} keys (%{$data{$cat}})) {
218 } elsif ($inrange == 1) {
220 die "broken range end wc=$data{$cat}{$mb}{'wc'}"
221 if !defined $data{$cat}{$mb}{'end'};
228 if ($cat eq "tolower" || $cat eq "toupper") {
229 print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
231 if (defined($data{$cat}{$mb}{'start'})) {
234 print FOUT "$utf8map{$mb}";