]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - lib/libc/locale/collate.c
Merge ^/head r339015 through r339669.
[FreeBSD/FreeBSD.git] / lib / libc / locale / collate.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright 2014 Garrett D'Amore <garrett@damore.org>
5  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
6  * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
7  *              at Electronni Visti IA, Kiev, Ukraine.
8  *                      All rights reserved.
9  *
10  * Copyright (c) 2011 The FreeBSD Foundation
11  * All rights reserved.
12  * Portions of this software were developed by David Chisnall
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * Redistribution and use in source and binary forms, with or without
16  * modification, are permitted provided that the following conditions
17  * are met:
18  * 1. Redistributions of source code must retain the above copyright
19  *    notice, this list of conditions and the following disclaimer.
20  * 2. Redistributions in binary form must reproduce the above copyright
21  *    notice, this list of conditions and the following disclaimer in the
22  *    documentation and/or other materials provided with the distribution.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  * Adapted to xlocale by John Marino <draco@marino.st>
37  */
38
39 #include <sys/cdefs.h>
40 __FBSDID("$FreeBSD$");
41
42 #include "namespace.h"
43
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/mman.h>
47
48 #include <assert.h>
49 #include <stdio.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <wchar.h>
53 #include <errno.h>
54 #include <unistd.h>
55 #include <fcntl.h>
56 #include "un-namespace.h"
57
58 #include "collate.h"
59 #include "setlocale.h"
60 #include "ldpart.h"
61 #include "libc_private.h"
62
63 struct xlocale_collate __xlocale_global_collate = {
64         {{0}, "C"}, 1, 0, 0, 0
65 };
66
67 struct xlocale_collate __xlocale_C_collate = {
68         {{0}, "C"}, 1, 0, 0, 0
69 };
70
71 static int
72 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table);
73
74 static void
75 destruct_collate(void *t)
76 {
77         struct xlocale_collate *table = t;
78         if (table->map && (table->maplen > 0)) {
79                 (void) munmap(table->map, table->maplen);
80         }
81         free(t);
82 }
83
84 void *
85 __collate_load(const char *encoding, __unused locale_t unused)
86 {
87         if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
88                 return &__xlocale_C_collate;
89         }
90         struct xlocale_collate *table = calloc(sizeof(struct xlocale_collate), 1);
91         table->header.header.destructor = destruct_collate;
92         // FIXME: Make sure that _LDP_CACHE is never returned.  We should be doing
93         // the caching outside of this section
94         if (__collate_load_tables_l(encoding, table) != _LDP_LOADED) {
95                 xlocale_release(table);
96                 return NULL;
97         }
98         return table;
99 }
100
101 /**
102  * Load the collation tables for the specified encoding into the global table.
103  */
104 int
105 __collate_load_tables(const char *encoding)
106 {
107
108         return (__collate_load_tables_l(encoding, &__xlocale_global_collate));
109 }
110
111 int
112 __collate_load_tables_l(const char *encoding, struct xlocale_collate *table)
113 {
114         int i, chains, z;
115         char *buf;
116         char *TMP;
117         char *map;
118         collate_info_t *info;
119         struct stat sbuf;
120         int fd;
121
122         table->__collate_load_error = 1;
123
124         /* 'encoding' must be already checked. */
125         if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) {
126                 return (_LDP_CACHE);
127         }
128
129         if (asprintf(&buf, "%s/%s/LC_COLLATE", _PathLocale, encoding) == -1)
130                 return (_LDP_ERROR);
131
132         if ((fd = _open(buf, O_RDONLY)) < 0) {
133                 free(buf);
134                 return (_LDP_ERROR);
135         }
136         free(buf);
137         if (_fstat(fd, &sbuf) < 0) {
138                 (void) _close(fd);
139                 return (_LDP_ERROR);
140         }
141         if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) {
142                 (void) _close(fd);
143                 errno = EINVAL;
144                 return (_LDP_ERROR);
145         }
146         map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
147         (void) _close(fd);
148         if ((TMP = map) == NULL) {
149                 return (_LDP_ERROR);
150         }
151
152         if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) {
153                 (void) munmap(map, sbuf.st_size);
154                 errno = EINVAL;
155                 return (_LDP_ERROR);
156         }
157         TMP += COLLATE_STR_LEN;
158
159         info = (void *)TMP;
160         TMP += sizeof (*info);
161
162         if ((info->directive_count < 1) ||
163             (info->directive_count >= COLL_WEIGHTS_MAX) ||
164             ((chains = info->chain_count) < 0)) {
165                 (void) munmap(map, sbuf.st_size);
166                 errno = EINVAL;
167                 return (_LDP_ERROR);
168         }
169
170         i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
171             (sizeof (collate_chain_t) * chains) +
172             (sizeof (collate_large_t) * info->large_count);
173         for (z = 0; z < info->directive_count; z++) {
174                 i += sizeof (collate_subst_t) * info->subst_count[z];
175         }
176         if (i != (sbuf.st_size - (TMP - map))) {
177                 (void) munmap(map, sbuf.st_size);
178                 errno = EINVAL;
179                 return (_LDP_ERROR);
180         }
181
182         table->info = info;
183         table->char_pri_table = (void *)TMP;
184         TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
185
186         for (z = 0; z < info->directive_count; z++) {
187                 if (info->subst_count[z] > 0) {
188                         table->subst_table[z] = (void *)TMP;
189                         TMP += info->subst_count[z] * sizeof (collate_subst_t);
190                 } else {
191                         table->subst_table[z] = NULL;
192                 }
193         }
194
195         if (chains > 0) {
196                 table->chain_pri_table = (void *)TMP;
197                 TMP += chains * sizeof (collate_chain_t);
198         } else
199                 table->chain_pri_table = NULL;
200         if (info->large_count > 0)
201                 table->large_pri_table = (void *)TMP;
202         else
203                 table->large_pri_table = NULL;
204
205         table->__collate_load_error = 0;
206         return (_LDP_LOADED);
207 }
208
209 static const int32_t *
210 substsearch(struct xlocale_collate *table, const wchar_t key, int pass)
211 {
212         const collate_subst_t *p;
213         int n = table->info->subst_count[pass];
214
215         if (n == 0)
216                 return (NULL);
217
218         if (pass >= table->info->directive_count)
219                 return (NULL);
220
221         if (!(key & COLLATE_SUBST_PRIORITY))
222                 return (NULL);
223
224         p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
225         assert(p->key == key);
226
227         return (p->pri);
228 }
229
230 static collate_chain_t *
231 chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len)
232 {
233         int low = 0;
234         int high = table->info->chain_count - 1;
235         int next, compar, l;
236         collate_chain_t *p;
237         collate_chain_t *tab = table->chain_pri_table;
238
239         if (high < 0)
240                 return (NULL);
241
242         while (low <= high) {
243                 next = (low + high) / 2;
244                 p = tab + next;
245                 compar = *key - *p->str;
246                 if (compar == 0) {
247                         l = wcsnlen(p->str, COLLATE_STR_LEN);
248                         compar = wcsncmp(key, p->str, l);
249                         if (compar == 0) {
250                                 *len = l;
251                                 return (p);
252                         }
253                 }
254                 if (compar > 0)
255                         low = next + 1;
256                 else
257                         high = next - 1;
258         }
259         return (NULL);
260 }
261
262 static collate_large_t *
263 largesearch(struct xlocale_collate *table, const wchar_t key)
264 {
265         int low = 0;
266         int high = table->info->large_count - 1;
267         int next, compar;
268         collate_large_t *p;
269         collate_large_t *tab = table->large_pri_table;
270
271         if (high < 0)
272                 return (NULL);
273
274         while (low <= high) {
275                 next = (low + high) / 2;
276                 p = tab + next;
277                 compar = key - p->val;
278                 if (compar == 0)
279                         return (p);
280                 if (compar > 0)
281                         low = next + 1;
282                 else
283                         high = next - 1;
284         }
285         return (NULL);
286 }
287
288 void
289 _collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len,
290     int *pri, int which, const int **state)
291 {
292         collate_chain_t *p2;
293         collate_large_t *match;
294         int p, l;
295         const int *sptr;
296
297         /*
298          * If this is the "last" pass for the UNDEFINED, then
299          * we just return the priority itself.
300          */
301         if (which >= table->info->directive_count) {
302                 *pri = *t;
303                 *len = 1;
304                 *state = NULL;
305                 return;
306         }
307
308         /*
309          * If we have remaining substitution data from a previous
310          * call, consume it first.
311          */
312         if ((sptr = *state) != NULL) {
313                 *pri = *sptr;
314                 sptr++;
315                 if ((sptr == *state) || (sptr == NULL))
316                         *state = NULL;
317                 else
318                         *state = sptr;
319                 *len = 0;
320                 return;
321         }
322
323         /* No active substitutions */
324         *len = 1;
325
326         /*
327          * Check for composites such as diphthongs that collate as a
328          * single element (aka chains or collating-elements).
329          */
330         if (((p2 = chainsearch(table, t, &l)) != NULL) &&
331             ((p = p2->pri[which]) >= 0)) {
332
333                 *len = l;
334                 *pri = p;
335
336         } else if (*t <= UCHAR_MAX) {
337
338                 /*
339                  * Character is a small (8-bit) character.
340                  * We just look these up directly for speed.
341                  */
342                 *pri = table->char_pri_table[*t].pri[which];
343
344         } else if ((table->info->large_count > 0) &&
345             ((match = largesearch(table, *t)) != NULL)) {
346
347                 /*
348                  * Character was found in the extended table.
349                  */
350                 *pri = match->pri.pri[which];
351
352         } else {
353                 /*
354                  * Character lacks a specific definition.
355                  */
356                 if (table->info->directive[which] & DIRECTIVE_UNDEFINED) {
357                         /* Mask off sign bit to prevent ordering confusion. */
358                         *pri = (*t & COLLATE_MAX_PRIORITY);
359                 } else {
360                         *pri = table->info->undef_pri[which];
361                 }
362                 /* No substitutions for undefined characters! */
363                 return;
364         }
365
366         /*
367          * Try substituting (expanding) the character.  We are
368          * currently doing this *after* the chain compression.  I
369          * think it should not matter, but this way might be slightly
370          * faster.
371          *
372          * We do this after the priority search, as this will help us
373          * to identify a single key value.  In order for this to work,
374          * its important that the priority assigned to a given element
375          * to be substituted be unique for that level.  The localedef
376          * code ensures this for us.
377          */
378         if ((sptr = substsearch(table, *pri, which)) != NULL) {
379                 if ((*pri = *sptr) > 0) {
380                         sptr++;
381                         *state = *sptr ? sptr : NULL;
382                 }
383         }
384
385 }
386
387 /*
388  * This is the meaty part of wcsxfrm & strxfrm.  Note that it does
389  * NOT NULL terminate.  That is left to the caller.
390  */
391 size_t
392 _collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf,
393     size_t room)
394 {
395         int             pri;
396         int             len;
397         const wchar_t   *t;
398         wchar_t         *tr = NULL;
399         int             direc;
400         int             pass;
401         const int32_t   *state;
402         size_t          want = 0;
403         size_t          need = 0;
404         int             ndir = table->info->directive_count;
405
406         assert(src);
407
408         for (pass = 0; pass <= ndir; pass++) {
409
410                 state = NULL;
411
412                 if (pass != 0) {
413                         /* insert level separator from the previous pass */
414                         if (room) {
415                                 *xf++ = 1;
416                                 room--;
417                         }
418                         want++;
419                 }
420
421                 /* special pass for undefined */
422                 if (pass == ndir) {
423                         direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
424                 } else {
425                         direc = table->info->directive[pass];
426                 }
427
428                 t = src;
429
430                 if (direc & DIRECTIVE_BACKWARD) {
431                         wchar_t *bp, *fp, c;
432                         free(tr);
433                         if ((tr = wcsdup(t)) == NULL) {
434                                 errno = ENOMEM;
435                                 goto fail;
436                         }
437                         bp = tr;
438                         fp = tr + wcslen(tr) - 1;
439                         while (bp < fp) {
440                                 c = *bp;
441                                 *bp++ = *fp;
442                                 *fp-- = c;
443                         }
444                         t = (const wchar_t *)tr;
445                 }
446
447                 if (direc & DIRECTIVE_POSITION) {
448                         while (*t || state) {
449                                 _collate_lookup(table, t, &len, &pri, pass, &state);
450                                 t += len;
451                                 if (pri <= 0) {
452                                         if (pri < 0) {
453                                                 errno = EINVAL;
454                                                 goto fail;
455                                         }
456                                         state = NULL;
457                                         pri = COLLATE_MAX_PRIORITY;
458                                 }
459                                 if (room) {
460                                         *xf++ = pri;
461                                         room--;
462                                 }
463                                 want++;
464                                 need = want;
465                         }
466                 } else {
467                         while (*t || state) {
468                                 _collate_lookup(table, t, &len, &pri, pass, &state);
469                                 t += len;
470                                 if (pri <= 0) {
471                                         if (pri < 0) {
472                                                 errno = EINVAL;
473                                                 goto fail;
474                                         }
475                                         state = NULL;
476                                         continue;
477                                 }
478                                 if (room) {
479                                         *xf++ = pri;
480                                         room--;
481                                 }
482                                 want++;
483                                 need = want;
484                         }
485                 }
486         }
487         free(tr);
488         return (need);
489
490 fail:
491         free(tr);
492         return ((size_t)(-1));
493 }
494
495 /*
496  * In the non-POSIX case, we transform each character into a string of
497  * characters representing the character's priority.  Since char is usually
498  * signed, we are limited by 7 bits per byte.  To avoid zero, we need to add
499  * XFRM_OFFSET, so we can't use a full 7 bits.  For simplicity, we choose 6
500  * bits per byte.
501  *
502  * It turns out that we sometimes have real priorities that are
503  * 31-bits wide.  (But: be careful using priorities where the high
504  * order bit is set -- i.e. the priority is negative.  The sort order
505  * may be surprising!)
506  *
507  * TODO: This would be a good area to optimize somewhat.  It turns out
508  * that real prioririties *except for the last UNDEFINED pass* are generally
509  * very small.  We need the localedef code to precalculate the max
510  * priority for us, and ideally also give us a mask, and then we could
511  * severely limit what we expand to.
512  */
513 #define XFRM_BYTES      6
514 #define XFRM_OFFSET     ('0')   /* make all printable characters */
515 #define XFRM_SHIFT      6
516 #define XFRM_MASK       ((1 << XFRM_SHIFT) - 1)
517 #define XFRM_SEP        ('.')   /* chosen to be less than XFRM_OFFSET */
518
519 static int
520 xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass)
521 {
522         /* we use unsigned to ensure zero fill on right shift */
523         uint32_t val = (uint32_t)table->info->pri_count[pass];
524         int nc = 0;
525
526         while (val) {
527                 *p = (pri & XFRM_MASK) + XFRM_OFFSET;
528                 pri >>= XFRM_SHIFT;
529                 val >>= XFRM_SHIFT;
530                 p++;
531                 nc++;
532         }
533         return (nc);
534 }
535
536 size_t
537 _collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf,
538     size_t room)
539 {
540         int             pri;
541         int             len;
542         const wchar_t   *t;
543         wchar_t         *tr = NULL;
544         int             direc;
545         int             pass;
546         const int32_t   *state;
547         size_t          want = 0;
548         size_t          need = 0;
549         int             b;
550         uint8_t         buf[XFRM_BYTES];
551         int             ndir = table->info->directive_count;
552
553         assert(src);
554
555         for (pass = 0; pass <= ndir; pass++) {
556
557                 state = NULL;
558
559                 if (pass != 0) {
560                         /* insert level separator from the previous pass */
561                         if (room) {
562                                 *xf++ = XFRM_SEP;
563                                 room--;
564                         }
565                         want++;
566                 }
567
568                 /* special pass for undefined */
569                 if (pass == ndir) {
570                         direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED;
571                 } else {
572                         direc = table->info->directive[pass];
573                 }
574
575                 t = src;
576
577                 if (direc & DIRECTIVE_BACKWARD) {
578                         wchar_t *bp, *fp, c;
579                         free(tr);
580                         if ((tr = wcsdup(t)) == NULL) {
581                                 errno = ENOMEM;
582                                 goto fail;
583                         }
584                         bp = tr;
585                         fp = tr + wcslen(tr) - 1;
586                         while (bp < fp) {
587                                 c = *bp;
588                                 *bp++ = *fp;
589                                 *fp-- = c;
590                         }
591                         t = (const wchar_t *)tr;
592                 }
593
594                 if (direc & DIRECTIVE_POSITION) {
595                         while (*t || state) {
596
597                                 _collate_lookup(table, t, &len, &pri, pass, &state);
598                                 t += len;
599                                 if (pri <= 0) {
600                                         if (pri < 0) {
601                                                 errno = EINVAL;
602                                                 goto fail;
603                                         }
604                                         state = NULL;
605                                         pri = COLLATE_MAX_PRIORITY;
606                                 }
607
608                                 b = xfrm(table, buf, pri, pass);
609                                 want += b;
610                                 if (room) {
611                                         while (b) {
612                                                 b--;
613                                                 if (room) {
614                                                         *xf++ = buf[b];
615                                                         room--;
616                                                 }
617                                         }
618                                 }
619                                 need = want;
620                         }
621                 } else {
622                         while (*t || state) {
623                                 _collate_lookup(table, t, &len, &pri, pass, &state);
624                                 t += len;
625                                 if (pri <= 0) {
626                                         if (pri < 0) {
627                                                 errno = EINVAL;
628                                                 goto fail;
629                                         }
630                                         state = NULL;
631                                         continue;
632                                 }
633
634                                 b = xfrm(table, buf, pri, pass);
635                                 want += b;
636                                 if (room) {
637
638                                         while (b) {
639                                                 b--;
640                                                 if (room) {
641                                                         *xf++ = buf[b];
642                                                         room--;
643                                                 }
644                                         }
645                                 }
646                                 need = want;
647                         }
648                 }
649         }
650         free(tr);
651         return (need);
652
653 fail:
654         free(tr);
655         return ((size_t)(-1));
656 }
657
658 /*
659  * __collate_equiv_value returns the primary collation value for the given
660  * collating symbol specified by str and len.  Zero or negative is returned
661  * if the collating symbol was not found.  This function is used by bracket
662  * code in the TRE regex library.
663  */
664 int
665 __collate_equiv_value(locale_t locale, const wchar_t *str, size_t len)
666 {
667         int32_t e;
668
669         if (len < 1 || len >= COLLATE_STR_LEN)
670                 return (-1);
671
672         FIX_LOCALE(locale);
673         struct xlocale_collate *table =
674                 (struct xlocale_collate*)locale->components[XLC_COLLATE];
675
676         if (table->__collate_load_error)
677                 return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1);
678
679         if (len == 1) {
680                 e = -1;
681                 if (*str <= UCHAR_MAX)
682                         e = table->char_pri_table[*str].pri[0];
683                 else if (table->info->large_count > 0) {
684                         collate_large_t *match_large;
685                         match_large = largesearch(table, *str);
686                         if (match_large)
687                                 e = match_large->pri.pri[0];
688                 }
689                 if (e == 0)
690                         return (1);
691                 return (e > 0 ? e : 0);
692         }
693         if (table->info->chain_count > 0) {
694                 wchar_t name[COLLATE_STR_LEN];
695                 collate_chain_t *match_chain;
696                 int clen;
697
698                 wcsncpy (name, str, len);
699                 name[len] = 0;
700                 match_chain = chainsearch(table, name, &clen);
701                 if (match_chain) {
702                         e = match_chain->pri[0];
703                         if (e == 0)
704                                 return (1);
705                         return (e < 0 ? -e : e);
706                 }
707         }
708         return (0);
709 }