contrib/file/ascmagic.c

   1 /*
   2  * Copyright (c) Ian F. Darwin 1986-1995.
   3  * Software written by Ian F. Darwin and others;
   4  * maintained 1995-present by Christos Zoulas and others.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice immediately at the beginning of the file, without modification,
  11  *    this list of conditions, and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28 /*
  29  * ASCII magic -- file types that we know based on keywords
  30  * that can appear anywhere in the file.
  31  *
  32  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
  33  * to handle character codes other than ASCII on a unified basis.
  34  *
  35  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
  36  * international characters, now subsumed into this file.
  37  */
  38
  39 #include "file.h"
  40 #include "magic.h"
  41 #include <stdio.h>
  42 #include <string.h>
  43 #include <memory.h>
  44 #include <ctype.h>
  45 #include <stdlib.h>
  46 #ifdef HAVE_UNISTD_H
  47 #include <unistd.h>
  48 #endif
  49 #include "names.h"
  50
  51 #ifndef lint
  52 FILE_RCSID("@(#)$File: ascmagic.c,v 1.64 2008/07/16 18:00:57 christos Exp $")
  53 #endif  /* lint */
  54
  55 #define MAXLINELEN 300  /* longest sane line length */
  56 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
  57                   || (x) == 0x85 || (x) == '\f')
  58
  59 private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
  60 private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
  61     size_t *);
  62 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
  63 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
  64 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
  65 private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
  66 private int ascmatch(const unsigned char *, const unichar *, size_t);
  67 private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
  68
  69
  70 protected int
  71 file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
  72 {
  73         size_t i;
  74         unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end;
  75         unichar *ubuf = NULL;
  76         size_t ulen, mlen;
  77         const struct names *p;
  78         int rv = -1;
  79         int mime = ms->flags & MAGIC_MIME;
  80
  81         const char *code = NULL;
  82         const char *code_mime = NULL;
  83         const char *type = NULL;
  84         const char *subtype = NULL;
  85         const char *subtype_mime = NULL;
  86
  87         int has_escapes = 0;
  88         int has_backspace = 0;
  89         int seen_cr = 0;
  90
  91         int n_crlf = 0;
  92         int n_lf = 0;
  93         int n_cr = 0;
  94         int n_nel = 0;
  95
  96         size_t last_line_end = (size_t)-1;
  97         int has_long_lines = 0;
  98
  99         /*
 100          * Undo the NUL-termination kindly provided by process()
 101          * but leave at least one byte to look at
 102          */
 103         while (nbytes > 1 && buf[nbytes - 1] == '\0')
 104                 nbytes--;
 105
 106         if ((nbuf = CAST(unsigned char *, calloc((size_t)1,
 107             (nbytes + 1) * sizeof(nbuf[0])))) == NULL)
 108                 goto done;
 109         if ((ubuf = CAST(unichar *, calloc((size_t)1,
 110             (nbytes + 1) * sizeof(ubuf[0])))) == NULL)
 111                 goto done;
 112
 113         /*
 114          * Then try to determine whether it's any character code we can
 115          * identify.  Each of these tests, if it succeeds, will leave
 116          * the text converted into one-unichar-per-character Unicode in
 117          * ubuf, and the number of characters converted in ulen.
 118          */
 119         if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
 120                 code = "ASCII";
 121                 code_mime = "us-ascii";
 122                 type = "text";
 123         } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) {
 124                 code = "UTF-8 Unicode (with BOM)";
 125                 code_mime = "utf-8";
 126                 type = "text";
 127         } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) {
 128                 code = "UTF-8 Unicode";
 129                 code_mime = "utf-8";
 130                 type = "text";
 131         } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) {
 132                 if (i == 1)
 133                         code = "Little-endian UTF-16 Unicode";
 134                 else
 135                         code = "Big-endian UTF-16 Unicode";
 136
 137                 type = "character data";
 138                 code_mime = "utf-16";    /* is this defined? */
 139         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
 140                 code = "ISO-8859";
 141                 type = "text";
 142                 code_mime = "iso-8859-1";
 143         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
 144                 code = "Non-ISO extended-ASCII";
 145                 type = "text";
 146                 code_mime = "unknown";
 147         } else {
 148                 from_ebcdic(buf, nbytes, nbuf);
 149
 150                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
 151                         code = "EBCDIC";
 152                         type = "character data";
 153                         code_mime = "ebcdic";
 154                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
 155                         code = "International EBCDIC";
 156                         type = "character data";
 157                         code_mime = "ebcdic";
 158                 } else {
 159                         rv = 0;
 160                         goto done;  /* doesn't look like text at all */
 161                 }
 162         }
 163
 164         if (nbytes <= 1) {
 165                 rv = 0;
 166                 goto done;
 167         }
 168
 169         /* Convert ubuf to UTF-8 and try text soft magic */
 170         /* If original was ASCII or UTF-8, could use nbuf instead of
 171            re-converting. */
 172         /* malloc size is a conservative overestimate; could be
 173            re-converting improved, or at least realloced after
 174            re-converting conversion. */
 175         mlen = ulen * 6;
 176         if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
 177                 file_oomem(ms, mlen);
 178                 goto done;
 179         }
 180         if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
 181                 goto done;
 182         if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) {
 183                 rv = 1;
 184                 goto done;
 185         }
 186
 187         /* look for tokens from names.h - this is expensive! */
 188         if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
 189                 goto subtype_identified;
 190
 191         i = 0;
 192         while (i < ulen) {
 193                 size_t end;
 194
 195                 /* skip past any leading space */
 196                 while (i < ulen && ISSPC(ubuf[i]))
 197                         i++;
 198                 if (i >= ulen)
 199                         break;
 200
 201                 /* find the next whitespace */
 202                 for (end = i + 1; end < nbytes; end++)
 203                         if (ISSPC(ubuf[end]))
 204                                 break;
 205
 206                 /* compare the word thus isolated against the token list */
 207                 for (p = names; p < names + NNAMES; p++) {
 208                         if (ascmatch((const unsigned char *)p->name, ubuf + i,
 209                             end - i)) {
 210                                 subtype = types[p->type].human;
 211                                 subtype_mime = types[p->type].mime;
 212                                 goto subtype_identified;
 213                         }
 214                 }
 215
 216                 i = end;
 217         }
 218
 219 subtype_identified:
 220
 221         /* Now try to discover other details about the file. */
 222         for (i = 0; i < ulen; i++) {
 223                 if (ubuf[i] == '\n') {
 224                         if (seen_cr)
 225                                 n_crlf++;
 226                         else
 227                                 n_lf++;
 228                         last_line_end = i;
 229                 } else if (seen_cr)
 230                         n_cr++;
 231
 232                 seen_cr = (ubuf[i] == '\r');
 233                 if (seen_cr)
 234                         last_line_end = i;
 235
 236                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
 237                         n_nel++;
 238                         last_line_end = i;
 239                 }
 240
 241                 /* If this line is _longer_ than MAXLINELEN, remember it. */
 242                 if (i > last_line_end + MAXLINELEN)
 243                         has_long_lines = 1;
 244
 245                 if (ubuf[i] == '\033')
 246                         has_escapes = 1;
 247                 if (ubuf[i] == '\b')
 248                         has_backspace = 1;
 249         }
 250
 251         /* Beware, if the data has been truncated, the final CR could have
 252            been followed by a LF.  If we have HOWMANY bytes, it indicates
 253            that the data might have been truncated, probably even before
 254            this function was called. */
 255         if (seen_cr && nbytes < HOWMANY)
 256                 n_cr++;
 257
 258         if (mime) {
 259                 if (mime & MAGIC_MIME_TYPE) {
 260                         if (subtype_mime) {
 261                                 if (file_printf(ms, subtype_mime) == -1)
 262                                         goto done;
 263                         } else {
 264                                 if (file_printf(ms, "text/plain") == -1)
 265                                         goto done;
 266                         }
 267                 }
 268
 269                 if ((mime == 0 || mime == MAGIC_MIME) && code_mime) {
 270                         if ((mime & MAGIC_MIME_TYPE) &&
 271                             file_printf(ms, " charset=") == -1)
 272                                 goto done;
 273                         if (file_printf(ms, code_mime) == -1)
 274                                 goto done;
 275                 }
 276
 277                 if (mime == MAGIC_MIME_ENCODING)
 278                         file_printf(ms, "binary");
 279         } else {
 280                 if (file_printf(ms, code) == -1)
 281                         goto done;
 282
 283                 if (subtype) {
 284                         if (file_printf(ms, " ") == -1)
 285                                 goto done;
 286                         if (file_printf(ms, subtype) == -1)
 287                                 goto done;
 288                 }
 289
 290                 if (file_printf(ms, " ") == -1)
 291                         goto done;
 292                 if (file_printf(ms, type) == -1)
 293                         goto done;
 294
 295                 if (has_long_lines)
 296                         if (file_printf(ms, ", with very long lines") == -1)
 297                                 goto done;
 298
 299                 /*
 300                  * Only report line terminators if we find one other than LF,
 301                  * or if we find none at all.
 302                  */
 303                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
 304                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
 305                         if (file_printf(ms, ", with") == -1)
 306                                 goto done;
 307
 308                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)                        {
 309                                 if (file_printf(ms, " no") == -1)
 310                                         goto done;
 311                         } else {
 312                                 if (n_crlf) {
 313                                         if (file_printf(ms, " CRLF") == -1)
 314                                                 goto done;
 315                                         if (n_cr || n_lf || n_nel)
 316                                                 if (file_printf(ms, ",") == -1)
 317                                                         goto done;
 318                                 }
 319                                 if (n_cr) {
 320                                         if (file_printf(ms, " CR") == -1)
 321                                                 goto done;
 322                                         if (n_lf || n_nel)
 323                                                 if (file_printf(ms, ",") == -1)
 324                                                         goto done;
 325                                 }
 326                                 if (n_lf) {
 327                                         if (file_printf(ms, " LF") == -1)
 328                                                 goto done;
 329                                         if (n_nel)
 330                                                 if (file_printf(ms, ",") == -1)
 331                                                         goto done;
 332                                 }
 333                                 if (n_nel)
 334                                         if (file_printf(ms, " NEL") == -1)
 335                                                 goto done;
 336                         }
 337
 338                         if (file_printf(ms, " line terminators") == -1)
 339                                 goto done;
 340                 }
 341
 342                 if (has_escapes)
 343                         if (file_printf(ms, ", with escape sequences") == -1)
 344                                 goto done;
 345                 if (has_backspace)
 346                         if (file_printf(ms, ", with overstriking") == -1)
 347                                 goto done;
 348         }
 349         rv = 1;
 350 done:
 351         if (nbuf)
 352                 free(nbuf);
 353         if (ubuf)
 354                 free(ubuf);
 355         if (utf8_buf)
 356                 free(utf8_buf);
 357
 358         return rv;
 359 }
 360
 361 private int
 362 ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
 363 {
 364         size_t i;
 365
 366         for (i = 0; i < ulen; i++) {
 367                 if (s[i] != us[i])
 368                         return 0;
 369         }
 370
 371         if (s[i])
 372                 return 0;
 373         else
 374                 return 1;
 375 }
 376
 377 /*
 378  * This table reflects a particular philosophy about what constitutes
 379  * "text," and there is room for disagreement about it.
 380  *
 381  * Version 3.31 of the file command considered a file to be ASCII if
 382  * each of its characters was approved by either the isascii() or
 383  * isalpha() function.  On most systems, this would mean that any
 384  * file consisting only of characters in the range 0x00 ... 0x7F
 385  * would be called ASCII text, but many systems might reasonably
 386  * consider some characters outside this range to be alphabetic,
 387  * so the file command would call such characters ASCII.  It might
 388  * have been more accurate to call this "considered textual on the
 389  * local system" than "ASCII."
 390  *
 391  * It considered a file to be "International language text" if each
 392  * of its characters was either an ASCII printing character (according
 393  * to the real ASCII standard, not the above test), a character in
 394  * the range 0x80 ... 0xFF, or one of the following control characters:
 395  * backspace, tab, line feed, vertical tab, form feed, carriage return,
 396  * escape.  No attempt was made to determine the language in which files
 397  * of this type were written.
 398  *
 399  *
 400  * The table below considers a file to be ASCII if all of its characters
 401  * are either ASCII printing characters (again, according to the X3.4
 402  * standard, not isascii()) or any of the following controls: bell,
 403  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 404  *
 405  * I include bell because some programs (particularly shell scripts)
 406  * use it literally, even though it is rare in normal text.  I exclude
 407  * vertical tab because it never seems to be used in real text.  I also
 408  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 409  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 410  * character to.  It might be more appropriate to include it in the 8859
 411  * set instead of the ASCII set, but it's got to be included in *something*
 412  * we recognize or EBCDIC files aren't going to be considered textual.
 413  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 414  * and Latin characters, so these should possibly be allowed.  But they
 415  * make a real mess on VT100-style displays if they're not paired properly,
 416  * so we are probably better off not calling them text.
 417  *
 418  * A file is considered to be ISO-8859 text if its characters are all
 419  * either ASCII, according to the above definition, or printing characters
 420  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 421  *
 422  * Finally, a file is considered to be international text from some other
 423  * character code if its characters are all either ISO-8859 (according to
 424  * the above definition) or characters in the range 0x80 ... 0x9F, which
 425  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 426  * consider to be printing characters.
 427  */
 428
 429 #define F 0   /* character never appears in text */
 430 #define T 1   /* character appears in plain ASCII text */
 431 #define I 2   /* character appears in ISO-8859 text */
 432 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
 433
 434 private char text_chars[256] = {
 435         /*                  BEL BS HT LF    FF CR    */
 436         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
 437         /*                              ESC          */
 438         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
 439         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
 440         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
 441         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
 442         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
 443         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
 444         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
 445         /*            NEL                            */
 446         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
 447         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
 448         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
 449         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
 450         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
 451         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
 452         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
 453         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
 454 };
 455
 456 private int
 457 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 458     size_t *ulen)
 459 {
 460         size_t i;
 461
 462         *ulen = 0;
 463
 464         for (i = 0; i < nbytes; i++) {
 465                 int t = text_chars[buf[i]];
 466
 467                 if (t != T)
 468                         return 0;
 469
 470                 ubuf[(*ulen)++] = buf[i];
 471         }
 472
 473         return 1;
 474 }
 475
 476 private int
 477 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
 478 {
 479         size_t i;
 480
 481         *ulen = 0;
 482
 483         for (i = 0; i < nbytes; i++) {
 484                 int t = text_chars[buf[i]];
 485
 486                 if (t != T && t != I)
 487                         return 0;
 488
 489                 ubuf[(*ulen)++] = buf[i];
 490         }
 491
 492         return 1;
 493 }
 494
 495 private int
 496 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 497     size_t *ulen)
 498 {
 499         size_t i;
 500
 501         *ulen = 0;
 502
 503         for (i = 0; i < nbytes; i++) {
 504                 int t = text_chars[buf[i]];
 505
 506                 if (t != T && t != I && t != X)
 507                         return 0;
 508
 509                 ubuf[(*ulen)++] = buf[i];
 510         }
 511
 512         return 1;
 513 }
 514
 515 /*
 516  * Encode Unicode string as UTF-8, returning pointer to character
 517  * after end of string, or NULL if an invalid character is found.
 518  */
 519 private unsigned char *
 520 encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
 521 {
 522         size_t i;
 523         unsigned char *end = buf + len;
 524
 525         for (i = 0; i < ulen; i++) {
 526                 if (ubuf[i] <= 0x7f) {
 527                         if (end - buf < 1)
 528                                 return NULL;
 529                         *buf++ = (unsigned char)ubuf[i];
 530                 } else if (ubuf[i] <= 0x7ff) {
 531                         if (end - buf < 2)
 532                                 return NULL;
 533                         *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0);
 534                         *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
 535                 } else if (ubuf[i] <= 0xffff) {
 536                         if (end - buf < 3)
 537                                 return NULL;
 538                         *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0);
 539                         *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
 540                         *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
 541                 } else if (ubuf[i] <= 0x1fffff) {
 542                         if (end - buf < 4)
 543                                 return NULL;
 544                         *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0);
 545                         *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
 546                         *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
 547                         *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
 548                 } else if (ubuf[i] <= 0x3ffffff) {
 549                         if (end - buf < 5)
 550                                 return NULL;
 551                         *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8);
 552                         *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
 553                         *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
 554                         *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
 555                         *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
 556                 } else if (ubuf[i] <= 0x7fffffff) {
 557                         if (end - buf < 6)
 558                                 return NULL;
 559                         *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc);
 560                         *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80);
 561                         *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
 562                         *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
 563                         *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
 564                         *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
 565                 } else /* Invalid character */
 566                         return NULL;
 567         }
 568
 569         return buf;
 570 }
 571
 572 /*
 573  * Decide whether some text looks like UTF-8. Returns:
 574  *
 575  *     -1: invalid UTF-8
 576  *      0: uses odd control characters, so doesn't look like text
 577  *      1: 7-bit text
 578  *      2: definitely UTF-8 text (valid high-bit set bytes)
 579  *
 580  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
 581  * ubuf must be big enough!
 582  */
 583 protected int
 584 file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
 585 {
 586         size_t i;
 587         int n;
 588         unichar c;
 589         int gotone = 0, ctrl = 0;
 590
 591         if (ubuf)
 592                 *ulen = 0;
 593
 594         for (i = 0; i < nbytes; i++) {
 595                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
 596                         /*
 597                          * Even if the whole file is valid UTF-8 sequences,
 598                          * still reject it if it uses weird control characters.
 599                          */
 600
 601                         if (text_chars[buf[i]] != T)
 602                                 ctrl = 1;
 603
 604                         if (ubuf)
 605                                 ubuf[(*ulen)++] = buf[i];
 606                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
 607                         return -1;
 608                 } else {                           /* 11xxxxxx begins UTF-8 */
 609                         int following;
 610
 611                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
 612                                 c = buf[i] & 0x1f;
 613                                 following = 1;
 614                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
 615                                 c = buf[i] & 0x0f;
 616                                 following = 2;
 617                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
 618                                 c = buf[i] & 0x07;
 619                                 following = 3;
 620                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
 621                                 c = buf[i] & 0x03;
 622                                 following = 4;
 623                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
 624                                 c = buf[i] & 0x01;
 625                                 following = 5;
 626                         } else
 627                                 return -1;
 628
 629                         for (n = 0; n < following; n++) {
 630                                 i++;
 631                                 if (i >= nbytes)
 632                                         goto done;
 633
 634                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
 635                                         return -1;
 636
 637                                 c = (c << 6) + (buf[i] & 0x3f);
 638                         }
 639
 640                         if (ubuf)
 641                                 ubuf[(*ulen)++] = c;
 642                         gotone = 1;
 643                 }
 644         }
 645 done:
 646         return ctrl ? 0 : (gotone ? 2 : 1);
 647 }
 648
 649 /*
 650  * Decide whether some text looks like UTF-8 with BOM. If there is no
 651  * BOM, return -1; otherwise return the result of looks_utf8 on the
 652  * rest of the text.
 653  */
 654 private int
 655 looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 656     size_t *ulen)
 657 {
 658         if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
 659                 return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
 660         else
 661                 return -1;
 662 }
 663
 664 private int
 665 looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 666     size_t *ulen)
 667 {
 668         int bigend;
 669         size_t i;
 670
 671         if (nbytes < 2)
 672                 return 0;
 673
 674         if (buf[0] == 0xff && buf[1] == 0xfe)
 675                 bigend = 0;
 676         else if (buf[0] == 0xfe && buf[1] == 0xff)
 677                 bigend = 1;
 678         else
 679                 return 0;
 680
 681         *ulen = 0;
 682
 683         for (i = 2; i + 1 < nbytes; i += 2) {
 684                 /* XXX fix to properly handle chars > 65536 */
 685
 686                 if (bigend)
 687                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
 688                 else
 689                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
 690
 691                 if (ubuf[*ulen - 1] == 0xfffe)
 692                         return 0;
 693                 if (ubuf[*ulen - 1] < 128 &&
 694                     text_chars[(size_t)ubuf[*ulen - 1]] != T)
 695                         return 0;
 696         }
 697
 698         return 1 + bigend;
 699 }
 700
 701 #undef F
 702 #undef T
 703 #undef I
 704 #undef X
 705
 706 /*
 707  * This table maps each EBCDIC character to an (8-bit extended) ASCII
 708  * character, as specified in the rationale for the dd(1) command in
 709  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 710  *
 711  * Unfortunately it does not seem to correspond exactly to any of the
 712  * five variants of EBCDIC documented in IBM's _Enterprise Systems
 713  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 714  * Edition, July, 1999, pp. I-1 - I-4.
 715  *
 716  * Fortunately, though, all versions of EBCDIC, including this one, agree
 717  * on most of the printing characters that also appear in (7-bit) ASCII.
 718  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 719  *
 720  * Fortunately too, there is general agreement that codes 0x00 through
 721  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 722  * remainder printing characters.
 723  *
 724  * This is sufficient to allow us to identify EBCDIC text and to distinguish
 725  * between old-style and internationalized examples of text.
 726  */
 727
 728 private unsigned char ebcdic_to_ascii[] = {
 729   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 730  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
 731 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
 732 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
 733 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
 734 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
 735 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
 736 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
 737 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
 738 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
 739 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
 740 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
 741 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
 742 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
 743 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
 744 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
 745 };
 746
 747 #ifdef notdef
 748 /*
 749  * The following EBCDIC-to-ASCII table may relate more closely to reality,
 750  * or at least to modern reality.  It comes from
 751  *
 752  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 753  *
 754  * and maps the characters of EBCDIC code page 1047 (the code used for
 755  * Unix-derived software on IBM's 390 systems) to the corresponding
 756  * characters from ISO 8859-1.
 757  *
 758  * If this table is used instead of the above one, some of the special
 759  * cases for the NEL character can be taken out of the code.
 760  */
 761
 762 private unsigned char ebcdic_1047_to_8859[] = {
 763 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
 764 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
 765 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
 766 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
 767 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
 768 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
 769 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
 770 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
 771 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
 772 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
 773 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
 774 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
 775 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
 776 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
 777 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
 778 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
 779 };
 780 #endif
 781
 782 /*
 783  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
 784  */
 785 private void
 786 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
 787 {
 788         size_t i;
 789
 790         for (i = 0; i < nbytes; i++) {
 791                 out[i] = ebcdic_to_ascii[buf[i]];
 792         }
 793 }