contrib/file/ascmagic.c

   1 /*
   2  * Copyright (c) Ian F. Darwin 1986-1995.
   3  * Software written by Ian F. Darwin and others;
   4  * maintained 1995-present by Christos Zoulas and others.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice immediately at the beginning of the file, without modification,
  11  *    this list of conditions, and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  *
  16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28 /*
  29  * ASCII magic -- file types that we know based on keywords
  30  * that can appear anywhere in the file.
  31  *
  32  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
  33  * to handle character codes other than ASCII on a unified basis.
  34  *
  35  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
  36  * international characters, now subsumed into this file.
  37  */
  38
  39 #include "file.h"
  40 #include "magic.h"
  41 #include <stdio.h>
  42 #include <string.h>
  43 #include <memory.h>
  44 #include <ctype.h>
  45 #include <stdlib.h>
  46 #ifdef HAVE_UNISTD_H
  47 #include <unistd.h>
  48 #endif
  49 #include "names.h"
  50
  51 #ifndef lint
  52 FILE_RCSID("@(#)$Id: ascmagic.c,v 1.41 2004/09/11 19:15:57 christos Exp $")
  53 #endif  /* lint */
  54
/* One decoded character per element; the looks_*() tests fill arrays of these. */
typedef unsigned long unichar;

#define MAXLINELEN 300  /* longest sane line length */
/*
 * True for the characters treated as whitespace when splitting text into
 * words: space, tab, CR, LF, form feed, and 0x85 (the "next line" control).
 * The argument is evaluated several times, so it must have no side effects.
 */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
                  || (x) == 0x85 || (x) == '\f')

/*
 * Character-set probes: arguments are (input bytes, byte count,
 * output unichar buffer, output count of unichars stored).
 */
private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);
private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
/* Translate (input bytes, byte count) from EBCDIC into the output buffer. */
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
/* Compare a NUL-terminated name against (unichar word, word length). */
private int ascmatch(const unsigned char *, const unichar *, size_t);
  68
  69
  70 protected int
  71 file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
  72 {
  73         size_t i;
  74         unsigned char nbuf[HOWMANY+1];  /* one extra for terminating '\0' */
  75         unichar ubuf[HOWMANY+1];        /* one extra for terminating '\0' */
  76         size_t ulen;
  77         struct names *p;
  78
  79         const char *code = NULL;
  80         const char *code_mime = NULL;
  81         const char *type = NULL;
  82         const char *subtype = NULL;
  83         const char *subtype_mime = NULL;
  84
  85         int has_escapes = 0;
  86         int has_backspace = 0;
  87
  88         int n_crlf = 0;
  89         int n_lf = 0;
  90         int n_cr = 0;
  91         int n_nel = 0;
  92
  93         int last_line_end = -1;
  94         int has_long_lines = 0;
  95
  96         /*
  97          * Undo the NUL-termination kindly provided by process()
  98          * but leave at least one byte to look at
  99          */
 100
 101         while (nbytes > 1 && buf[nbytes - 1] == '\0')
 102                 nbytes--;
 103
 104         /* nbuf and ubuf relies on this */
 105         if (nbytes > HOWMANY)
 106                 nbytes = HOWMANY;
 107
 108         /*
 109          * Then try to determine whether it's any character code we can
 110          * identify.  Each of these tests, if it succeeds, will leave
 111          * the text converted into one-unichar-per-character Unicode in
 112          * ubuf, and the number of characters converted in ulen.
 113          */
 114         if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
 115                 code = "ASCII";
 116                 code_mime = "us-ascii";
 117                 type = "text";
 118         } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
 119                 code = "UTF-8 Unicode";
 120                 code_mime = "utf-8";
 121                 type = "text";
 122         } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
 123                 if (i == 1)
 124                         code = "Little-endian UTF-16 Unicode";
 125                 else
 126                         code = "Big-endian UTF-16 Unicode";
 127
 128                 type = "character data";
 129                 code_mime = "utf-16";    /* is this defined? */
 130         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
 131                 code = "ISO-8859";
 132                 type = "text";
 133                 code_mime = "iso-8859-1";
 134         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
 135                 code = "Non-ISO extended-ASCII";
 136                 type = "text";
 137                 code_mime = "unknown";
 138         } else {
 139                 from_ebcdic(buf, nbytes, nbuf);
 140
 141                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
 142                         code = "EBCDIC";
 143                         type = "character data";
 144                         code_mime = "ebcdic";
 145                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
 146                         code = "International EBCDIC";
 147                         type = "character data";
 148                         code_mime = "ebcdic";
 149                 } else {
 150                         return 0;  /* doesn't look like text at all */
 151                 }
 152         }
 153
 154         /*
 155          * for troff, look for . + letter + letter or .\";
 156          * this must be done to disambiguate tar archives' ./file
 157          * and other trash from real troff input.
 158          *
 159          * I believe Plan 9 troff allows non-ASCII characters in the names
 160          * of macros, so this test might possibly fail on such a file.
 161          */
 162         if (*ubuf == '.') {
 163                 unichar *tp = ubuf + 1;
 164
 165                 while (ISSPC(*tp))
 166                         ++tp;   /* skip leading whitespace */
 167                 if ((tp[0] == '\\' && tp[1] == '\"') ||
 168                     (isascii((unsigned char)tp[0]) &&
 169                      isalnum((unsigned char)tp[0]) &&
 170                      isascii((unsigned char)tp[1]) &&
 171                      isalnum((unsigned char)tp[1]) &&
 172                      ISSPC(tp[2]))) {
 173                         subtype_mime = "text/troff";
 174                         subtype = "troff or preprocessor input";
 175                         goto subtype_identified;
 176                 }
 177         }
 178
 179         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
 180                 subtype_mime = "text/fortran";
 181                 subtype = "fortran program";
 182                 goto subtype_identified;
 183         }
 184
 185         /* look for tokens from names.h - this is expensive! */
 186
 187         i = 0;
 188         while (i < ulen) {
 189                 size_t end;
 190
 191                 /*
 192                  * skip past any leading space
 193                  */
 194                 while (i < ulen && ISSPC(ubuf[i]))
 195                         i++;
 196                 if (i >= ulen)
 197                         break;
 198
 199                 /*
 200                  * find the next whitespace
 201                  */
 202                 for (end = i + 1; end < nbytes; end++)
 203                         if (ISSPC(ubuf[end]))
 204                                 break;
 205
 206                 /*
 207                  * compare the word thus isolated against the token list
 208                  */
 209                 for (p = names; p < names + NNAMES; p++) {
 210                         if (ascmatch((const unsigned char *)p->name, ubuf + i,
 211                             end - i)) {
 212                                 subtype = types[p->type].human;
 213                                 subtype_mime = types[p->type].mime;
 214                                 goto subtype_identified;
 215                         }
 216                 }
 217
 218                 i = end;
 219         }
 220
 221 subtype_identified:
 222
 223         /*
 224          * Now try to discover other details about the file.
 225          */
 226         for (i = 0; i < ulen; i++) {
 227                 if (i > last_line_end + MAXLINELEN)
 228                         has_long_lines = 1;
 229
 230                 if (ubuf[i] == '\033')
 231                         has_escapes = 1;
 232                 if (ubuf[i] == '\b')
 233                         has_backspace = 1;
 234
 235                 if (ubuf[i] == '\r' && (i + 1 <  ulen && ubuf[i + 1] == '\n')) {
 236                         n_crlf++;
 237                         last_line_end = i;
 238                 }
 239                 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
 240                         n_cr++;
 241                         last_line_end = i;
 242                 }
 243                 if (ubuf[i] == '\n' && ((int)i - 1 < 0 || ubuf[i - 1] != '\r')){
 244                         n_lf++;
 245                         last_line_end = i;
 246                 }
 247                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
 248                         n_nel++;
 249                         last_line_end = i;
 250                 }
 251         }
 252
 253         if ((ms->flags & MAGIC_MIME)) {
 254                 if (subtype_mime) {
 255                         if (file_printf(ms, subtype_mime) == -1)
 256                                 return -1;
 257                 } else {
 258                         if (file_printf(ms, "text/plain") == -1)
 259                                 return -1;
 260                 }
 261
 262                 if (code_mime) {
 263                         if (file_printf(ms, "; charset=") == -1)
 264                                 return -1;
 265                         if (file_printf(ms, code_mime) == -1)
 266                                 return -1;
 267                 }
 268         } else {
 269                 if (file_printf(ms, code) == -1)
 270                         return -1;
 271
 272                 if (subtype) {
 273                         if (file_printf(ms, " ") == -1)
 274                                 return -1;
 275                         if (file_printf(ms, subtype) == -1)
 276                                 return -1;
 277                 }
 278
 279                 if (file_printf(ms, " ") == -1)
 280                         return -1;
 281                 if (file_printf(ms, type) == -1)
 282                         return -1;
 283
 284                 if (has_long_lines)
 285                         if (file_printf(ms, ", with very long lines") == -1)
 286                                 return -1;
 287
 288                 /*
 289                  * Only report line terminators if we find one other than LF,
 290                  * or if we find none at all.
 291                  */
 292                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
 293                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
 294                         if (file_printf(ms, ", with") == -1)
 295                                 return -1;
 296
 297                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)                        {
 298                                 if (file_printf(ms, " no") == -1)
 299                                         return -1;
 300                         } else {
 301                                 if (n_crlf) {
 302                                         if (file_printf(ms, " CRLF") == -1)
 303                                                 return -1;
 304                                         if (n_cr || n_lf || n_nel)
 305                                                 if (file_printf(ms, ",") == -1)
 306                                                         return -1;
 307                                 }
 308                                 if (n_cr) {
 309                                         if (file_printf(ms, " CR") == -1)
 310                                                 return -1;
 311                                         if (n_lf || n_nel)
 312                                                 if (file_printf(ms, ",") == -1)
 313                                                         return -1;
 314                                 }
 315                                 if (n_lf) {
 316                                         if (file_printf(ms, " LF") == -1)
 317                                                 return -1;
 318                                         if (n_nel)
 319                                                 if (file_printf(ms, ",") == -1)
 320                                                         return -1;
 321                                 }
 322                                 if (n_nel)
 323                                         if (file_printf(ms, " NEL") == -1)
 324                                                 return -1;
 325                         }
 326
 327                         if (file_printf(ms, " line terminators") == -1)
 328                                 return -1;
 329                 }
 330
 331                 if (has_escapes)
 332                         if (file_printf(ms, ", with escape sequences") == -1)
 333                                 return -1;
 334                 if (has_backspace)
 335                         if (file_printf(ms, ", with overstriking") == -1)
 336                                 return -1;
 337         }
 338
 339         return 1;
 340 }
 341
 342 private int
 343 ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
 344 {
 345         size_t i;
 346
 347         for (i = 0; i < ulen; i++) {
 348                 if (s[i] != us[i])
 349                         return 0;
 350         }
 351
 352         if (s[i])
 353                 return 0;
 354         else
 355                 return 1;
 356 }
 357
 358 /*
 359  * This table reflects a particular philosophy about what constitutes
 360  * "text," and there is room for disagreement about it.
 361  *
 362  * Version 3.31 of the file command considered a file to be ASCII if
 363  * each of its characters was approved by either the isascii() or
 364  * isalpha() function.  On most systems, this would mean that any
 365  * file consisting only of characters in the range 0x00 ... 0x7F
 366  * would be called ASCII text, but many systems might reasonably
 367  * consider some characters outside this range to be alphabetic,
 368  * so the file command would call such characters ASCII.  It might
 369  * have been more accurate to call this "considered textual on the
 370  * local system" than "ASCII."
 371  *
 372  * It considered a file to be "International language text" if each
 373  * of its characters was either an ASCII printing character (according
 374  * to the real ASCII standard, not the above test), a character in
 375  * the range 0x80 ... 0xFF, or one of the following control characters:
 376  * backspace, tab, line feed, vertical tab, form feed, carriage return,
 377  * escape.  No attempt was made to determine the language in which files
 378  * of this type were written.
 379  *
 380  *
 381  * The table below considers a file to be ASCII if all of its characters
 382  * are either ASCII printing characters (again, according to the X3.4
 383  * standard, not isascii()) or any of the following controls: bell,
 384  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 385  *
 386  * I include bell because some programs (particularly shell scripts)
 387  * use it literally, even though it is rare in normal text.  I exclude
 388  * vertical tab because it never seems to be used in real text.  I also
 389  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 390  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 391  * character to.  It might be more appropriate to include it in the 8859
 392  * set instead of the ASCII set, but it's got to be included in *something*
 393  * we recognize or EBCDIC files aren't going to be considered textual.
 394  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 395  * and Latin characters, so these should possibly be allowed.  But they
 396  * make a real mess on VT100-style displays if they're not paired properly,
 397  * so we are probably better off not calling them text.
 398  *
 399  * A file is considered to be ISO-8859 text if its characters are all
 400  * either ASCII, according to the above definition, or printing characters
 401  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 402  *
 403  * Finally, a file is considered to be international text from some other
 404  * character code if its characters are all either ISO-8859 (according to
 405  * the above definition) or characters in the range 0x80 ... 0x9F, which
 406  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 407  * consider to be printing characters.
 408  */
 409
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value, indexed by the byte itself.
 * Each row covers sixteen consecutive values; the trailing comment on the
 * row gives the high nibble, and the comment above a row labels the
 * control characters that are accepted as text.
 */
private char text_chars[256] = {
        /*                  BEL BS HT LF    FF CR    */
        F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
        F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
        /*            NEL                            */
        X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
        X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
 436
 437 private int
 438 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 439     size_t *ulen)
 440 {
 441         int i;
 442
 443         *ulen = 0;
 444
 445         for (i = 0; i < nbytes; i++) {
 446                 int t = text_chars[buf[i]];
 447
 448                 if (t != T)
 449                         return 0;
 450
 451                 ubuf[(*ulen)++] = buf[i];
 452         }
 453
 454         return 1;
 455 }
 456
 457 private int
 458 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
 459 {
 460         int i;
 461
 462         *ulen = 0;
 463
 464         for (i = 0; i < nbytes; i++) {
 465                 int t = text_chars[buf[i]];
 466
 467                 if (t != T && t != I)
 468                         return 0;
 469
 470                 ubuf[(*ulen)++] = buf[i];
 471         }
 472
 473         return 1;
 474 }
 475
 476 private int
 477 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 478     size_t *ulen)
 479 {
 480         int i;
 481
 482         *ulen = 0;
 483
 484         for (i = 0; i < nbytes; i++) {
 485                 int t = text_chars[buf[i]];
 486
 487                 if (t != T && t != I && t != X)
 488                         return 0;
 489
 490                 ubuf[(*ulen)++] = buf[i];
 491         }
 492
 493         return 1;
 494 }
 495
 496 private int
 497 looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
 498 {
 499         int i, n;
 500         unichar c;
 501         int gotone = 0;
 502
 503         *ulen = 0;
 504
 505         for (i = 0; i < nbytes; i++) {
 506                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
 507                         /*
 508                          * Even if the whole file is valid UTF-8 sequences,
 509                          * still reject it if it uses weird control characters.
 510                          */
 511
 512                         if (text_chars[buf[i]] != T)
 513                                 return 0;
 514
 515                         ubuf[(*ulen)++] = buf[i];
 516                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
 517                         return 0;
 518                 } else {                           /* 11xxxxxx begins UTF-8 */
 519                         int following;
 520
 521                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
 522                                 c = buf[i] & 0x1f;
 523                                 following = 1;
 524                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
 525                                 c = buf[i] & 0x0f;
 526                                 following = 2;
 527                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
 528                                 c = buf[i] & 0x07;
 529                                 following = 3;
 530                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
 531                                 c = buf[i] & 0x03;
 532                                 following = 4;
 533                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
 534                                 c = buf[i] & 0x01;
 535                                 following = 5;
 536                         } else
 537                                 return 0;
 538
 539                         for (n = 0; n < following; n++) {
 540                                 i++;
 541                                 if (i >= nbytes)
 542                                         goto done;
 543
 544                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
 545                                         return 0;
 546
 547                                 c = (c << 6) + (buf[i] & 0x3f);
 548                         }
 549
 550                         ubuf[(*ulen)++] = c;
 551                         gotone = 1;
 552                 }
 553         }
 554 done:
 555         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
 556 }
 557
 558 private int
 559 looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 560     size_t *ulen)
 561 {
 562         int bigend;
 563         int i;
 564
 565         if (nbytes < 2)
 566                 return 0;
 567
 568         if (buf[0] == 0xff && buf[1] == 0xfe)
 569                 bigend = 0;
 570         else if (buf[0] == 0xfe && buf[1] == 0xff)
 571                 bigend = 1;
 572         else
 573                 return 0;
 574
 575         *ulen = 0;
 576
 577         for (i = 2; i + 1 < nbytes; i += 2) {
 578                 /* XXX fix to properly handle chars > 65536 */
 579
 580                 if (bigend)
 581                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
 582                 else
 583                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
 584
 585                 if (ubuf[*ulen - 1] == 0xfffe)
 586                         return 0;
 587                 if (ubuf[*ulen - 1] < 128 &&
 588                     text_chars[(size_t)ubuf[*ulen - 1]] != T)
 589                         return 0;
 590         }
 591
 592         return 1 + bigend;
 593 }
 594
/*
 * The single-letter class codes are only needed by text_chars and the
 * looks_*() tests above; remove them so they cannot clash with later code.
 */
#undef F
#undef T
#undef I
#undef X
 599
 600 /*
 601  * This table maps each EBCDIC character to an (8-bit extended) ASCII
 602  * character, as specified in the rationale for the dd(1) command in
 603  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 604  *
 605  * Unfortunately it does not seem to correspond exactly to any of the
 606  * five variants of EBCDIC documented in IBM's _Enterprise Systems
 607  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 608  * Edition, July, 1999, pp. I-1 - I-4.
 609  *
 610  * Fortunately, though, all versions of EBCDIC, including this one, agree
 611  * on most of the printing characters that also appear in (7-bit) ASCII.
 612  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 613  *
 614  * Fortunately too, there is general agreement that codes 0x00 through
 615  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 616  * remainder printing characters.
 617  *
 618  * This is sufficient to allow us to identify EBCDIC text and to distinguish
 619  * between old-style and internationalized examples of text.
 620  */
 621
/* Indexed by EBCDIC byte value; sixteen entries per row. */
private unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
 640
#ifdef notdef
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 *
 * The table is only compiled when "notdef" is defined, and nothing in
 * this file refers to it.
 */

/* Indexed by EBCDIC byte value; sixteen entries per row. */
private unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif /* notdef */
 675
 676 /*
 677  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
 678  */
 679 private void
 680 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
 681 {
 682         int i;
 683
 684         for (i = 0; i < nbytes; i++) {
 685                 out[i] = ebcdic_to_ascii[buf[i]];
 686         }
 687 }