contrib/file/ascmagic.c

   1 /*
   2  * ASCII magic -- file types that we know based on keywords
   3  * that can appear anywhere in the file.
   4  *
   5  * Copyright (c) Ian F. Darwin, 1987.
   6  * Written by Ian F. Darwin.
   7  *
   8  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
   9  * to handle character codes other than ASCII on a unified basis.
  10  *
  11  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
  12  * international characters, now subsumed into this file.
  13  */
  14
  15 /*
  16  * This software is not subject to any license of the American Telephone
  17  * and Telegraph Company or of the Regents of the University of California.
  18  *
  19  * Permission is granted to anyone to use this software for any purpose on
  20  * any computer system, and to alter it and redistribute it freely, subject
  21  * to the following restrictions:
  22  *
  23  * 1. The author is not responsible for the consequences of use of this
  24  *    software, no matter how awful, even if they arise from flaws in it.
  25  *
  26  * 2. The origin of this software must not be misrepresented, either by
  27  *    explicit claim or by omission.  Since few users ever read sources,
  28  *    credits must appear in the documentation.
  29  *
  30  * 3. Altered versions must be plainly marked as such, and must not be
  31  *    misrepresented as being the original software.  Since few users
  32  *    ever read sources, credits must appear in the documentation.
  33  *
  34  * 4. This notice may not be removed or altered.
  35  */
  36
  37 #include "file.h"
  38 #include <string.h>
  39 #include <memory.h>
  40 #include <ctype.h>
  41 #include <stdlib.h>
  42 #ifdef HAVE_UNISTD_H
  43 #include <unistd.h>
  44 #endif
  45 #include "names.h"
  46
  47 #ifndef lint
  48 FILE_RCSID("@(#)$Id: ascmagic.c,v 1.32 2002/07/03 18:26:37 christos Exp $")
  49 #endif  /* lint */
  50
  51 typedef unsigned long unichar;
  52
  53 #define MAXLINELEN 300  /* longest sane line length */
  54 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
  55                   || (x) == 0x85 || (x) == '\f')
  56
  57 static int looks_ascii(const unsigned char *, int, unichar *, int *);
  58 static int looks_utf8(const unsigned char *, int, unichar *, int *);
  59 static int looks_unicode(const unsigned char *, int, unichar *, int *);
  60 static int looks_latin1(const unsigned char *, int, unichar *, int *);
  61 static int looks_extended(const unsigned char *, int, unichar *, int *);
  62 static void from_ebcdic(const unsigned char *, int, unsigned char *);
  63 static int ascmatch(const unsigned char *, const unichar *, int);
  64
  65 /* int nbytes: size actually read */
  66 int
  67 ascmagic(unsigned char *buf, int nbytes)
  68 {
  69         int i;
  70         char nbuf[HOWMANY+1];           /* one extra for terminating '\0' */
  71         unichar ubuf[HOWMANY+1];        /* one extra for terminating '\0' */
  72         int ulen;
  73         struct names *p;
  74
  75         char *code = NULL;
  76         char *code_mime = NULL;
  77         char *type = NULL;
  78         char *subtype = NULL;
  79         char *subtype_mime = NULL;
  80
  81         int has_escapes = 0;
  82         int has_backspace = 0;
  83
  84         int n_crlf = 0;
  85         int n_lf = 0;
  86         int n_cr = 0;
  87         int n_nel = 0;
  88
  89         int last_line_end = -1;
  90         int has_long_lines = 0;
  91
  92         /*
  93          * Do the tar test first, because if the first file in the tar
  94          * archive starts with a dot, we can confuse it with an nroff file.
  95          */
  96         switch (is_tar(buf, nbytes)) {
  97         case 1:
  98                 ckfputs(iflag ? "application/x-tar" : "tar archive", stdout);
  99                 return 1;
 100         case 2:
 101                 ckfputs(iflag ? "application/x-tar, POSIX"
 102                                 : "POSIX tar archive", stdout);
 103                 return 1;
 104         }
 105
 106         /*
 107          * Undo the NUL-termination kindly provided by process()
 108          * but leave at least one byte to look at
 109          */
 110
 111         while (nbytes > 1 && buf[nbytes - 1] == '\0')
 112                 nbytes--;
 113
 114         /*
 115          * Then try to determine whether it's any character code we can
 116          * identify.  Each of these tests, if it succeeds, will leave
 117          * the text converted into one-unichar-per-character Unicode in
 118          * ubuf, and the number of characters converted in ulen.
 119          */
 120         if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
 121                 code = "ASCII";
 122                 code_mime = "us-ascii";
 123                 type = "text";
 124         } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
 125                 code = "UTF-8 Unicode";
 126                 code_mime = "utf-8";
 127                 type = "text";
 128         } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen))) {
 129                 if (i == 1)
 130                         code = "Little-endian UTF-16 Unicode";
 131                 else
 132                         code = "Big-endian UTF-16 Unicode";
 133
 134                 type = "character data";
 135                 code_mime = "utf-16";    /* is this defined? */
 136         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
 137                 code = "ISO-8859";
 138                 type = "text";
 139                 code_mime = "iso-8859-1";
 140         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
 141                 code = "Non-ISO extended-ASCII";
 142                 type = "text";
 143                 code_mime = "unknown";
 144         } else {
 145                 from_ebcdic(buf, nbytes, nbuf);
 146
 147                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
 148                         code = "EBCDIC";
 149                         type = "character data";
 150                         code_mime = "ebcdic";
 151                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
 152                         code = "International EBCDIC";
 153                         type = "character data";
 154                         code_mime = "ebcdic";
 155                 } else {
 156                         return 0;  /* doesn't look like text at all */
 157                 }
 158         }
 159
 160         /*
 161          * for troff, look for . + letter + letter or .\";
 162          * this must be done to disambiguate tar archives' ./file
 163          * and other trash from real troff input.
 164          *
 165          * I believe Plan 9 troff allows non-ASCII characters in the names
 166          * of macros, so this test might possibly fail on such a file.
 167          */
 168         if (*ubuf == '.') {
 169                 unichar *tp = ubuf + 1;
 170
 171                 while (ISSPC(*tp))
 172                         ++tp;   /* skip leading whitespace */
 173                 if ((tp[0] == '\\' && tp[1] == '\"') ||
 174                     (isascii(tp[0]) && isalnum(tp[0]) &&
 175                      isascii(tp[1]) && isalnum(tp[1]) &&
 176                      ISSPC(tp[2]))) {
 177                         subtype_mime = "text/troff";
 178                         subtype = "troff or preprocessor input";
 179                         goto subtype_identified;
 180                 }
 181         }
 182
 183         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
 184                 subtype_mime = "text/fortran";
 185                 subtype = "fortran program";
 186                 goto subtype_identified;
 187         }
 188
 189         /* look for tokens from names.h - this is expensive! */
 190
 191         i = 0;
 192         while (i < ulen) {
 193                 int end;
 194
 195                 /*
 196                  * skip past any leading space
 197                  */
 198                 while (i < ulen && ISSPC(ubuf[i]))
 199                         i++;
 200                 if (i >= ulen)
 201                         break;
 202
 203                 /*
 204                  * find the next whitespace
 205                  */
 206                 for (end = i + 1; end < nbytes; end++)
 207                         if (ISSPC(ubuf[end]))
 208                                 break;
 209
 210                 /*
 211                  * compare the word thus isolated against the token list
 212                  */
 213                 for (p = names; p < names + NNAMES; p++) {
 214                         if (ascmatch(p->name, ubuf + i, end - i)) {
 215                                 subtype = types[p->type].human;
 216                                 subtype_mime = types[p->type].mime;
 217                                 goto subtype_identified;
 218                         }
 219                 }
 220
 221                 i = end;
 222         }
 223
 224 subtype_identified:
 225
 226         /*
 227          * Now try to discover other details about the file.
 228          */
 229         for (i = 0; i < ulen; i++) {
 230                 if (i > last_line_end + MAXLINELEN)
 231                         has_long_lines = 1;
 232
 233                 if (ubuf[i] == '\033')
 234                         has_escapes = 1;
 235                 if (ubuf[i] == '\b')
 236                         has_backspace = 1;
 237
 238                 if (ubuf[i] == '\r' && (i + 1 <  ulen && ubuf[i + 1] == '\n')) {
 239                         n_crlf++;
 240                         last_line_end = i;
 241                 }
 242                 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
 243                         n_cr++;
 244                         last_line_end = i;
 245                 }
 246                 if (ubuf[i] == '\n' && (i - 1 <  0    || ubuf[i - 1] != '\r')) {
 247                         n_lf++;
 248                         last_line_end = i;
 249                 }
 250                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
 251                         n_nel++;
 252                         last_line_end = i;
 253                 }
 254         }
 255
 256         if (iflag) {
 257                 if (subtype_mime)
 258                         ckfputs(subtype_mime, stdout);
 259                 else
 260                         ckfputs("text/plain", stdout);
 261
 262                 if (code_mime) {
 263                         ckfputs("; charset=", stdout);
 264                         ckfputs(code_mime, stdout);
 265                 }
 266         } else {
 267                 ckfputs(code, stdout);
 268
 269                 if (subtype) {
 270                         ckfputs(" ", stdout);
 271                         ckfputs(subtype, stdout);
 272                 }
 273
 274                 ckfputs(" ", stdout);
 275                 ckfputs(type, stdout);
 276
 277                 if (has_long_lines)
 278                         ckfputs(", with very long lines", stdout);
 279
 280                 /*
 281                  * Only report line terminators if we find one other than LF,
 282                  * or if we find none at all.
 283                  */
 284                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
 285                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
 286                         ckfputs(", with", stdout);
 287
 288                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)
 289                                 ckfputs(" no", stdout);
 290                         else {
 291                                 if (n_crlf) {
 292                                         ckfputs(" CRLF", stdout);
 293                                         if (n_cr || n_lf || n_nel)
 294                                                 ckfputs(",", stdout);
 295                                 }
 296                                 if (n_cr) {
 297                                         ckfputs(" CR", stdout);
 298                                         if (n_lf || n_nel)
 299                                                 ckfputs(",", stdout);
 300                                 }
 301                                 if (n_lf) {
 302                                         ckfputs(" LF", stdout);
 303                                         if (n_nel)
 304                                                 ckfputs(",", stdout);
 305                                 }
 306                                 if (n_nel)
 307                                         ckfputs(" NEL", stdout);
 308                         }
 309
 310                         ckfputs(" line terminators", stdout);
 311                 }
 312
 313                 if (has_escapes)
 314                         ckfputs(", with escape sequences", stdout);
 315                 if (has_backspace)
 316                         ckfputs(", with overstriking", stdout);
 317         }
 318
 319         return 1;
 320 }
 321
 322 static int
 323 ascmatch(const unsigned char *s, const unichar *us, int ulen)
 324 {
 325         size_t i;
 326
 327         for (i = 0; i < ulen; i++) {
 328                 if (s[i] != us[i])
 329                         return 0;
 330         }
 331
 332         if (s[i])
 333                 return 0;
 334         else
 335                 return 1;
 336 }
 337
 338 /*
 339  * This table reflects a particular philosophy about what constitutes
 340  * "text," and there is room for disagreement about it.
 341  *
 342  * Version 3.31 of the file command considered a file to be ASCII if
 343  * each of its characters was approved by either the isascii() or
 344  * isalpha() function.  On most systems, this would mean that any
 345  * file consisting only of characters in the range 0x00 ... 0x7F
 346  * would be called ASCII text, but many systems might reasonably
 347  * consider some characters outside this range to be alphabetic,
 348  * so the file command would call such characters ASCII.  It might
 349  * have been more accurate to call this "considered textual on the
 350  * local system" than "ASCII."
 351  *
 352  * It considered a file to be "International language text" if each
 353  * of its characters was either an ASCII printing character (according
 354  * to the real ASCII standard, not the above test), a character in
 355  * the range 0x80 ... 0xFF, or one of the following control characters:
 356  * backspace, tab, line feed, vertical tab, form feed, carriage return,
 357  * escape.  No attempt was made to determine the language in which files
 358  * of this type were written.
 359  *
 360  *
 361  * The table below considers a file to be ASCII if all of its characters
 362  * are either ASCII printing characters (again, according to the X3.4
 363  * standard, not isascii()) or any of the following controls: bell,
 364  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 365  *
 366  * I include bell because some programs (particularly shell scripts)
 367  * use it literally, even though it is rare in normal text.  I exclude
 368  * vertical tab because it never seems to be used in real text.  I also
 369  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 370  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 371  * character to.  It might be more appropriate to include it in the 8859
 372  * set instead of the ASCII set, but it's got to be included in *something*
 373  * we recognize or EBCDIC files aren't going to be considered textual.
 374  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 375  * and Latin characters, so these should possibly be allowed.  But they
 376  * make a real mess on VT100-style displays if they're not paired properly,
 377  * so we are probably better off not calling them text.
 378  *
 379  * A file is considered to be ISO-8859 text if its characters are all
 380  * either ASCII, according to the above definition, or printing characters
 381  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 382  *
 383  * Finally, a file is considered to be international text from some other
 384  * character code if its characters are all either ISO-8859 (according to
 385  * the above definition) or characters in the range 0x80 ... 0x9F, which
 386  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 387  * consider to be printing characters.
 388  */
 389
 390 #define F 0   /* character never appears in text */
 391 #define T 1   /* character appears in plain ASCII text */
 392 #define I 2   /* character appears in ISO-8859 text */
 393 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
 394
 395 static char text_chars[256] = {
 396         /*                  BEL BS HT LF    FF CR    */
 397         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
 398         /*                              ESC          */
 399         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
 400         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
 401         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
 402         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
 403         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
 404         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
 405         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
 406         /*            NEL                            */
 407         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
 408         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
 409         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
 410         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
 411         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
 412         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
 413         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
 414         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
 415 };
 416
 417 static int
 418 looks_ascii(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
 419 {
 420         int i;
 421
 422         *ulen = 0;
 423
 424         for (i = 0; i < nbytes; i++) {
 425                 int t = text_chars[buf[i]];
 426
 427                 if (t != T)
 428                         return 0;
 429
 430                 ubuf[(*ulen)++] = buf[i];
 431         }
 432
 433         return 1;
 434 }
 435
 436 static int
 437 looks_latin1(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
 438 {
 439         int i;
 440
 441         *ulen = 0;
 442
 443         for (i = 0; i < nbytes; i++) {
 444                 int t = text_chars[buf[i]];
 445
 446                 if (t != T && t != I)
 447                         return 0;
 448
 449                 ubuf[(*ulen)++] = buf[i];
 450         }
 451
 452         return 1;
 453 }
 454
 455 static int
 456 looks_extended(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
 457 {
 458         int i;
 459
 460         *ulen = 0;
 461
 462         for (i = 0; i < nbytes; i++) {
 463                 int t = text_chars[buf[i]];
 464
 465                 if (t != T && t != I && t != X)
 466                         return 0;
 467
 468                 ubuf[(*ulen)++] = buf[i];
 469         }
 470
 471         return 1;
 472 }
 473
 474 int
 475 looks_utf8(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
 476 {
 477         int i, n;
 478         unichar c;
 479         int gotone = 0;
 480
 481         *ulen = 0;
 482
 483         for (i = 0; i < nbytes; i++) {
 484                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
 485                         /*
 486                          * Even if the whole file is valid UTF-8 sequences,
 487                          * still reject it if it uses weird control characters.
 488                          */
 489
 490                         if (text_chars[buf[i]] != T)
 491                                 return 0;
 492
 493                         ubuf[(*ulen)++] = buf[i];
 494                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
 495                         return 0;
 496                 } else {                           /* 11xxxxxx begins UTF-8 */
 497                         int following;
 498
 499                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
 500                                 c = buf[i] & 0x1f;
 501                                 following = 1;
 502                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
 503                                 c = buf[i] & 0x0f;
 504                                 following = 2;
 505                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
 506                                 c = buf[i] & 0x07;
 507                                 following = 3;
 508                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
 509                                 c = buf[i] & 0x03;
 510                                 following = 4;
 511                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
 512                                 c = buf[i] & 0x01;
 513                                 following = 5;
 514                         } else
 515                                 return 0;
 516
 517                         for (n = 0; n < following; n++) {
 518                                 i++;
 519                                 if (i >= nbytes)
 520                                         goto done;
 521
 522                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
 523                                         return 0;
 524
 525                                 c = (c << 6) + (buf[i] & 0x3f);
 526                         }
 527
 528                         ubuf[(*ulen)++] = c;
 529                         gotone = 1;
 530                 }
 531         }
 532 done:
 533         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
 534 }
 535
 536 static int
 537 looks_unicode(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
 538 {
 539         int bigend;
 540         int i;
 541
 542         if (nbytes < 2)
 543                 return 0;
 544
 545         if (buf[0] == 0xff && buf[1] == 0xfe)
 546                 bigend = 0;
 547         else if (buf[0] == 0xfe && buf[1] == 0xff)
 548                 bigend = 1;
 549         else
 550                 return 0;
 551
 552         *ulen = 0;
 553
 554         for (i = 2; i + 1 < nbytes; i += 2) {
 555                 /* XXX fix to properly handle chars > 65536 */
 556
 557                 if (bigend)
 558                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
 559                 else
 560                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
 561
 562                 if (ubuf[*ulen - 1] == 0xfffe)
 563                         return 0;
 564                 if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T)
 565                         return 0;
 566         }
 567
 568         return 1;
 569 }
 570
 571 #undef F
 572 #undef T
 573 #undef I
 574 #undef X
 575
 576 /*
 577  * This table maps each EBCDIC character to an (8-bit extended) ASCII
 578  * character, as specified in the rationale for the dd(1) command in
 579  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 580  *
 581  * Unfortunately it does not seem to correspond exactly to any of the
 582  * five variants of EBCDIC documented in IBM's _Enterprise Systems
 583  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 584  * Edition, July, 1999, pp. I-1 - I-4.
 585  *
 586  * Fortunately, though, all versions of EBCDIC, including this one, agree
 587  * on most of the printing characters that also appear in (7-bit) ASCII.
 588  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 589  *
 590  * Fortunately too, there is general agreement that codes 0x00 through
 591  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 592  * remainder printing characters.
 593  *
 594  * This is sufficient to allow us to identify EBCDIC text and to distinguish
 595  * between old-style and internationalized examples of text.
 596  */
 597
 598 unsigned char ebcdic_to_ascii[] = {
 599   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 600  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
 601 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
 602 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
 603 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
 604 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
 605 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
 606 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
 607 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
 608 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
 609 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
 610 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
 611 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
 612 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
 613 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
 614 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
 615 };
 616
 617 /*
 618  * The following EBCDIC-to-ASCII table may relate more closely to reality,
 619  * or at least to modern reality.  It comes from
 620  *
 621  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 622  *
 623  * and maps the characters of EBCDIC code page 1047 (the code used for
 624  * Unix-derived software on IBM's 390 systems) to the corresponding
 625  * characters from ISO 8859-1.
 626  *
 627  * If this table is used instead of the above one, some of the special
 628  * cases for the NEL character can be taken out of the code.
 629  */
 630
 631 unsigned char ebcdic_1047_to_8859[] = {
 632 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
 633 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
 634 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
 635 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
 636 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
 637 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
 638 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
 639 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
 640 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
 641 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
 642 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
 643 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
 644 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
 645 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
 646 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
 647 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
 648 };
 649
 650 /*
 651  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
 652  */
 653 static void
 654 from_ebcdic(const unsigned char *buf, int nbytes, unsigned char *out)
 655 {
 656         int i;
 657
 658         for (i = 0; i < nbytes; i++) {
 659                 out[i] = ebcdic_to_ascii[buf[i]];
 660         }
 661 }