contrib/file/ascmagic.c

   1 /*
   2  * ASCII magic -- file types that we know based on keywords
   3  * that can appear anywhere in the file.
   4  *
   5  * Copyright (c) Ian F. Darwin, 1987.
   6  * Written by Ian F. Darwin.
   7  *
   8  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
   9  * to handle character codes other than ASCII on a unified basis.
  10  *
  11  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
  12  * international characters, now subsumed into this file.
  13  */
  14
  15 /*
  16  * This software is not subject to any license of the American Telephone
  17  * and Telegraph Company or of the Regents of the University of California.
  18  *
  19  * Permission is granted to anyone to use this software for any purpose on
  20  * any computer system, and to alter it and redistribute it freely, subject
  21  * to the following restrictions:
  22  *
  23  * 1. The author is not responsible for the consequences of use of this
  24  *    software, no matter how awful, even if they arise from flaws in it.
  25  *
  26  * 2. The origin of this software must not be misrepresented, either by
  27  *    explicit claim or by omission.  Since few users ever read sources,
  28  *    credits must appear in the documentation.
  29  *
  30  * 3. Altered versions must be plainly marked as such, and must not be
  31  *    misrepresented as being the original software.  Since few users
  32  *    ever read sources, credits must appear in the documentation.
  33  *
  34  * 4. This notice may not be removed or altered.
  35  */
  36
  37 #include "file.h"
  38 #include <stdio.h>
  39 #include <string.h>
  40 #include <memory.h>
  41 #include <ctype.h>
  42 #include <stdlib.h>
  43 #ifdef HAVE_UNISTD_H
  44 #include <unistd.h>
  45 #endif
  46 #include "names.h"
  47
  48 #ifndef lint
  49 FILE_RCSID("@(#)$Id: ascmagic.c,v 1.30 2001/07/26 13:15:49 christos Exp $")
  50 #endif  /* lint */
  51
  52 typedef unsigned long unichar;
  53
  54 #define MAXLINELEN 300  /* longest sane line length */
  55 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
  56                   || (x) == 0x85 || (x) == '\f')
  57
/*
 * Forward declarations of the file-local helpers defined below.
 *
 * Each looks_*() routine takes the raw bytes and their count, an output
 * array that receives one unichar per decoded character, and a pointer
 * to the number of characters stored there.
 */
static int looks_ascii __P((const unsigned char *, int, unichar *, int *));
static int looks_utf8 __P((const unsigned char *, int, unichar *, int *));
static int looks_unicode __P((const unsigned char *, int, unichar *, int *));
static int looks_latin1 __P((const unsigned char *, int, unichar *, int *));
static int looks_extended __P((const unsigned char *, int, unichar *, int *));
/* Translates a byte buffer through the ebcdic_to_ascii table. */
static void from_ebcdic __P((const unsigned char *, int, unsigned char *));
/* Compares a NUL-terminated token with a run of unichars of given length. */
static int ascmatch __P((const unsigned char *, const unichar *, int));
  65
  66 int
  67 ascmagic(buf, nbytes)
  68         unsigned char *buf;
  69         int nbytes;     /* size actually read */
  70 {
  71         int i;
  72         char nbuf[HOWMANY+1];           /* one extra for terminating '\0' */
  73         unichar ubuf[HOWMANY+1];        /* one extra for terminating '\0' */
  74         int ulen;
  75         struct names *p;
  76
  77         char *code = NULL;
  78         char *code_mime = NULL;
  79         char *type = NULL;
  80         char *subtype = NULL;
  81         char *subtype_mime = NULL;
  82
  83         int has_escapes = 0;
  84         int has_backspace = 0;
  85
  86         int n_crlf = 0;
  87         int n_lf = 0;
  88         int n_cr = 0;
  89         int n_nel = 0;
  90
  91         int last_line_end = -1;
  92         int has_long_lines = 0;
  93
  94         /*
  95          * Do the tar test first, because if the first file in the tar
  96          * archive starts with a dot, we can confuse it with an nroff file.
  97          */
  98         switch (is_tar(buf, nbytes)) {
  99         case 1:
 100                 ckfputs(iflag ? "application/x-tar" : "tar archive", stdout);
 101                 return 1;
 102         case 2:
 103                 ckfputs(iflag ? "application/x-tar, POSIX"
 104                                 : "POSIX tar archive", stdout);
 105                 return 1;
 106         }
 107
 108         /*
 109          * Undo the NUL-termination kindly provided by process()
 110          * but leave at least one byte to look at
 111          */
 112
 113         while (nbytes > 1 && buf[nbytes - 1] == '\0')
 114                 nbytes--;
 115
 116         /*
 117          * Then try to determine whether it's any character code we can
 118          * identify.  Each of these tests, if it succeeds, will leave
 119          * the text converted into one-unichar-per-character Unicode in
 120          * ubuf, and the number of characters converted in ulen.
 121          */
 122         if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
 123                 code = "ASCII";
 124                 code_mime = "us-ascii";
 125                 type = "text";
 126         } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
 127                 code = "UTF-8 Unicode";
 128                 code_mime = "utf-8";
 129                 type = "text";
 130         } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen))) {
 131                 if (i == 1)
 132                         code = "Little-endian UTF-16 Unicode";
 133                 else
 134                         code = "Big-endian UTF-16 Unicode";
 135
 136                 type = "character data";
 137                 code_mime = "utf-16";    /* is this defined? */
 138         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
 139                 code = "ISO-8859";
 140                 type = "text";
 141                 code_mime = "iso-8859-1";
 142         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
 143                 code = "Non-ISO extended-ASCII";
 144                 type = "text";
 145                 code_mime = "unknown";
 146         } else {
 147                 from_ebcdic(buf, nbytes, nbuf);
 148
 149                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
 150                         code = "EBCDIC";
 151                         type = "character data";
 152                         code_mime = "ebcdic";
 153                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
 154                         code = "International EBCDIC";
 155                         type = "character data";
 156                         code_mime = "ebcdic";
 157                 } else {
 158                         return 0;  /* doesn't look like text at all */
 159                 }
 160         }
 161
 162         /*
 163          * for troff, look for . + letter + letter or .\";
 164          * this must be done to disambiguate tar archives' ./file
 165          * and other trash from real troff input.
 166          *
 167          * I believe Plan 9 troff allows non-ASCII characters in the names
 168          * of macros, so this test might possibly fail on such a file.
 169          */
 170         if (*ubuf == '.') {
 171                 unichar *tp = ubuf + 1;
 172
 173                 while (ISSPC(*tp))
 174                         ++tp;   /* skip leading whitespace */
 175                 if ((tp[0] == '\\' && tp[1] == '\"') ||
 176                     (isascii(tp[0]) && isalnum(tp[0]) &&
 177                      isascii(tp[1]) && isalnum(tp[1]) &&
 178                      ISSPC(tp[2]))) {
 179                         subtype_mime = "text/troff";
 180                         subtype = "troff or preprocessor input";
 181                         goto subtype_identified;
 182                 }
 183         }
 184
 185         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
 186                 subtype_mime = "text/fortran";
 187                 subtype = "fortran program";
 188                 goto subtype_identified;
 189         }
 190
 191         /* look for tokens from names.h - this is expensive! */
 192
 193         i = 0;
 194         while (i < ulen) {
 195                 int end;
 196
 197                 /*
 198                  * skip past any leading space
 199                  */
 200                 while (i < ulen && ISSPC(ubuf[i]))
 201                         i++;
 202                 if (i >= ulen)
 203                         break;
 204
 205                 /*
 206                  * find the next whitespace
 207                  */
 208                 for (end = i + 1; end < nbytes; end++)
 209                         if (ISSPC(ubuf[end]))
 210                                 break;
 211
 212                 /*
 213                  * compare the word thus isolated against the token list
 214                  */
 215                 for (p = names; p < names + NNAMES; p++) {
 216                         if (ascmatch(p->name, ubuf + i, end - i)) {
 217                                 subtype = types[p->type].human;
 218                                 subtype_mime = types[p->type].mime;
 219                                 goto subtype_identified;
 220                         }
 221                 }
 222
 223                 i = end;
 224         }
 225
 226 subtype_identified:
 227
 228         /*
 229          * Now try to discover other details about the file.
 230          */
 231         for (i = 0; i < ulen; i++) {
 232                 if (i > last_line_end + MAXLINELEN)
 233                         has_long_lines = 1;
 234
 235                 if (ubuf[i] == '\033')
 236                         has_escapes = 1;
 237                 if (ubuf[i] == '\b')
 238                         has_backspace = 1;
 239
 240                 if (ubuf[i] == '\r' && (i + 1 <  ulen && ubuf[i + 1] == '\n')) {
 241                         n_crlf++;
 242                         last_line_end = i;
 243                 }
 244                 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
 245                         n_cr++;
 246                         last_line_end = i;
 247                 }
 248                 if (ubuf[i] == '\n' && (i - 1 <  0    || ubuf[i - 1] != '\r')) {
 249                         n_lf++;
 250                         last_line_end = i;
 251                 }
 252                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
 253                         n_nel++;
 254                         last_line_end = i;
 255                 }
 256         }
 257
 258         if (iflag) {
 259                 if (subtype_mime)
 260                         ckfputs(subtype_mime, stdout);
 261                 else
 262                         ckfputs("text/plain", stdout);
 263
 264                 if (code_mime) {
 265                         ckfputs("; charset=", stdout);
 266                         ckfputs(code_mime, stdout);
 267                 }
 268         } else {
 269                 ckfputs(code, stdout);
 270
 271                 if (subtype) {
 272                         ckfputs(" ", stdout);
 273                         ckfputs(subtype, stdout);
 274                 }
 275
 276                 ckfputs(" ", stdout);
 277                 ckfputs(type, stdout);
 278
 279                 if (has_long_lines)
 280                         ckfputs(", with very long lines", stdout);
 281
 282                 /*
 283                  * Only report line terminators if we find one other than LF,
 284                  * or if we find none at all.
 285                  */
 286                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
 287                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
 288                         ckfputs(", with", stdout);
 289
 290                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)
 291                                 ckfputs(" no", stdout);
 292                         else {
 293                                 if (n_crlf) {
 294                                         ckfputs(" CRLF", stdout);
 295                                         if (n_cr || n_lf || n_nel)
 296                                                 ckfputs(",", stdout);
 297                                 }
 298                                 if (n_cr) {
 299                                         ckfputs(" CR", stdout);
 300                                         if (n_lf || n_nel)
 301                                                 ckfputs(",", stdout);
 302                                 }
 303                                 if (n_lf) {
 304                                         ckfputs(" LF", stdout);
 305                                         if (n_nel)
 306                                                 ckfputs(",", stdout);
 307                                 }
 308                                 if (n_nel)
 309                                         ckfputs(" NEL", stdout);
 310                         }
 311
 312                         ckfputs(" line terminators", stdout);
 313                 }
 314
 315                 if (has_escapes)
 316                         ckfputs(", with escape sequences", stdout);
 317                 if (has_backspace)
 318                         ckfputs(", with overstriking", stdout);
 319         }
 320
 321         return 1;
 322 }
 323
 324 static int
 325 ascmatch(s, us, ulen)
 326         const unsigned char *s;
 327         const unichar *us;
 328         int ulen;
 329 {
 330         size_t i;
 331
 332         for (i = 0; i < ulen; i++) {
 333                 if (s[i] != us[i])
 334                         return 0;
 335         }
 336
 337         if (s[i])
 338                 return 0;
 339         else
 340                 return 1;
 341 }
 342
 343 /*
 344  * This table reflects a particular philosophy about what constitutes
 345  * "text," and there is room for disagreement about it.
 346  *
 347  * Version 3.31 of the file command considered a file to be ASCII if
 348  * each of its characters was approved by either the isascii() or
 349  * isalpha() function.  On most systems, this would mean that any
 350  * file consisting only of characters in the range 0x00 ... 0x7F
 351  * would be called ASCII text, but many systems might reasonably
 352  * consider some characters outside this range to be alphabetic,
 353  * so the file command would call such characters ASCII.  It might
 354  * have been more accurate to call this "considered textual on the
 355  * local system" than "ASCII."
 356  *
 357  * It considered a file to be "International language text" if each
 358  * of its characters was either an ASCII printing character (according
 359  * to the real ASCII standard, not the above test), a character in
 360  * the range 0x80 ... 0xFF, or one of the following control characters:
 361  * backspace, tab, line feed, vertical tab, form feed, carriage return,
 362  * escape.  No attempt was made to determine the language in which files
 363  * of this type were written.
 364  *
 365  *
 366  * The table below considers a file to be ASCII if all of its characters
 367  * are either ASCII printing characters (again, according to the X3.4
 368  * standard, not isascii()) or any of the following controls: bell,
 369  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 370  *
 371  * I include bell because some programs (particularly shell scripts)
 372  * use it literally, even though it is rare in normal text.  I exclude
 373  * vertical tab because it never seems to be used in real text.  I also
 374  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 375  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 376  * character to.  It might be more appropriate to include it in the 8859
 377  * set instead of the ASCII set, but it's got to be included in *something*
 378  * we recognize or EBCDIC files aren't going to be considered textual.
 379  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 380  * and Latin characters, so these should possibly be allowed.  But they
 381  * make a real mess on VT100-style displays if they're not paired properly,
 382  * so we are probably better off not calling them text.
 383  *
 384  * A file is considered to be ISO-8859 text if its characters are all
 385  * either ASCII, according to the above definition, or printing characters
 386  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 387  *
 388  * Finally, a file is considered to be international text from some other
 389  * character code if its characters are all either ISO-8859 (according to
 390  * the above definition) or characters in the range 0x80 ... 0x9F, which
 391  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 392  * consider to be printing characters.
 393  */
 394
 395 #define F 0   /* character never appears in text */
 396 #define T 1   /* character appears in plain ASCII text */
 397 #define I 2   /* character appears in ISO-8859 text */
 398 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
 399
 400 static char text_chars[256] = {
 401         /*                  BEL BS HT LF    FF CR    */
 402         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
 403         /*                              ESC          */
 404         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
 405         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
 406         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
 407         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
 408         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
 409         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
 410         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
 411         /*            NEL                            */
 412         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
 413         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
 414         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
 415         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
 416         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
 417         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
 418         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
 419         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
 420 };
 421
 422 static int
 423 looks_ascii(buf, nbytes, ubuf, ulen)
 424         const unsigned char *buf;
 425         int nbytes;
 426         unichar *ubuf;
 427         int *ulen;
 428 {
 429         int i;
 430
 431         *ulen = 0;
 432
 433         for (i = 0; i < nbytes; i++) {
 434                 int t = text_chars[buf[i]];
 435
 436                 if (t != T)
 437                         return 0;
 438
 439                 ubuf[(*ulen)++] = buf[i];
 440         }
 441
 442         return 1;
 443 }
 444
 445 static int
 446 looks_latin1(buf, nbytes, ubuf, ulen)
 447         const unsigned char *buf;
 448         int nbytes;
 449         unichar *ubuf;
 450         int *ulen;
 451 {
 452         int i;
 453
 454         *ulen = 0;
 455
 456         for (i = 0; i < nbytes; i++) {
 457                 int t = text_chars[buf[i]];
 458
 459                 if (t != T && t != I)
 460                         return 0;
 461
 462                 ubuf[(*ulen)++] = buf[i];
 463         }
 464
 465         return 1;
 466 }
 467
 468 static int
 469 looks_extended(buf, nbytes, ubuf, ulen)
 470         const unsigned char *buf;
 471         int nbytes;
 472         unichar *ubuf;
 473         int *ulen;
 474 {
 475         int i;
 476
 477         *ulen = 0;
 478
 479         for (i = 0; i < nbytes; i++) {
 480                 int t = text_chars[buf[i]];
 481
 482                 if (t != T && t != I && t != X)
 483                         return 0;
 484
 485                 ubuf[(*ulen)++] = buf[i];
 486         }
 487
 488         return 1;
 489 }
 490
 491 int
 492 looks_utf8(buf, nbytes, ubuf, ulen)
 493         const unsigned char *buf;
 494         int nbytes;
 495         unichar *ubuf;
 496         int *ulen;
 497 {
 498         int i, n;
 499         unichar c;
 500         int gotone = 0;
 501
 502         *ulen = 0;
 503
 504         for (i = 0; i < nbytes; i++) {
 505                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
 506                         /*
 507                          * Even if the whole file is valid UTF-8 sequences,
 508                          * still reject it if it uses weird control characters.
 509                          */
 510
 511                         if (text_chars[buf[i]] != T)
 512                                 return 0;
 513
 514                         ubuf[(*ulen)++] = buf[i];
 515                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
 516                         return 0;
 517                 } else {                           /* 11xxxxxx begins UTF-8 */
 518                         int following;
 519
 520                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
 521                                 c = buf[i] & 0x1f;
 522                                 following = 1;
 523                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
 524                                 c = buf[i] & 0x0f;
 525                                 following = 2;
 526                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
 527                                 c = buf[i] & 0x07;
 528                                 following = 3;
 529                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
 530                                 c = buf[i] & 0x03;
 531                                 following = 4;
 532                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
 533                                 c = buf[i] & 0x01;
 534                                 following = 5;
 535                         } else
 536                                 return 0;
 537
 538                         for (n = 0; n < following; n++) {
 539                                 i++;
 540                                 if (i >= nbytes)
 541                                         goto done;
 542
 543                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
 544                                         return 0;
 545
 546                                 c = (c << 6) + (buf[i] & 0x3f);
 547                         }
 548
 549                         ubuf[(*ulen)++] = c;
 550                         gotone = 1;
 551                 }
 552         }
 553 done:
 554         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
 555 }
 556
 557 static int
 558 looks_unicode(buf, nbytes, ubuf, ulen)
 559         const unsigned char *buf;
 560         int nbytes;
 561         unichar *ubuf;
 562         int *ulen;
 563 {
 564         int bigend;
 565         int i;
 566
 567         if (nbytes < 2)
 568                 return 0;
 569
 570         if (buf[0] == 0xff && buf[1] == 0xfe)
 571                 bigend = 0;
 572         else if (buf[0] == 0xfe && buf[1] == 0xff)
 573                 bigend = 1;
 574         else
 575                 return 0;
 576
 577         *ulen = 0;
 578
 579         for (i = 2; i + 1 < nbytes; i += 2) {
 580                 /* XXX fix to properly handle chars > 65536 */
 581
 582                 if (bigend)
 583                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
 584                 else
 585                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
 586
 587                 if (ubuf[*ulen - 1] == 0xfffe)
 588                         return 0;
 589                 if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T)
 590                         return 0;
 591         }
 592
 593         return 1;
 594 }
 595
 596 #undef F
 597 #undef T
 598 #undef I
 599 #undef X
 600
 601 /*
 602  * This table maps each EBCDIC character to an (8-bit extended) ASCII
 603  * character, as specified in the rationale for the dd(1) command in
 604  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 605  *
 606  * Unfortunately it does not seem to correspond exactly to any of the
 607  * five variants of EBCDIC documented in IBM's _Enterprise Systems
 608  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 609  * Edition, July, 1999, pp. I-1 - I-4.
 610  *
 611  * Fortunately, though, all versions of EBCDIC, including this one, agree
 612  * on most of the printing characters that also appear in (7-bit) ASCII.
 613  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 614  *
 615  * Fortunately too, there is general agreement that codes 0x00 through
 616  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 617  * remainder printing characters.
 618  *
 619  * This is sufficient to allow us to identify EBCDIC text and to distinguish
 620  * between old-style and internationalized examples of text.
 621  */
 622
 623 unsigned char ebcdic_to_ascii[] = {
 624   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 625  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
 626 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
 627 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
 628 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
 629 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
 630 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
 631 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
 632 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
 633 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
 634 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
 635 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
 636 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
 637 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
 638 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
 639 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
 640 };
 641
 642 /*
 643  * The following EBCDIC-to-ASCII table may relate more closely to reality,
 644  * or at least to modern reality.  It comes from
 645  *
 646  *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 647  *
 648  * and maps the characters of EBCDIC code page 1047 (the code used for
 649  * Unix-derived software on IBM's 390 systems) to the corresponding
 650  * characters from ISO 8859-1.
 651  *
 652  * If this table is used instead of the above one, some of the special
 653  * cases for the NEL character can be taken out of the code.
 654  */
 655
 656 unsigned char ebcdic_1047_to_8859[] = {
 657 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
 658 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
 659 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
 660 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
 661 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
 662 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
 663 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
 664 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
 665 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
 666 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
 667 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
 668 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
 669 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
 670 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
 671 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
 672 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
 673 };
 674
 675 /*
 676  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
 677  */
 678 static void
 679 from_ebcdic(buf, nbytes, out)
 680         const unsigned char *buf;
 681         int nbytes;
 682         unsigned char *out;
 683 {
 684         int i;
 685
 686         for (i = 0; i < nbytes; i++) {
 687                 out[i] = ebcdic_to_ascii[buf[i]];
 688         }
 689 }