2 * ASCII magic -- file types that we know based on keywords
3 * that can appear anywhere in the file.
5 * Copyright (c) Ian F. Darwin, 1987.
6 * Written by Ian F. Darwin.
8 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
9 * to handle character codes other than ASCII on a unified basis.
11 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
12 * international characters, now subsumed into this file.
16 * This software is not subject to any license of the American Telephone
17 * and Telegraph Company or of the Regents of the University of California.
19 * Permission is granted to anyone to use this software for any purpose on
20 * any computer system, and to alter it and redistribute it freely, subject
21 * to the following restrictions:
23 * 1. The author is not responsible for the consequences of use of this
24 * software, no matter how awful, even if they arise from flaws in it.
26 * 2. The origin of this software must not be misrepresented, either by
27 * explicit claim or by omission. Since few users ever read sources,
28 * credits must appear in the documentation.
30 * 3. Altered versions must be plainly marked as such, and must not be
31 * misrepresented as being the original software. Since few users
32 * ever read sources, credits must appear in the documentation.
34 * 4. This notice may not be removed or altered.
49 FILE_RCSID("@(#)$Id: ascmagic.c,v 1.30 2001/07/26 13:15:49 christos Exp $")
/*
 * Shared declarations for the text detector.
 *
 *   unichar     element type of the decoded-text buffers; the looks_*
 *               helpers store one character per element.
 *   MAXLINELEN  compared with the distance from the previous line end
 *               while the decoded text is scanned for long lines.
 *   ISSPC(x)    true for space, tab, CR, LF, form feed and 0x85 (the
 *               X3.64/ECMA-43 "next line" control).  x is expanded six
 *               times, so pass an expression without side effects.
 *
 * The prototypes are wrapped in __P(()), which is defined outside this
 * listing.  Each looks_* helper takes (input bytes, byte count, output
 * unichar buffer, out: number of characters stored); from_ebcdic takes
 * (input bytes, byte count, output bytes); ascmatch takes (keyword,
 * decoded word, word length).
 */
52 typedef unsigned long unichar;
54 #define MAXLINELEN 300 /* longest sane line length */
55 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
56 || (x) == 0x85 || (x) == '\f')
58 static int looks_ascii __P((const unsigned char *, int, unichar *, int *));
59 static int looks_utf8 __P((const unsigned char *, int, unichar *, int *));
60 static int looks_unicode __P((const unsigned char *, int, unichar *, int *));
61 static int looks_latin1 __P((const unsigned char *, int, unichar *, int *));
62 static int looks_extended __P((const unsigned char *, int, unichar *, int *));
63 static void from_ebcdic __P((const unsigned char *, int, unsigned char *));
64 static int ascmatch __P((const unsigned char *, const unichar *, int));
/*
 * Declarations at the top of the main detection routine.  Its header line
 * is not part of this listing, and lines are elided throughout (see the
 * embedded numbering), so only what is visible is described.
 *
 *   nbytes          number of input bytes actually read
 *   nbuf            receives the EBCDIC->ASCII translation of the input
 *   ubuf            receives the decoded text, one unichar per character
 *   code_mime       MIME charset name; NULL until a character code matches
 *   subtype_mime    MIME type of a recognised subtype; NULL until one matches
 *   has_backspace,
 *   has_long_lines  detail flags, initially clear
 *   last_line_end   starts at -1; the scanning loop compares the current
 *                   index with last_line_end + MAXLINELEN
 *
 * NOTE(review): nbuf is plain char, yet it is handed to from_ebcdic() and
 * looks_ascii(), whose prototypes take (const) unsigned char * -- confirm
 * whether it should be declared unsigned char.
 */
69 int nbytes; /* size actually read */
72 char nbuf[HOWMANY+1]; /* one extra for terminating '\0' */
73 unichar ubuf[HOWMANY+1]; /* one extra for terminating '\0' */
78 char *code_mime = NULL;
81 char *subtype_mime = NULL;
84 int has_backspace = 0;
91 int last_line_end = -1;
92 int has_long_lines = 0;
/*
 * Step 1 - tar detection.  The result of is_tar(buf, nbytes) is switched
 * on; two outcomes are visible, each printing either the MIME form (when
 * iflag is set) or the descriptive form through ckfputs().  The case
 * labels and whatever follows each print are elided in this listing.
 *
 * Step 2 - trailing NUL bytes are discounted.  The loop condition keeps
 * at least one byte (nbytes > 1); the loop body is elided, presumably a
 * decrement of nbytes -- confirm against the full source.
 */
95 * Do the tar test first, because if the first file in the tar
96 * archive starts with a dot, we can confuse it with an nroff file.
98 switch (is_tar(buf, nbytes)) {
100 ckfputs(iflag ? "application/x-tar" : "tar archive", stdout);
103 ckfputs(iflag ? "application/x-tar, POSIX"
104 : "POSIX tar archive", stdout);
109 * Undo the NUL-termination kindly provided by process()
110 * but leave at least one byte to look at
113 while (nbytes > 1 && buf[nbytes - 1] == '\0')
/*
 * Character-code detection.  The tests run in this order and the first
 * one that succeeds decides the labels:
 *   1. looks_ascii     -> code_mime "us-ascii"
 *   2. looks_utf8      -> code "UTF-8 Unicode"
 *   3. looks_unicode   -> UTF-16; its nonzero result (kept in i) selects
 *                         the little- or big-endian wording
 *   4. looks_latin1    -> code_mime "iso-8859-1"
 *   5. looks_extended  -> code "Non-ISO extended-ASCII", code_mime "unknown"
 *   6. otherwise the bytes are translated with from_ebcdic() into nbuf and
 *      looks_ascii / looks_latin1 are retried on the translation
 *      (code_mime "ebcdic").
 * If nothing matches, 0 is returned: the data is not treated as text.
 * Several assignments to code and type are elided in this listing.
 */
117 * Then try to determine whether it's any character code we can
118 * identify. Each of these tests, if it succeeds, will leave
119 * the text converted into one-unichar-per-character Unicode in
120 * ubuf, and the number of characters converted in ulen.
122 if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
124 code_mime = "us-ascii";
126 } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
127 code = "UTF-8 Unicode";
130 } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen))) {
132 code = "Little-endian UTF-16 Unicode";
134 code = "Big-endian UTF-16 Unicode";
136 type = "character data";
137 code_mime = "utf-16"; /* is this defined? */
138 } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
141 code_mime = "iso-8859-1";
142 } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
143 code = "Non-ISO extended-ASCII";
145 code_mime = "unknown";
147 from_ebcdic(buf, nbytes, nbuf);
149 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
151 type = "character data";
152 code_mime = "ebcdic";
153 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
154 code = "International EBCDIC";
155 type = "character data";
156 code_mime = "ebcdic";
158 return 0; /* doesn't look like text at all */
/*
 * Subtype heuristics that run on the start of the text.
 *
 * troff: tp starts at the second decoded character and is advanced over
 * whitespace; the text is accepted when tp points at a backslash followed
 * by a double quote, or at two ASCII alphanumerics (the rest of that
 * condition is elided in this listing).
 *
 * Fortran: a leading 'c' or 'C' followed by a whitespace character.
 *
 * Either match sets subtype / subtype_mime and jumps to subtype_identified.
 *
 * NOTE(review): the original wording below says "letter + letter", but the
 * test uses isalnum(), which also accepts digits -- confirm which is meant.
 * NOTE(review): the Fortran test inspects the raw input bytes (buf), not
 * the decoded text in ubuf, so input that was decoded from UTF-16 or
 * translated from EBCDIC is examined in its undecoded form -- confirm
 * that this is intended.
 */
163 * for troff, look for . + letter + letter or .\";
164 * this must be done to disambiguate tar archives' ./file
165 * and other trash from real troff input.
167 * I believe Plan 9 troff allows non-ASCII characters in the names
168 * of macros, so this test might possibly fail on such a file.
171 unichar *tp = ubuf + 1;
174 ++tp; /* skip leading whitespace */
175 if ((tp[0] == '\\' && tp[1] == '\"') ||
176 (isascii(tp[0]) && isalnum(tp[0]) &&
177 isascii(tp[1]) && isalnum(tp[1]) &&
179 subtype_mime = "text/troff";
180 subtype = "troff or preprocessor input";
181 goto subtype_identified;
185 if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
186 subtype_mime = "text/fortran";
187 subtype = "fortran program";
188 goto subtype_identified;
/*
 * Keyword scan: isolate a whitespace-delimited word in the decoded text
 * (ubuf) and compare it with every entry of names[] through ascmatch().
 * On the first match the subtype strings are taken from types[p->type]
 * and control jumps to subtype_identified.  The enclosing loop and the
 * statements that advance i are elided in this listing.
 *
 * NOTE(review): the word-end search is bounded by nbytes although it
 * indexes ubuf, which holds ulen characters.  ulen can be smaller than
 * nbytes (looks_unicode stores one element per two input bytes), in which
 * case elements that were never written are examined -- the bound should
 * probably be ulen; confirm.
 */
191 /* look for tokens from names.h - this is expensive! */
198 * skip past any leading space
200 while (i < ulen && ISSPC(ubuf[i]))
206 * find the next whitespace
208 for (end = i + 1; end < nbytes; end++)
209 if (ISSPC(ubuf[end]))
213 * compare the word thus isolated against the token list
215 for (p = names; p < names + NNAMES; p++) {
216 if (ascmatch(p->name, ubuf + i, end - i)) {
217 subtype = types[p->type].human;
218 subtype_mime = types[p->type].mime;
219 goto subtype_identified;
/*
 * Detail scan: one pass over the ulen decoded characters.  The visible
 * tests recognise, in order:
 *   - a position more than MAXLINELEN past the previous line end
 *   - ESC (octal 033)
 *   - CR immediately followed by LF
 *   - CR not followed by LF (including CR as the last character)
 *   - LF not preceded by CR (including LF as the first character)
 *   - 0x85, the "next line" control
 * The action taken for each test is elided in this listing.
 */
229 * Now try to discover other details about the file.
231 for (i = 0; i < ulen; i++) {
232 if (i > last_line_end + MAXLINELEN)
235 if (ubuf[i] == '\033')
240 if (ubuf[i] == '\r' && (i + 1 < ulen && ubuf[i + 1] == '\n')) {
244 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
248 if (ubuf[i] == '\n' && (i - 1 < 0 || ubuf[i - 1] != '\r')) {
252 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
/*
 * Report output.  Everything is written to stdout through ckfputs().
 *
 * Two forms are visible:
 *   - MIME form: subtype_mime or "text/plain", then "; charset=" and
 *     code_mime.
 *   - Descriptive form: code, subtype, type, followed by optional details
 *     (very long lines, line terminators, escape sequences, overstriking).
 * The conditions that choose between the forms, and that guard most of the
 * individual prints, are elided in this listing.
 *
 * The line-terminator clause is entered when no terminator of any kind
 * was counted, or when at least one of CRLF / CR / NEL was counted; a
 * file that uses only LF therefore gets no terminator clause.
 */
260 ckfputs(subtype_mime, stdout);
262 ckfputs("text/plain", stdout);
265 ckfputs("; charset=", stdout);
266 ckfputs(code_mime, stdout);
269 ckfputs(code, stdout);
272 ckfputs(" ", stdout);
273 ckfputs(subtype, stdout);
276 ckfputs(" ", stdout);
277 ckfputs(type, stdout);
280 ckfputs(", with very long lines", stdout);
283 * Only report line terminators if we find one other than LF,
284 * or if we find none at all.
286 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
287 (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
288 ckfputs(", with", stdout);
290 if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)
291 ckfputs(" no", stdout);
294 ckfputs(" CRLF", stdout);
295 if (n_cr || n_lf || n_nel)
296 ckfputs(",", stdout);
299 ckfputs(" CR", stdout);
301 ckfputs(",", stdout);
304 ckfputs(" LF", stdout);
306 ckfputs(",", stdout);
309 ckfputs(" NEL", stdout);
312 ckfputs(" line terminators", stdout);
316 ckfputs(", with escape sequences", stdout);
318 ckfputs(", with overstriking", stdout);
/*
 * ascmatch: compare the keyword s with the ulen-character decoded word
 * at us.  The keyword scan calls it with a names[] entry and a slice of
 * the decoded buffer, and treats a nonzero result as a match.
 * The loop visits positions 0 .. ulen-1; the comparison itself and the
 * return statements are elided in this listing.
 */
325 ascmatch(s, us, ulen)
326 const unsigned char *s;
332 for (i = 0; i < ulen; i++) {
344 * This table reflects a particular philosophy about what constitutes
345 * "text," and there is room for disagreement about it.
347 * Version 3.31 of the file command considered a file to be ASCII if
348 * each of its characters was approved by either the isascii() or
349 * isalpha() function. On most systems, this would mean that any
350 * file consisting only of characters in the range 0x00 ... 0x7F
351 * would be called ASCII text, but many systems might reasonably
352 * consider some characters outside this range to be alphabetic,
353 * so the file command would call such characters ASCII. It might
354 * have been more accurate to call this "considered textual on the
355 * local system" than "ASCII."
357 * It considered a file to be "International language text" if each
358 * of its characters was either an ASCII printing character (according
359 * to the real ASCII standard, not the above test), a character in
360 * the range 0x80 ... 0xFF, or one of the following control characters:
361 * backspace, tab, line feed, vertical tab, form feed, carriage return,
362 * escape. No attempt was made to determine the language in which files
363 * of this type were written.
366 * The table below considers a file to be ASCII if all of its characters
367 * are either ASCII printing characters (again, according to the X3.4
368 * standard, not isascii()) or any of the following controls: bell,
369 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
371 * I include bell because some programs (particularly shell scripts)
372 * use it literally, even though it is rare in normal text. I exclude
373 * vertical tab because it never seems to be used in real text. I also
374 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
375 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
376 * character to. It might be more appropriate to include it in the 8859
377 * set instead of the ASCII set, but it's got to be included in *something*
378 * we recognize or EBCDIC files aren't going to be considered textual.
379 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
380 * and Latin characters, so these should possibly be allowed. But they
381 * make a real mess on VT100-style displays if they're not paired properly,
382 * so we are probably better off not calling them text.
384 * A file is considered to be ISO-8859 text if its characters are all
385 * either ASCII, according to the above definition, or printing characters
386 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
388 * Finally, a file is considered to be international text from some other
389 * character code if its characters are all either ISO-8859 (according to
390 * the above definition) or characters in the range 0x80 ... 0x9F, which
391 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
392 * consider to be printing characters.
/*
 * text_chars: classification of every byte value, indexed by the byte.
 * Each entry is one of the class codes F, T, I or X defined just below.
 * Rows cover 16 values each; the trailing comment on a row names its
 * high nibble.  Besides the controls labelled on the first row, 0x1b
 * (ESC) and 0x85 (next line) are also marked T.
 *
 * NOTE(review): F, T, I and X are single-letter macro names; no #undef
 * is visible in this listing, so they may remain defined for the rest of
 * the translation unit -- confirm.
 * NOTE(review): no write to the table is visible in this listing; it
 * could probably be declared const -- confirm.
 */
395 #define F 0 /* character never appears in text */
396 #define T 1 /* character appears in plain ASCII text */
397 #define I 2 /* character appears in ISO-8859 text */
398 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
400 static char text_chars[256] = {
401 /* BEL BS HT LF FF CR */
402 F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
404 F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
405 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
406 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
407 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
408 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
409 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
410 T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
412 X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
413 X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
414 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
415 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
416 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
417 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
418 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
419 I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
/*
 * looks_ascii: walk buf[0 .. nbytes-1], look up each byte's class in
 * text_chars, and append the byte to ubuf while advancing *ulen.
 * The class test and the return statements are elided in this listing;
 * by analogy with looks_latin1 and looks_extended (which accept T/I and
 * T/I/X) this one presumably accepts class T only -- confirm.
 */
423 looks_ascii(buf, nbytes, ubuf, ulen)
424 const unsigned char *buf;
433 for (i = 0; i < nbytes; i++) {
434 int t = text_chars[buf[i]];
439 ubuf[(*ulen)++] = buf[i];
/*
 * looks_latin1: walk buf[0 .. nbytes-1] and test each byte's text_chars
 * class against T (plain ASCII text) and I (ISO-8859 text); bytes that
 * pass are appended to ubuf and counted in *ulen.
 * What happens when a byte is in neither class, and the final return
 * value, are elided in this listing.
 */
446 looks_latin1(buf, nbytes, ubuf, ulen)
447 const unsigned char *buf;
456 for (i = 0; i < nbytes; i++) {
457 int t = text_chars[buf[i]];
459 if (t != T && t != I)
462 ubuf[(*ulen)++] = buf[i];
/*
 * looks_extended: walk buf[0 .. nbytes-1] and test each byte's text_chars
 * class against T, I and X (non-ISO extended ASCII); bytes that pass are
 * appended to ubuf and counted in *ulen.
 * What happens when a byte is in none of these classes, and the final
 * return value, are elided in this listing.
 */
469 looks_extended(buf, nbytes, ubuf, ulen)
470 const unsigned char *buf;
479 for (i = 0; i < nbytes; i++) {
480 int t = text_chars[buf[i]];
482 if (t != T && t != I && t != X)
485 ubuf[(*ulen)++] = buf[i];
/*
 * looks_utf8: decide whether buf[0 .. nbytes-1] is UTF-8 text and decode
 * it into ubuf, counting characters in *ulen.
 *
 * Visible logic:
 *   - 0xxxxxxx bytes are checked against class T in text_chars and copied
 *     unchanged.
 *   - 10xxxxxx is recognised as a byte that can never start a sequence.
 *   - 11xxxxxx starts a multi-byte sequence; the lead-byte pattern
 *     (110xxxxx through 1111110x) is examined, then each of the following
 *     bytes is checked for the 10xxxxxx form and contributes its low six
 *     bits to c.
 *   - The result is gotone; as the comment on that line says, input that
 *     is entirely 7-bit is not reported as UTF-8.
 * The statements that reject input, set the sequence length, and store c
 * are elided in this listing.
 *
 * NOTE(review): the 111110xx and 1111110x lead bytes (5- and 6-byte
 * forms) lie outside UTF-8 as defined by RFC 3629, and no check for
 * overlong encodings or surrogate code points is visible here -- confirm
 * whether stricter validation is wanted.
 */
492 looks_utf8(buf, nbytes, ubuf, ulen)
493 const unsigned char *buf;
504 for (i = 0; i < nbytes; i++) {
505 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
507 * Even if the whole file is valid UTF-8 sequences,
508 * still reject it if it uses weird control characters.
511 if (text_chars[buf[i]] != T)
514 ubuf[(*ulen)++] = buf[i];
515 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
517 } else { /* 11xxxxxx begins UTF-8 */
520 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */
523 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */
526 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */
529 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */
532 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */
538 for (n = 0; n < following; n++) {
543 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
546 c = (c << 6) + (buf[i] & 0x3f);
554 return gotone; /* don't claim it's UTF-8 if it's all 7-bit */
/*
 * looks_unicode: detect UTF-16 text by its byte-order mark and decode it
 * into ubuf, counting 16-bit units in *ulen.
 *
 *   - The first two bytes are compared with FF FE (the little-endian
 *     mark) and then with FE FF (the big-endian mark).
 *   - From offset 2 on, bytes are combined in pairs into one ubuf
 *     element; both byte orders are visible below.
 *   - A decoded value of 0xfffe, and a value below 128 whose text_chars
 *     class is not T, are tested for; the reactions are elided in this
 *     listing.
 *
 * NOTE(review): buf[0] and buf[1] are read at the top; a guard ensuring
 * nbytes >= 2 is not visible in this listing -- confirm that it exists.
 * NOTE(review): every 16-bit unit becomes its own ubuf element, so
 * surrogate pairs (characters above U+FFFF) are not combined; that is
 * what the XXX remark below refers to.
 */
558 looks_unicode(buf, nbytes, ubuf, ulen)
559 const unsigned char *buf;
570 if (buf[0] == 0xff && buf[1] == 0xfe)
572 else if (buf[0] == 0xfe && buf[1] == 0xff)
579 for (i = 2; i + 1 < nbytes; i += 2) {
580 /* XXX fix to properly handle chars > 65536 */
583 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
585 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
587 if (ubuf[*ulen - 1] == 0xfffe)
589 if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T)
602 * This table maps each EBCDIC character to an (8-bit extended) ASCII
603 * character, as specified in the rationale for the dd(1) command in
604 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
606 * Unfortunately it does not seem to correspond exactly to any of the
607 * five variants of EBCDIC documented in IBM's _Enterprise Systems
608 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
609 * Edition, July, 1999, pp. I-1 - I-4.
611 * Fortunately, though, all versions of EBCDIC, including this one, agree
612 * on most of the printing characters that also appear in (7-bit) ASCII.
613 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
615 * Fortunately too, there is general agreement that codes 0x00 through
616 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
617 * remainder printing characters.
619 * This is sufficient to allow us to identify EBCDIC text and to distinguish
620 * between old-style and internationalized examples of text.
/*
 * ebcdic_to_ascii: 256-entry translation table indexed by the EBCDIC byte
 * value; from_ebcdic() reads it to translate a buffer.  Each row covers
 * 16 consecutive codes.  Note that index 0x15 maps to 133 (0x85, the
 * "next line" control) and index 0x25 maps to 10 (line feed).
 *
 * NOTE(review): the table is writable and has external linkage, while the
 * only reader visible in this listing is from_ebcdic() -- consider
 * static const after confirming that no other file references it.
 */
623 unsigned char ebcdic_to_ascii[] = {
624 0, 1, 2, 3, 156, 9, 134, 127, 151, 141, 142, 11, 12, 13, 14, 15,
625 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
626 128, 129, 130, 131, 132, 10, 23, 27, 136, 137, 138, 139, 140, 5, 6, 7,
627 144, 145, 22, 147, 148, 149, 150, 4, 152, 153, 154, 155, 20, 21, 158, 26,
628 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
629 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
630 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
631 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
632 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
633 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
634 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
635 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
636 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
637 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
638 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
639 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
643 * The following EBCDIC-to-ASCII table may relate more closely to reality,
644 * or at least to modern reality. It comes from
646 * http://ftp.s390.ibm.com/products/oe/bpxqp9.html
648 * and maps the characters of EBCDIC code page 1047 (the code used for
649 * Unix-derived software on IBM's 390 systems) to the corresponding
650 * characters from ISO 8859-1.
652 * If this table is used instead of the above one, some of the special
653 * cases for the NEL character can be taken out of the code.
/*
 * ebcdic_1047_to_8859: alternative 256-entry translation table, indexed
 * by the EBCDIC byte value, 16 codes per row.  Compared with
 * ebcdic_to_ascii the two newline-related entries are swapped: index
 * 0x15 maps to 0x0A (line feed) and index 0x25 maps to 0x85.
 *
 * NOTE(review): no reference to this table is visible in this listing,
 * and lines around it are elided -- confirm whether it is compiled in at
 * all before relying on it.
 */
656 unsigned char ebcdic_1047_to_8859[] = {
657 0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
658 0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
659 0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
660 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
661 0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
662 0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
663 0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
664 0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
665 0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
666 0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
667 0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
668 0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
669 0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
670 0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
671 0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
672 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
676 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
/*
 * from_ebcdic: for every index i in 0 .. nbytes-1, store the
 * ebcdic_to_ascii translation of buf[i] in out[i].  out must therefore
 * have room for at least nbytes bytes; buf itself is only read.
 * The remaining parameter declarations and the end of the function are
 * not part of this listing.
 */
679 from_ebcdic(buf, nbytes, out)
680 const unsigned char *buf;
686 for (i = 0; i < nbytes; i++) {
687 out[i] = ebcdic_to_ascii[buf[i]];