2 * ASCII magic -- file types that we know based on keywords
3 * that can appear anywhere in the file.
5 * Copyright (c) Ian F. Darwin, 1987.
6 * Written by Ian F. Darwin.
8 * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
9 * to handle character codes other than ASCII on a unified basis.
11 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
12 * international characters, now subsumed into this file.
16 * This software is not subject to any license of the American Telephone
17 * and Telegraph Company or of the Regents of the University of California.
19 * Permission is granted to anyone to use this software for any purpose on
20 * any computer system, and to alter it and redistribute it freely, subject
21 * to the following restrictions:
23 * 1. The author is not responsible for the consequences of use of this
24 * software, no matter how awful, even if they arise from flaws in it.
26 * 2. The origin of this software must not be misrepresented, either by
27 * explicit claim or by omission. Since few users ever read sources,
28 * credits must appear in the documentation.
30 * 3. Altered versions must be plainly marked as such, and must not be
31 * misrepresented as being the original software. Since few users
32 * ever read sources, credits must appear in the documentation.
34 * 4. This notice may not be removed or altered.
48 FILE_RCSID("@(#)$Id: ascmagic.c,v 1.32 2002/07/03 18:26:37 christos Exp $")
/* One decoded character; the looks_*() routines store one per element. */
typedef unsigned long unichar;

#define MAXLINELEN 300	/* longest sane line length */

/*
 * True when x is one of the characters this file treats as white space:
 * space, tab, CR, LF, form feed, or the X3.64/ECMA-43 "next line"
 * control (0x85).  The argument is evaluated several times, so pass
 * only expressions without side effects.
 */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
		  || (x) == 0x85 || (x) == '\f')

/*
 * Encoding detectors: each fills ubuf/ulen from buf[0 .. nbytes-1] and
 * returns non-zero when the data fits that encoding.
 */
static int looks_ascii(const unsigned char *buf, int nbytes,
    unichar *ubuf, int *ulen);
static int looks_utf8(const unsigned char *buf, int nbytes,
    unichar *ubuf, int *ulen);
static int looks_unicode(const unsigned char *buf, int nbytes,
    unichar *ubuf, int *ulen);
static int looks_latin1(const unsigned char *buf, int nbytes,
    unichar *ubuf, int *ulen);
static int looks_extended(const unsigned char *buf, int nbytes,
    unichar *ubuf, int *ulen);

/* Translate nbytes of EBCDIC from buf into out. */
static void from_ebcdic(const unsigned char *buf, int nbytes,
    unsigned char *out);

/* Compare the NUL-terminated token s with the ulen characters at us. */
static int ascmatch(const unsigned char *s, const unichar *us, int ulen);
65 /* int nbytes: size actually read */
67 ascmagic(unsigned char *buf, int nbytes)
70 char nbuf[HOWMANY+1]; /* one extra for terminating '\0' */
71 unichar ubuf[HOWMANY+1]; /* one extra for terminating '\0' */
76 char *code_mime = NULL;
79 char *subtype_mime = NULL;
82 int has_backspace = 0;
89 int last_line_end = -1;
90 int has_long_lines = 0;
93 * Do the tar test first, because if the first file in the tar
94 * archive starts with a dot, we can confuse it with an nroff file.
96 switch (is_tar(buf, nbytes)) {
98 ckfputs(iflag ? "application/x-tar" : "tar archive", stdout);
101 ckfputs(iflag ? "application/x-tar, POSIX"
102 : "POSIX tar archive", stdout);
107 * Undo the NUL-termination kindly provided by process()
108 * but leave at least one byte to look at
111 while (nbytes > 1 && buf[nbytes - 1] == '\0')
115 * Then try to determine whether it's any character code we can
116 * identify. Each of these tests, if it succeeds, will leave
117 * the text converted into one-unichar-per-character Unicode in
118 * ubuf, and the number of characters converted in ulen.
120 if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
122 code_mime = "us-ascii";
124 } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
125 code = "UTF-8 Unicode";
128 } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen))) {
130 code = "Little-endian UTF-16 Unicode";
132 code = "Big-endian UTF-16 Unicode";
134 type = "character data";
135 code_mime = "utf-16"; /* is this defined? */
136 } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
139 code_mime = "iso-8859-1";
140 } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
141 code = "Non-ISO extended-ASCII";
143 code_mime = "unknown";
145 from_ebcdic(buf, nbytes, nbuf);
147 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
149 type = "character data";
150 code_mime = "ebcdic";
151 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
152 code = "International EBCDIC";
153 type = "character data";
154 code_mime = "ebcdic";
156 return 0; /* doesn't look like text at all */
161 * for troff, look for . + letter + letter or .\";
162 * this must be done to disambiguate tar archives' ./file
163 * and other trash from real troff input.
165 * I believe Plan 9 troff allows non-ASCII characters in the names
166 * of macros, so this test might possibly fail on such a file.
169 unichar *tp = ubuf + 1;
172 ++tp; /* skip leading whitespace */
173 if ((tp[0] == '\\' && tp[1] == '\"') ||
174 (isascii(tp[0]) && isalnum(tp[0]) &&
175 isascii(tp[1]) && isalnum(tp[1]) &&
177 subtype_mime = "text/troff";
178 subtype = "troff or preprocessor input";
179 goto subtype_identified;
183 if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
184 subtype_mime = "text/fortran";
185 subtype = "fortran program";
186 goto subtype_identified;
189 /* look for tokens from names.h - this is expensive! */
196 * skip past any leading space
198 while (i < ulen && ISSPC(ubuf[i]))
204 * find the next whitespace
206 for (end = i + 1; end < nbytes; end++)
207 if (ISSPC(ubuf[end]))
211 * compare the word thus isolated against the token list
213 for (p = names; p < names + NNAMES; p++) {
214 if (ascmatch(p->name, ubuf + i, end - i)) {
215 subtype = types[p->type].human;
216 subtype_mime = types[p->type].mime;
217 goto subtype_identified;
227 * Now try to discover other details about the file.
229 for (i = 0; i < ulen; i++) {
230 if (i > last_line_end + MAXLINELEN)
233 if (ubuf[i] == '\033')
238 if (ubuf[i] == '\r' && (i + 1 < ulen && ubuf[i + 1] == '\n')) {
242 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
246 if (ubuf[i] == '\n' && (i - 1 < 0 || ubuf[i - 1] != '\r')) {
250 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
258 ckfputs(subtype_mime, stdout);
260 ckfputs("text/plain", stdout);
263 ckfputs("; charset=", stdout);
264 ckfputs(code_mime, stdout);
267 ckfputs(code, stdout);
270 ckfputs(" ", stdout);
271 ckfputs(subtype, stdout);
274 ckfputs(" ", stdout);
275 ckfputs(type, stdout);
278 ckfputs(", with very long lines", stdout);
281 * Only report line terminators if we find one other than LF,
282 * or if we find none at all.
284 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
285 (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
286 ckfputs(", with", stdout);
288 if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)
289 ckfputs(" no", stdout);
292 ckfputs(" CRLF", stdout);
293 if (n_cr || n_lf || n_nel)
294 ckfputs(",", stdout);
297 ckfputs(" CR", stdout);
299 ckfputs(",", stdout);
302 ckfputs(" LF", stdout);
304 ckfputs(",", stdout);
307 ckfputs(" NEL", stdout);
310 ckfputs(" line terminators", stdout);
314 ckfputs(", with escape sequences", stdout);
316 ckfputs(", with overstriking", stdout);
323 ascmatch(const unsigned char *s, const unichar *us, int ulen)
327 for (i = 0; i < ulen; i++) {
/*
 * This table reflects a particular philosophy about what constitutes
 * "text," and there is room for disagreement about it.
 *
 * Version 3.31 of the file command considered a file to be ASCII if
 * each of its characters was approved by either the isascii() or
 * isalpha() function.  On most systems, this would mean that any
 * file consisting only of characters in the range 0x00 ... 0x7F
 * would be called ASCII text, but many systems might reasonably
 * consider some characters outside this range to be alphabetic,
 * so the file command would call such characters ASCII.  It might
 * have been more accurate to call this "considered textual on the
 * local system" than "ASCII."
 *
 * It considered a file to be "International language text" if each
 * of its characters was either an ASCII printing character (according
 * to the real ASCII standard, not the above test), a character in
 * the range 0x80 ... 0xFF, or one of the following control characters:
 * backspace, tab, line feed, vertical tab, form feed, carriage return,
 * escape.  No attempt was made to determine the language in which files
 * of this type were written.
 *
 * The table below considers a file to be ASCII if all of its characters
 * are either ASCII printing characters (again, according to the X3.4
 * standard, not isascii()) or any of the following controls: bell,
 * backspace, tab, line feed, form feed, carriage return, esc, nextline.
 *
 * I include bell because some programs (particularly shell scripts)
 * use it literally, even though it is rare in normal text.  I exclude
 * vertical tab because it never seems to be used in real text.  I also
 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
 * character to.  It might be more appropriate to include it in the 8859
 * set instead of the ASCII set, but it's got to be included in *something*
 * we recognize or EBCDIC files aren't going to be considered textual.
 * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
 * and Latin characters, so these should possibly be allowed.  But they
 * make a real mess on VT100-style displays if they're not paired properly,
 * so we are probably better off not calling them text.
 *
 * A file is considered to be ISO-8859 text if its characters are all
 * either ASCII, according to the above definition, or printing characters
 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
 *
 * Finally, a file is considered to be international text from some other
 * character code if its characters are all either ISO-8859 (according to
 * the above definition) or characters in the range 0x80 ... 0x9F, which
 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
 * consider to be printing characters.
 */
/* Classification codes stored in text_chars[]. */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value, indexed by the byte itself.
 * Each row covers sixteen consecutive codes.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
418 looks_ascii(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
424 for (i = 0; i < nbytes; i++) {
425 int t = text_chars[buf[i]];
430 ubuf[(*ulen)++] = buf[i];
437 looks_latin1(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
443 for (i = 0; i < nbytes; i++) {
444 int t = text_chars[buf[i]];
446 if (t != T && t != I)
449 ubuf[(*ulen)++] = buf[i];
456 looks_extended(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
462 for (i = 0; i < nbytes; i++) {
463 int t = text_chars[buf[i]];
465 if (t != T && t != I && t != X)
468 ubuf[(*ulen)++] = buf[i];
475 looks_utf8(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
483 for (i = 0; i < nbytes; i++) {
484 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
486 * Even if the whole file is valid UTF-8 sequences,
487 * still reject it if it uses weird control characters.
490 if (text_chars[buf[i]] != T)
493 ubuf[(*ulen)++] = buf[i];
494 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
496 } else { /* 11xxxxxx begins UTF-8 */
499 if ((buf[i] & 0x20) == 0) { /* 110xxxxx */
502 } else if ((buf[i] & 0x10) == 0) { /* 1110xxxx */
505 } else if ((buf[i] & 0x08) == 0) { /* 11110xxx */
508 } else if ((buf[i] & 0x04) == 0) { /* 111110xx */
511 } else if ((buf[i] & 0x02) == 0) { /* 1111110x */
517 for (n = 0; n < following; n++) {
522 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
525 c = (c << 6) + (buf[i] & 0x3f);
533 return gotone; /* don't claim it's UTF-8 if it's all 7-bit */
537 looks_unicode(const unsigned char *buf, int nbytes, unichar *ubuf, int *ulen)
545 if (buf[0] == 0xff && buf[1] == 0xfe)
547 else if (buf[0] == 0xfe && buf[1] == 0xff)
554 for (i = 2; i + 1 < nbytes; i += 2) {
555 /* XXX fix to properly handle chars > 65536 */
558 ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
560 ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
562 if (ubuf[*ulen - 1] == 0xfffe)
564 if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T)
/*
 * This table maps each EBCDIC character to an (8-bit extended) ASCII
 * character, as specified in the rationale for the dd(1) command in
 * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
 *
 * Unfortunately it does not seem to correspond exactly to any of the
 * five variants of EBCDIC documented in IBM's _Enterprise Systems
 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
 * Edition, July, 1999, pp. I-1 - I-4.
 *
 * Fortunately, though, all versions of EBCDIC, including this one, agree
 * on most of the printing characters that also appear in (7-bit) ASCII.
 * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
 *
 * Fortunately too, there is general agreement that codes 0x00 through
 * 0x3F represent control characters, 0x41 a nonbreaking space, and the
 * remainder printing characters.
 *
 * This is sufficient to allow us to identify EBCDIC text and to distinguish
 * between old-style and internationalized examples of text.
 */
/*
 * EBCDIC to (8-bit extended) ASCII translation, indexed by the EBCDIC
 * code.  Each row covers sixteen consecutive codes.
 */
unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};
/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */
/*
 * EBCDIC code page 1047 to ISO 8859-1 translation, indexed by the
 * EBCDIC code.  Each row covers sixteen consecutive codes.
 */
unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
651 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.
654 from_ebcdic(const unsigned char *buf, int nbytes, unsigned char *out)
658 for (i = 0; i < nbytes; i++) {
659 out[i] = ebcdic_to_ascii[buf[i]];