]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/file/src/ascmagic.c
MFV r362254: file 5.39.
[FreeBSD/FreeBSD.git] / contrib / file / src / ascmagic.c
1 /*
2  * Copyright (c) Ian F. Darwin 1986-1995.
3  * Software written by Ian F. Darwin and others;
4  * maintained 1995-present by Christos Zoulas and others.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice immediately at the beginning of the file, without modification,
11  *    this list of conditions, and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 /*
29  * ASCII magic -- try to detect text encoding.
30  *
31  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
32  * to handle character codes other than ASCII on a unified basis.
33  */
34
35 #include "file.h"
36
37 #ifndef lint
38 FILE_RCSID("@(#)$File: ascmagic.c,v 1.107 2020/06/08 19:58:36 christos Exp $")
39 #endif  /* lint */
40
41 #include "magic.h"
42 #include <string.h>
43 #include <ctype.h>
44 #include <stdlib.h>
45 #ifdef HAVE_UNISTD_H
46 #include <unistd.h>
47 #endif
48
/*
 * Threshold used by file_ascmagic_with_encoding(): once more than this many
 * characters have gone by since the last line terminator, the text is
 * reported as having "very long lines".
 */
#define MAXLINELEN 300  /* longest sane line length */
/*
 * True when x is a space, tab, CR, LF, form feed or 0x85 (the value this
 * file treats as the "next line" character).  The argument is evaluated
 * several times, so it must not have side effects.
 * NOTE(review): not referenced in the part of the file visible here.
 */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
                  || (x) == 0x85 || (x) == '\f')

/* Forward declarations of the file-local helpers defined further down. */
private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
private size_t trim_nuls(const unsigned char *, size_t);
55
56 /*
57  * Undo the NUL-termination kindly provided by process()
58  * but leave at least one byte to look at
59  */
60 private size_t
61 trim_nuls(const unsigned char *buf, size_t nbytes)
62 {
63         while (nbytes > 1 && buf[nbytes - 1] == '\0')
64                 nbytes--;
65
66         return nbytes;
67 }
68
69 protected int
70 file_ascmagic(struct magic_set *ms, const struct buffer *b, int text)
71 {
72         unichar *ubuf = NULL;
73         size_t ulen = 0;
74         int rv = 1;
75         struct buffer bb;
76
77         const char *code = NULL;
78         const char *code_mime = NULL;
79         const char *type = NULL;
80
81         bb = *b;
82         bb.flen = trim_nuls(CAST(const unsigned char *, b->fbuf), b->flen);
83         /*
84          * Avoid trimming at an odd byte if the original buffer was evenly
85          * sized; this avoids losing the last character on UTF-16 LE text
86          */
87         if ((bb.flen & 1) && !(b->flen & 1))
88                 bb.flen++;
89
90         /* If file doesn't look like any sort of text, give up. */
91         if (file_encoding(ms, &bb, &ubuf, &ulen, &code, &code_mime,
92             &type) == 0)
93                 rv = 0;
94         else
95                 rv = file_ascmagic_with_encoding(ms, &bb,
96                     ubuf, ulen, code, type, text);
97
98         free(ubuf);
99
100         return rv;
101 }
102
103 protected int
104 file_ascmagic_with_encoding(struct magic_set *ms,
105     const struct buffer *b, unichar *ubuf, size_t ulen, const char *code,
106     const char *type, int text)
107 {
108         struct buffer bb;
109         const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
110         size_t nbytes = b->flen;
111         unsigned char *utf8_buf = NULL, *utf8_end;
112         size_t mlen, i, len;
113         int rv = -1;
114         int mime = ms->flags & MAGIC_MIME;
115         int need_separator = 0;
116
117         const char *subtype = NULL;
118
119         int has_escapes = 0;
120         int has_backspace = 0;
121         int seen_cr = 0;
122
123         int n_crlf = 0;
124         int n_lf = 0;
125         int n_cr = 0;
126         int n_nel = 0;
127         int executable = 0;
128
129         size_t last_line_end = CAST(size_t, -1);
130         int has_long_lines = 0;
131
132         nbytes = trim_nuls(buf, nbytes);
133
134         /* If we have fewer than 2 bytes, give up. */
135         if (nbytes <= 1) {
136                 rv = 0;
137                 goto done;
138         }
139
140         if (ulen > 0 && (ms->flags & MAGIC_NO_CHECK_SOFT) == 0) {
141                 /* Convert ubuf to UTF-8 and try text soft magic */
142                 /* malloc size is a conservative overestimate; could be
143                    improved, or at least realloced after conversion. */
144                 mlen = ulen * 6;
145                 if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
146                         file_oomem(ms, mlen);
147                         goto done;
148                 }
149                 if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))
150                     == NULL)
151                         goto done;
152                 buffer_init(&bb, b->fd, &b->st, utf8_buf,
153                     CAST(size_t, utf8_end - utf8_buf));
154
155                 if ((rv = file_softmagic(ms, &bb, NULL, NULL,
156                     TEXTTEST, text)) == 0)
157                         rv = -1;
158                 else
159                         need_separator = 1;
160                 buffer_fini(&bb);
161                 if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) {
162                         rv = rv == -1 ? 0 : 1;
163                         goto done;
164                 }
165         }
166
167         if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) {
168                 rv = 0;
169                 goto done;
170         }
171
172         /* Now try to discover other details about the file. */
173         for (i = 0; i < ulen; i++) {
174                 if (ubuf[i] == '\n') {
175                         if (seen_cr)
176                                 n_crlf++;
177                         else
178                                 n_lf++;
179                         last_line_end = i;
180                 } else if (seen_cr)
181                         n_cr++;
182
183                 seen_cr = (ubuf[i] == '\r');
184                 if (seen_cr)
185                         last_line_end = i;
186
187                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
188                         n_nel++;
189                         last_line_end = i;
190                 }
191
192                 /* If this line is _longer_ than MAXLINELEN, remember it. */
193                 if (i > last_line_end + MAXLINELEN)
194                         has_long_lines = 1;
195
196                 if (ubuf[i] == '\033')
197                         has_escapes = 1;
198                 if (ubuf[i] == '\b')
199                         has_backspace = 1;
200         }
201
202         /* Beware, if the data has been truncated, the final CR could have
203            been followed by a LF.  If we have ms->bytes_max bytes, it indicates
204            that the data might have been truncated, probably even before
205            this function was called. */
206         if (seen_cr && nbytes < ms->bytes_max)
207                 n_cr++;
208
209         if (strcmp(type, "binary") == 0) {
210                 rv = 0;
211                 goto done;
212         }
213         len = file_printedlen(ms);
214         if (mime) {
215                 if ((mime & MAGIC_MIME_TYPE) != 0) {
216                         if (len) {
217                                 /*
218                                  * Softmagic printed something, we
219                                  * are either done, or we need a separator
220                                  */
221                                 if ((ms->flags & MAGIC_CONTINUE) == 0) {
222                                         rv = 1;
223                                         goto done;
224                                 }
225                                 if (need_separator && file_separator(ms) == -1)
226                                         goto done;
227                         } else {
228                                 if (file_printf(ms, "text/plain") == -1)
229                                         goto done;
230                         }
231                 }
232         } else {
233                 if (len) {
234                         switch (file_replace(ms, " text$", ", ")) {
235                         case 0:
236                                 switch (file_replace(ms, " text executable$",
237                                     ", ")) {
238                                 case 0:
239                                         if (file_printf(ms, ", ") == -1)
240                                                 goto done;
241                                         break;
242                                 case -1:
243                                         goto done;
244                                 default:
245                                         executable = 1;
246                                         break;
247                                 }
248                                 break;
249                         case -1:
250                                 goto done;
251                         default:
252                                 break;
253                         }
254                 }
255
256                 if (file_printf(ms, "%s", code) == -1)
257                         goto done;
258
259                 if (subtype) {
260                         if (file_printf(ms, " %s", subtype) == -1)
261                                 goto done;
262                 }
263
264                 if (file_printf(ms, " %s", type) == -1)
265                         goto done;
266
267                 if (executable)
268                         if (file_printf(ms, " executable") == -1)
269                                 goto done;
270
271                 if (has_long_lines)
272                         if (file_printf(ms, ", with very long lines") == -1)
273                                 goto done;
274
275                 /*
276                  * Only report line terminators if we find one other than LF,
277                  * or if we find none at all.
278                  */
279                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
280                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
281                         if (file_printf(ms, ", with") == -1)
282                                 goto done;
283
284                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
285                                 if (file_printf(ms, " no") == -1)
286                                         goto done;
287                         } else {
288                                 if (n_crlf) {
289                                         if (file_printf(ms, " CRLF") == -1)
290                                                 goto done;
291                                         if (n_cr || n_lf || n_nel)
292                                                 if (file_printf(ms, ",") == -1)
293                                                         goto done;
294                                 }
295                                 if (n_cr) {
296                                         if (file_printf(ms, " CR") == -1)
297                                                 goto done;
298                                         if (n_lf || n_nel)
299                                                 if (file_printf(ms, ",") == -1)
300                                                         goto done;
301                                 }
302                                 if (n_lf) {
303                                         if (file_printf(ms, " LF") == -1)
304                                                 goto done;
305                                         if (n_nel)
306                                                 if (file_printf(ms, ",") == -1)
307                                                         goto done;
308                                 }
309                                 if (n_nel)
310                                         if (file_printf(ms, " NEL") == -1)
311                                                 goto done;
312                         }
313
314                         if (file_printf(ms, " line terminators") == -1)
315                                 goto done;
316                 }
317
318                 if (has_escapes)
319                         if (file_printf(ms, ", with escape sequences") == -1)
320                                 goto done;
321                 if (has_backspace)
322                         if (file_printf(ms, ", with overstriking") == -1)
323                                 goto done;
324         }
325         rv = 1;
326 done:
327         free(utf8_buf);
328
329         return rv;
330 }
331
332 /*
333  * Encode Unicode string as UTF-8, returning pointer to character
334  * after end of string, or NULL if an invalid character is found.
335  */
336 private unsigned char *
337 encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
338 {
339         size_t i;
340         unsigned char *end = buf + len;
341
342         for (i = 0; i < ulen; i++) {
343                 if (ubuf[i] <= 0x7f) {
344                         if (end - buf < 1)
345                                 return NULL;
346                         *buf++ = CAST(unsigned char, ubuf[i]);
347                 } else if (ubuf[i] <= 0x7ff) {
348                         if (end - buf < 2)
349                                 return NULL;
350                         *buf++ = CAST(unsigned char, (ubuf[i] >> 6) + 0xc0);
351                         *buf++ = CAST(unsigned char, (ubuf[i] & 0x3f) + 0x80);
352                 } else if (ubuf[i] <= 0xffff) {
353                         if (end - buf < 3)
354                                 return NULL;
355                         *buf++ = CAST(unsigned char, (ubuf[i] >> 12) + 0xe0);
356                         *buf++ = CAST(unsigned char, ((ubuf[i] >> 6) & 0x3f) + 0x80);
357                         *buf++ = CAST(unsigned char, (ubuf[i] & 0x3f) + 0x80);
358                 } else if (ubuf[i] <= 0x1fffff) {
359                         if (end - buf < 4)
360                                 return NULL;
361                         *buf++ = CAST(unsigned char, (ubuf[i] >> 18) + 0xf0);
362                         *buf++ = CAST(unsigned char, ((ubuf[i] >> 12) & 0x3f) + 0x80);
363                         *buf++ = CAST(unsigned char, ((ubuf[i] >>  6) & 0x3f) + 0x80);
364                         *buf++ = CAST(unsigned char, (ubuf[i] & 0x3f) + 0x80);
365                 } else if (ubuf[i] <= 0x3ffffff) {
366                         if (end - buf < 5)
367                                 return NULL;
368                         *buf++ = CAST(unsigned char, (ubuf[i] >> 24) + 0xf8);
369                         *buf++ = CAST(unsigned char, ((ubuf[i] >> 18) & 0x3f) + 0x80);
370                         *buf++ = CAST(unsigned char, ((ubuf[i] >> 12) & 0x3f) + 0x80);
371                         *buf++ = CAST(unsigned char, ((ubuf[i] >>  6) & 0x3f) + 0x80);
372                         *buf++ = CAST(unsigned char, (ubuf[i] & 0x3f) + 0x80);
373                 } else if (ubuf[i] <= 0x7fffffff) {
374                         if (end - buf < 6)
375                                 return NULL;
376                         *buf++ = CAST(unsigned char, (ubuf[i] >> 30) + 0xfc);
377                         *buf++ = CAST(unsigned char, ((ubuf[i] >> 24) & 0x3f) + 0x80);
378                         *buf++ = CAST(unsigned char, ((ubuf[i] >> 18) & 0x3f) + 0x80);
379                         *buf++ = CAST(unsigned char, ((ubuf[i] >> 12) & 0x3f) + 0x80);
380                         *buf++ = CAST(unsigned char, ((ubuf[i] >>  6) & 0x3f) + 0x80);
381                         *buf++ = CAST(unsigned char, (ubuf[i] & 0x3f) + 0x80);
382                 } else /* Invalid character */
383                         return NULL;
384         }
385
386         return buf;
387 }