1 /* $Id: read.c,v 1.173 2017/06/08 00:23:30 schwarze Exp $ */
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2010-2017 Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
21 #include <sys/types.h>
39 #include "mandoc_aux.h"
44 #include "libmandoc.h"
47 #define REPARSE_LIMIT 1000
50 struct roff *roff; /* roff parser (!NULL) */
51 struct roff_man *man; /* man parser */
52 char *sodest; /* filename pointed to by .so */
53 const char *file; /* filename of current input file */
54 struct buf *primary; /* buffer currently being parsed */
55 struct buf *secondary; /* preprocessed copy of input */
56 const char *defos; /* default operating system */
57 mandocmsg mmsg; /* warning/error message handler */
58 enum mandoclevel file_status; /* status of current parse */
59 enum mandoclevel wlevel; /* ignore messages below this */
60 int options; /* parser options */
61 int gzip; /* current input file is gzipped */
62 int filenc; /* encoding of the current file */
63 int reparse_count; /* finite interp. stack */
64 int line; /* line number in the file */
67 static void choose_parser(struct mparse *);
68 static void resize_buf(struct buf *, size_t);
69 static int mparse_buf_r(struct mparse *, struct buf, size_t, int);
70 static int read_whole_file(struct mparse *, const char *, int,
72 static void mparse_end(struct mparse *);
73 static void mparse_parse_buffer(struct mparse *, struct buf,
76 static const enum mandocerr mandoclimits[MANDOCLEVEL_MAX] = {
86 static const char * const mandocerrs[MANDOCERR_MAX] = {
89 "generic style suggestion",
92 "consider using OS macro",
93 "errnos out of order",
95 "description line ends with a full stop",
99 /* related to the prologue */
100 "missing manual title, using UNTITLED",
101 "missing manual title, using \"\"",
102 "lower case character in document title",
103 "missing manual section, using \"\"",
104 "unknown manual section",
105 "missing date, using today's date",
106 "cannot parse date, using it verbatim",
107 "missing Os macro, using \"\"",
108 "duplicate prologue macro",
109 "late prologue macro",
110 "skipping late title macro",
111 "prologue macros out of order",
113 /* related to document structure */
114 ".so is fragile, better use ln(1)",
116 "content before first section header",
117 "first section is not \"NAME\"",
118 "NAME section without Nm before Nd",
119 "NAME section without description",
120 "description not at the end of NAME",
121 "bad NAME section content",
122 "missing comma before name",
123 "missing description line, using \"\"",
124 "description line outside NAME section",
125 "sections out of conventional order",
126 "duplicate section title",
127 "unexpected section",
129 "unusual Xr punctuation",
130 "AUTHORS section without An macro",
132 /* related to macros and nesting */
134 "macro neither callable nor escaped",
135 "skipping paragraph macro",
136 "moving paragraph macro out of list",
137 "skipping no-space macro",
138 "blocks badly nested",
139 "nested displays are not portable",
140 "moving content out of list",
141 "fill mode already enabled, skipping",
142 "fill mode already disabled, skipping",
144 "skipping blank line in line scope",
146 /* related to missing macro arguments */
147 "skipping empty request",
148 "conditional request controls empty scope",
149 "skipping empty macro",
151 "empty argument, using 0n",
152 "missing display type, using -ragged",
153 "list type is not the first argument",
154 "missing -width in -tag list, using 6n",
155 "missing utility name, using \"\"",
156 "missing function name, using \"\"",
157 "empty head in list item",
159 "missing font type, using \\fR",
160 "unknown font type, using \\fR",
161 "nothing follows prefix",
162 "empty reference block",
163 "missing section argument",
164 "missing -std argument, adding it",
165 "missing option string, using \"\"",
166 "missing resource identifier, using \"\"",
167 "missing eqn box, using \"\"",
169 /* related to bad macro arguments */
170 "unterminated quoted argument",
171 "duplicate argument",
172 "skipping duplicate argument",
173 "skipping duplicate display type",
174 "skipping duplicate list type",
175 "skipping -width argument",
176 "wrong number of cells",
177 "unknown AT&T UNIX version",
178 "comma in function argument",
179 "parenthesis in function name",
180 "unknown library name",
181 "invalid content in Rs block",
182 "invalid Boolean argument",
183 "unknown font, skipping request",
184 "odd number of characters in request",
186 /* related to plain text */
187 "blank line in fill mode, using .sp",
188 "tab in filled text",
189 "whitespace at end of input line",
190 "new sentence, new line",
192 "invalid escape sequence",
193 "undefined string, using \"\"",
195 /* related to tables */
196 "tbl line starts with span",
197 "tbl column starts with span",
198 "skipping vertical bar in tbl layout",
202 /* related to tables */
203 "non-alphabetic character in tbl options",
204 "skipping unknown tbl option",
205 "missing tbl option argument",
206 "wrong tbl option argument size",
208 "invalid character in tbl layout",
209 "unmatched parenthesis in tbl layout",
210 "tbl without any data cells",
211 "ignoring data in spanned tbl cell",
212 "ignoring extra tbl data cells",
213 "data block open at end of tbl",
215 /* related to document structure and macros */
217 "input stack limit exceeded, infinite loop?",
218 "skipping bad character",
219 "skipping unknown macro",
220 "skipping insecure request",
221 "skipping item outside list",
222 "skipping column outside column list",
223 "skipping end of block that is not open",
224 "fewer RS blocks open, skipping",
225 "inserting missing end of block",
226 "appending missing end of block",
228 /* related to request and macro arguments */
229 "escaped character not allowed in a name",
230 "NOT IMPLEMENTED: Bd -file",
231 "skipping display without arguments",
232 "missing list type, using -item",
233 "argument is not numeric, using 1",
234 "missing manual name, using \"\"",
235 "uname(3) system call failed, using UNKNOWN",
236 "unknown standard specifier",
237 "skipping request without numeric argument",
238 "NOT IMPLEMENTED: .so with absolute path or \"..\"",
239 ".so request failed",
240 "skipping all arguments",
241 "skipping excess arguments",
244 "unsupported feature",
246 "unsupported control character",
247 "unsupported roff request",
248 "eqn delim option in tbl",
249 "unsupported tbl layout modifier",
250 "ignoring macro in table",
253 static const char * const mandoclevels[MANDOCLEVEL_MAX] = {
265 resize_buf(struct buf *buf, size_t initial)
268 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
269 buf->buf = mandoc_realloc(buf->buf, buf->sz);
273 choose_parser(struct mparse *curp)
279 * If neither command line arguments -mdoc or -man select
280 * a parser nor the roff parser found a .Dd or .TH macro
281 * yet, look ahead in the main input buffer.
284 if ((format = roff_getformat(curp->roff)) == 0) {
285 cp = curp->primary->buf;
286 ep = cp + curp->primary->sz;
288 if (*cp == '.' || *cp == '\'') {
290 if (cp[0] == 'D' && cp[1] == 'd') {
291 format = MPARSE_MDOC;
294 if (cp[0] == 'T' && cp[1] == 'H') {
299 cp = memchr(cp, '\n', ep - cp);
306 if (format == MPARSE_MDOC) {
307 curp->man->macroset = MACROSET_MDOC;
308 if (curp->man->mdocmac == NULL)
309 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
311 curp->man->macroset = MACROSET_MAN;
312 if (curp->man->manmac == NULL)
313 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
315 curp->man->first->tok = TOKEN_NONE;
319 * Main parse routine for a buffer.
320 * It assumes encoding and line numbering are already set up.
321 * It can recurse directly (for invocations of user-defined
322 * macros, inline equations, and input line traps)
323 * and indirectly (for .so file inclusion).
326 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
328 const struct tbl_span *span;
330 const char *save_file;
332 size_t pos; /* byte number in the ln buffer */
335 int lnn; /* line number in the real file */
339 memset(&ln, 0, sizeof(ln));
345 if (0 == pos && '\0' == blk.buf[i])
350 curp->reparse_count = 0;
353 curp->filenc & MPARSE_UTF8 &&
354 curp->filenc & MPARSE_LATIN1)
355 curp->filenc = preconv_cue(&blk, i);
358 while (i < blk.sz && (start || blk.buf[i] != '\0')) {
361 * When finding an unescaped newline character,
362 * leave the character loop to process the line.
363 * Skip a preceding carriage return, if any.
366 if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
367 '\n' == blk.buf[i + 1])
369 if ('\n' == blk.buf[i]) {
376 * Make sure we have space for the worst
377 * case of 11 bytes: "\\[u10ffff]\0"
380 if (pos + 11 > ln.sz)
381 resize_buf(&ln, 256);
384 * Encode 8-bit input.
389 if ( ! (curp->filenc && preconv_encode(
390 &blk, &i, &ln, &pos, &curp->filenc))) {
391 mandoc_vmsg(MANDOCERR_CHAR_BAD, curp,
392 curp->line, pos, "0x%x", c);
400 * Exclude control characters.
403 if (c == 0x7f || (c < 0x20 && c != 0x09)) {
404 mandoc_vmsg(c == 0x00 || c == 0x04 ||
405 c > 0x0a ? MANDOCERR_CHAR_BAD :
406 MANDOCERR_CHAR_UNSUPP,
407 curp, curp->line, pos, "0x%x", c);
414 ln.buf[pos++] = blk.buf[i++];
417 if (pos + 1 >= ln.sz)
418 resize_buf(&ln, 256);
420 if (i == blk.sz || blk.buf[i] == '\0')
421 ln.buf[pos++] = '\n';
425 * A significant amount of complexity is contained by
426 * the roff preprocessor. It's line-oriented but can be
427 * expressed on one line, so we need at times to
428 * readjust our starting point and re-run it. The roff
429 * preprocessor can also readjust the buffers with new
430 * data, so we pass them in wholesale.
436 * Maintain a lookaside buffer of all parsed lines. We
437 * only do this if mparse_keep() has been invoked (the
438 * buffer may be accessed with mparse_getkeep()).
441 if (curp->secondary) {
442 curp->secondary->buf = mandoc_realloc(
443 curp->secondary->buf,
444 curp->secondary->sz + pos + 2);
445 memcpy(curp->secondary->buf +
448 curp->secondary->sz += pos;
450 [curp->secondary->sz] = '\n';
451 curp->secondary->sz++;
453 [curp->secondary->sz] = '\0';
456 rr = roff_parseln(curp->roff, curp->line, &ln, &of);
460 if (++curp->reparse_count > REPARSE_LIMIT)
461 mandoc_msg(MANDOCERR_ROFFLOOP, curp,
462 curp->line, pos, NULL);
463 else if (mparse_buf_r(curp, ln, of, 0) == 1 ||
471 pos = strlen(ln.buf);
479 if ( ! (curp->options & MPARSE_SO) &&
480 (i >= blk.sz || blk.buf[i] == '\0')) {
481 curp->sodest = mandoc_strdup(ln.buf + of);
486 * We remove `so' clauses from our lookaside
487 * buffer because we're going to descend into
488 * the file recursively.
491 curp->secondary->sz -= pos + 1;
492 save_file = curp->file;
493 if ((fd = mparse_open(curp, ln.buf + of)) != -1) {
494 mparse_readfd(curp, fd, ln.buf + of);
496 curp->file = save_file;
498 curp->file = save_file;
499 mandoc_vmsg(MANDOCERR_SO_FAIL,
500 curp, curp->line, pos,
501 ".so %s", ln.buf + of);
502 ln.sz = mandoc_asprintf(&cp,
503 ".sp\nSee the file %s.\n.sp",
508 mparse_buf_r(curp, ln, of, 0);
516 if (curp->man->macroset == MACROSET_NONE)
520 * Lastly, push down into the parsers themselves.
521 * If libroff returns ROFF_TBL, then add it to the
522 * currently open parse. Since we only get here if
523 * there does exist data (see tbl_data.c), we're
524 * guaranteed that something's been allocated.
525 * Do the same for ROFF_EQN.
529 while ((span = roff_span(curp->roff)) != NULL)
530 roff_addtbl(curp->man, span);
531 else if (rr == ROFF_EQN)
532 roff_addeqn(curp->man, roff_eqn(curp->roff));
533 else if ((curp->man->macroset == MACROSET_MDOC ?
534 mdoc_parseln(curp->man, curp->line, ln.buf, of) :
535 man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
538 /* Temporary buffers typically are not full. */
540 if (0 == start && '\0' == blk.buf[i])
543 /* Start the next input line. */
553 read_whole_file(struct mparse *curp, const char *file, int fd,
554 struct buf *fb, int *with_mmap)
561 if (fstat(fd, &st) == -1)
562 err((int)MANDOCLEVEL_SYSERR, "%s", file);
565 * If we're a regular file, try just reading in the whole entry
566 * via mmap(). This is faster than reading it into blocks, and
567 * since each file is only a few bytes to begin with, I'm not
568 * concerned that this is going to tank any machines.
571 if (curp->gzip == 0 && S_ISREG(st.st_mode)) {
572 if (st.st_size > 0x7fffffff) {
573 mandoc_msg(MANDOCERR_TOOLARGE, curp, 0, 0, NULL);
577 fb->sz = (size_t)st.st_size;
578 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
579 if (fb->buf != MAP_FAILED)
584 if ((gz = gzdopen(fd, "rb")) == NULL)
585 err((int)MANDOCLEVEL_SYSERR, "%s", file);
590 * If this isn't a regular file (like, say, stdin), then we must
591 * go the old way and just read things in bit by bit.
600 if (fb->sz == (1U << 31)) {
601 mandoc_msg(MANDOCERR_TOOLARGE, curp,
605 resize_buf(fb, 65536);
608 gzread(gz, fb->buf + (int)off, fb->sz - off) :
609 read(fd, fb->buf + (int)off, fb->sz - off);
615 err((int)MANDOCLEVEL_SYSERR, "%s", file);
625 mparse_end(struct mparse *curp)
627 if (curp->man->macroset == MACROSET_NONE)
628 curp->man->macroset = MACROSET_MAN;
629 if (curp->man->macroset == MACROSET_MDOC)
630 mdoc_endparse(curp->man);
632 man_endparse(curp->man);
633 roff_endparse(curp->roff);
637 mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
639 struct buf *svprimary;
642 static int recursion_depth;
644 if (64 < recursion_depth) {
645 mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL);
649 /* Line number is per-file. */
652 svprimary = curp->primary;
653 curp->primary = &blk;
657 /* Skip an UTF-8 byte order mark. */
658 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
659 (unsigned char)blk.buf[0] == 0xef &&
660 (unsigned char)blk.buf[1] == 0xbb &&
661 (unsigned char)blk.buf[2] == 0xbf) {
663 curp->filenc &= ~MPARSE_LATIN1;
667 mparse_buf_r(curp, blk, offset, 1);
669 if (--recursion_depth == 0)
672 curp->primary = svprimary;
677 mparse_readmem(struct mparse *curp, void *buf, size_t len,
685 mparse_parse_buffer(curp, blk, file);
686 return curp->file_status;
690 * Read the whole file into memory and call the parsers.
691 * Called recursively when an .so request is encountered.
694 mparse_readfd(struct mparse *curp, int fd, const char *file)
700 if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
701 save_filenc = curp->filenc;
702 curp->filenc = curp->options &
703 (MPARSE_UTF8 | MPARSE_LATIN1);
704 mparse_parse_buffer(curp, blk, file);
705 curp->filenc = save_filenc;
707 munmap(blk.buf, blk.sz);
711 return curp->file_status;
715 mparse_open(struct mparse *curp, const char *file)
721 cp = strrchr(file, '.');
722 curp->gzip = (cp != NULL && ! strcmp(cp + 1, "gz"));
724 /* First try to use the filename as it is. */
726 if ((fd = open(file, O_RDONLY)) != -1)
730 * If that doesn't work and the filename doesn't
731 * already end in .gz, try appending .gz.
735 mandoc_asprintf(&cp, "%s.gz", file);
736 fd = open(cp, O_RDONLY);
744 /* Neither worked, give up. */
746 mandoc_msg(MANDOCERR_FILE, curp, 0, 0, strerror(errno));
751 mparse_alloc(int options, enum mandoclevel wlevel, mandocmsg mmsg,
756 curp = mandoc_calloc(1, sizeof(struct mparse));
758 curp->options = options;
759 curp->wlevel = wlevel;
763 curp->roff = roff_alloc(curp, options);
764 curp->man = roff_man_alloc( curp->roff, curp, curp->defos,
765 curp->options & MPARSE_QUICK ? 1 : 0);
766 if (curp->options & MPARSE_MDOC) {
767 curp->man->macroset = MACROSET_MDOC;
768 if (curp->man->mdocmac == NULL)
769 curp->man->mdocmac = roffhash_alloc(MDOC_Dd, MDOC_MAX);
770 } else if (curp->options & MPARSE_MAN) {
771 curp->man->macroset = MACROSET_MAN;
772 if (curp->man->manmac == NULL)
773 curp->man->manmac = roffhash_alloc(MAN_TH, MAN_MAX);
775 curp->man->first->tok = TOKEN_NONE;
780 mparse_reset(struct mparse *curp)
782 roff_reset(curp->roff);
783 roff_man_reset(curp->man);
789 curp->secondary->sz = 0;
791 curp->file_status = MANDOCLEVEL_OK;
796 mparse_free(struct mparse *curp)
799 roffhash_free(curp->man->mdocmac);
800 roffhash_free(curp->man->manmac);
801 roff_man_free(curp->man);
802 roff_free(curp->roff);
804 free(curp->secondary->buf);
806 free(curp->secondary);
812 mparse_result(struct mparse *curp, struct roff_man **man,
816 if (sodest && NULL != (*sodest = curp->sodest)) {
825 mparse_updaterc(struct mparse *curp, enum mandoclevel *rc)
827 if (curp->file_status > *rc)
828 *rc = curp->file_status;
832 mandoc_vmsg(enum mandocerr t, struct mparse *m,
833 int ln, int pos, const char *fmt, ...)
839 (void)vsnprintf(buf, sizeof(buf), fmt, ap);
842 mandoc_msg(t, m, ln, pos, buf);
846 mandoc_msg(enum mandocerr er, struct mparse *m,
847 int ln, int col, const char *msg)
849 enum mandoclevel level;
851 level = MANDOCLEVEL_UNSUPP;
852 while (er < mandoclimits[level])
855 if (level < m->wlevel && er != MANDOCERR_FILE)
859 (*m->mmsg)(er, level, m->file, ln, col, msg);
861 if (m->file_status < level)
862 m->file_status = level;
866 mparse_strerror(enum mandocerr er)
869 return mandocerrs[er];
873 mparse_strlevel(enum mandoclevel lvl)
875 return mandoclevels[lvl];
879 mparse_keep(struct mparse *p)
882 assert(NULL == p->secondary);
883 p->secondary = mandoc_calloc(1, sizeof(struct buf));
887 mparse_getkeep(const struct mparse *p)
890 assert(p->secondary);
891 return p->secondary->sz ? p->secondary->buf : NULL;