1 /* $Id: read.c,v 1.101 2014/11/28 18:09:01 schwarze Exp $ */
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org>
5 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
21 #include <sys/types.h>
40 #include "mandoc_aux.h"
41 #include "libmandoc.h"
46 #define REPARSE_LIMIT 1000
/*
 * Members of the parse handle (struct mparse).  The functions in
 * this file reach them as curp->... or m->...; mparse_alloc()
 * creates one zero-filled instance with mandoc_calloc().
 * NOTE(review): the line that opens the struct is not visible in
 * this listing - confirm the placement of this comment.
 */
49 struct man *pman; /* persistent man parser */
50 struct mdoc *pmdoc; /* persistent mdoc parser */
51 struct man *man; /* man parser */
52 struct mdoc *mdoc; /* mdoc parser */
53 struct roff *roff; /* roff parser (!NULL) */
54 const struct mchars *mchars; /* character table */
55 char *sodest; /* filename pointed to by .so */
56 const char *file; /* filename of current input file */
57 struct buf *primary; /* buffer currently being parsed */
58 struct buf *secondary; /* preprocessed copy of input */
59 const char *defos; /* default operating system */
60 mandocmsg mmsg; /* warning/error message handler */
61 enum mandoclevel file_status; /* status of current parse */
62 enum mandoclevel wlevel; /* ignore messages below this */
63 int options; /* parser options */
64 int filenc; /* encoding of the current file */
65 int reparse_count; /* finite interp. stack */
66 int line; /* line number in the file */
67 pid_t child; /* the gunzip(1) process */
70 static void choose_parser(struct mparse *);
71 static void resize_buf(struct buf *, size_t);
72 static void mparse_buf_r(struct mparse *, struct buf, size_t, int);
73 static int read_whole_file(struct mparse *, const char *, int,
75 static void mparse_end(struct mparse *);
76 static void mparse_parse_buffer(struct mparse *, struct buf,
/*
 * Error-code thresholds, one entry per message level (the array
 * has MANDOCLEVEL_MAX elements).  mandoc_msg() compares an error
 * code against mandoclimits[level] to find the level a message
 * belongs to.
 * NOTE(review): the initialisers are not visible in this listing.
 */
79 static const enum mandocerr mandoclimits[MANDOCLEVEL_MAX] = {
/*
 * Message text table with MANDOCERR_MAX slots.  mparse_strerror()
 * returns mandocerrs[er] for an enum mandocerr value, so the order
 * of the strings presumably has to match the order of that enum
 * (declared outside this file - verify when adding entries).
 */
89 static const char * const mandocerrs[MANDOCERR_MAX] = {
94 /* related to the prologue */
95 "missing manual title, using UNTITLED",
96 "missing manual title, using \"\"",
97 "lower case character in document title",
98 "missing manual section, using \"\"",
99 "unknown manual section",
100 "missing date, using today's date",
101 "cannot parse date, using it verbatim",
102 "missing Os macro, using \"\"",
103 "duplicate prologue macro",
104 "late prologue macro",
105 "skipping late title macro",
106 "prologue macros out of order",
108 /* related to document structure */
109 ".so is fragile, better use ln(1)",
111 "content before first section header",
112 "first section is not \"NAME\"",
113 "bad NAME section contents",
114 "sections out of conventional order",
115 "duplicate section title",
116 "unexpected section",
118 "unusual Xr punctuation",
119 "AUTHORS section without An macro",
121 /* related to macros and nesting */
123 "skipping paragraph macro",
124 "moving paragraph macro out of list",
125 "skipping no-space macro",
126 "blocks badly nested",
127 "nested displays are not portable",
128 "moving content out of list",
129 ".Vt block has child macro",
130 "fill mode already enabled, skipping",
131 "fill mode already disabled, skipping",
134 /* related to missing macro arguments */
135 "skipping empty request",
136 "conditional request controls empty scope",
137 "skipping empty macro",
138 "empty argument, using 0n",
139 "argument count wrong",
140 "missing display type, using -ragged",
141 "list type is not the first argument",
142 "missing -width in -tag list, using 8n",
143 "missing utility name, using \"\"",
144 "empty head in list item",
146 "missing font type, using \\fR",
147 "unknown font type, using \\fR",
148 "missing -std argument, adding it",
149 "missing eqn box, using \"\"",
151 /* related to bad macro arguments */
152 "unterminated quoted argument",
153 "duplicate argument",
154 "skipping duplicate argument",
155 "skipping duplicate display type",
156 "skipping duplicate list type",
157 "skipping -width argument",
158 "unknown AT&T UNIX version",
159 "comma in function argument",
160 "parenthesis in function name",
161 "invalid content in Rs block",
162 "invalid Boolean argument",
163 "unknown font, skipping request",
165 /* related to plain text */
166 "blank line in fill mode, using .sp",
167 "tab in filled text",
168 "whitespace at end of input line",
170 "invalid escape sequence",
171 "undefined string, using \"\"",
175 /* related to equations */
176 "unexpected equation scope closure",
177 "equation scope open on exit",
178 "overlapping equation scopes",
179 "unexpected end of equation",
181 /* related to tables */
185 "no table layout cells specified",
186 "no table data cells specified",
187 "ignore data in cell",
188 "data block still open",
189 "ignoring extra data cells",
191 /* related to document structure and macros */
192 "input stack limit exceeded, infinite loop?",
193 "skipping bad character",
194 "skipping unknown macro",
195 "skipping item outside list",
196 "skipping column outside column list",
197 "skipping end of block that is not open",
198 "inserting missing end of block",
199 "appending missing end of block",
201 /* related to request and macro arguments */
202 "escaped character not allowed in a name",
203 "argument count wrong",
204 "NOT IMPLEMENTED: Bd -file",
205 "missing list type, using -item",
206 "missing manual name, using \"\"",
207 "uname(3) system call failed, using UNKNOWN",
208 "unknown standard specifier",
209 "skipping request without numeric argument",
210 "skipping all arguments",
211 "skipping excess arguments",
214 "generic fatal error",
217 "NOT IMPLEMENTED: .so with absolute path or \"..\"",
218 ".so request failed",
221 "cannot dup file descriptor",
223 "gunzip failed with code",
228 "gunzip died from signal",
/*
 * Name table with MANDOCLEVEL_MAX slots; mparse_strlevel() returns
 * mandoclevels[lvl] for an enum mandoclevel value.
 * NOTE(review): the initialisers are not visible in this listing.
 */
233 static const char * const mandoclevels[MANDOCLEVEL_MAX] = {
/*
 * Grow a buffer.
 *
 * buf:     buffer whose sz and buf members are updated in place.
 * initial: size to use while the buffer is still small.
 *
 * If the current size is larger than initial/2 the size is doubled,
 * otherwise it becomes exactly initial.  The storage is then resized
 * to the new size through mandoc_realloc(); existing content is kept
 * by virtue of realloc semantics.
 */
245 resize_buf(struct buf *buf, size_t initial)
248 buf->sz = buf->sz > initial/2 ? 2 * buf->sz : initial;
249 buf->buf = mandoc_realloc(buf->buf, buf->sz);
/*
 * Decide whether the current input is handled by the mdoc or by the
 * man parser and store the choice in curp->mdoc or curp->man.
 *
 * The format is first requested from roff_getformat().  When that
 * returns 0, the primary input buffer is scanned: on lines that
 * begin with a control character (dot or apostrophe) a request
 * starting with Dd selects MPARSE_MDOC and one starting with TH is
 * tested as well; memchr() advances the scan to the next newline.
 *
 * The chosen parser is allocated on first use into curp->pmdoc or
 * curp->pman (quick mode is passed as 1 when MPARSE_QUICK is set in
 * curp->options) and then made current.
 * NOTE(review): loop headers, the TH assignment and the early
 * return of the mdoc branch are not visible in this listing.
 */
253 choose_parser(struct mparse *curp)
259 * If neither command line arguments -mdoc or -man select
260 * a parser nor the roff parser found a .Dd or .TH macro
261 * yet, look ahead in the main input buffer.
264 if ((format = roff_getformat(curp->roff)) == 0) {
265 cp = curp->primary->buf;
266 ep = cp + curp->primary->sz;
268 if (*cp == '.' || *cp == '\'') {
270 if (cp[0] == 'D' && cp[1] == 'd') {
271 format = MPARSE_MDOC;
274 if (cp[0] == 'T' && cp[1] == 'H') {
279 cp = memchr(cp, '\n', ep - cp);
286 if (format == MPARSE_MDOC) {
287 if (NULL == curp->pmdoc)
288 curp->pmdoc = mdoc_alloc(
289 curp->roff, curp, curp->defos,
290 MPARSE_QUICK & curp->options ? 1 : 0);
292 curp->mdoc = curp->pmdoc;
296 /* Fall back to man(7) as a last resort. */
298 if (NULL == curp->pman)
299 curp->pman = man_alloc(curp->roff, curp,
300 MPARSE_QUICK & curp->options ? 1 : 0);
302 curp->man = curp->pman;
306 * Main parse routine for a buffer.
307 * It assumes encoding and line numbering are already set up.
308 * It can recurse directly (for invocations of user-defined
309 * macros, inline equations, and input line traps)
310 * and indirectly (for .so file inclusion).
/*
 * Parameters:
 *   curp   parse handle.
 *   blk    input buffer, passed by value.
 *   i      byte offset in blk.buf at which processing starts.
 *   start  1 when called from mparse_parse_buffer(), 0 when this
 *          function calls itself to reparse a line buffer.
 *
 * Outline of the visible logic:
 *  - The line buffer ln starts zeroed and is grown in steps by
 *    resize_buf(&ln, 256) whenever fewer than 11 bytes are free.
 *  - When both MPARSE_UTF8 and MPARSE_LATIN1 are still set in
 *    curp->filenc, preconv_cue() is asked to pick the encoding
 *    (the first part of that condition is not visible).
 *  - Characters are copied from blk to ln until a newline; a CR
 *    directly before the newline is recognised.  Input that cannot
 *    be encoded by preconv_encode(), control characters (0x7f and
 *    anything below 0x20 except tab) and escaped characters that
 *    are not ASCII graph/blank are reported as MANDOCERR_BADCHAR.
 *  - A backslash followed by newline (optionally CR newline) is
 *    handled as a line continuation; a backslash followed by a
 *    double quote or hash starts a comment that runs to the end of
 *    the line, after which trailing blanks already copied to ln
 *    are backed out.  Any other escape is copied as two bytes.
 *  - If curp->secondary exists, the finished line plus a newline
 *    is appended to it and a NUL is stored behind it; sz counts
 *    the newline but not the NUL.
 *  - roff_parseln() processes the line.  Reparsing recurses with
 *    start == 0 as long as ++curp->reparse_count stays within
 *    REPARSE_LIMIT; MANDOCERR_ROFFLOOP is issued in connection
 *    with that limit.
 *  - For .so handling, the target name is either stored in
 *    curp->sodest (MPARSE_SO not set and the input ends here) or
 *    read through mparse_readfd(curp, -1, name); a fatal status
 *    afterwards yields MANDOCERR_SO_FAIL.
 *  - ROFF_TBL results are drained with roff_span() into
 *    mdoc_addspan() or man_addspan(), ROFF_EQN results go to
 *    mdoc_addeqn() or man_addeqn(), and everything else is passed
 *    to mdoc_parseln() or man_parseln(); the mdoc variant is used
 *    whenever curp->man is NULL.
 * NOTE(review): the switch on the roff_parseln() result and all
 * break/continue/goto/return statements are missing from this
 * listing; confirm the exact control flow against the full file.
 */
313 mparse_buf_r(struct mparse *curp, struct buf blk, size_t i, int start)
315 const struct tbl_span *span;
317 size_t pos; /* byte number in the ln buffer */
320 int lnn; /* line number in the real file */
323 memset(&ln, 0, sizeof(ln));
329 if (0 == pos && '\0' == blk.buf[i])
334 curp->reparse_count = 0;
337 curp->filenc & MPARSE_UTF8 &&
338 curp->filenc & MPARSE_LATIN1)
339 curp->filenc = preconv_cue(&blk, i);
342 while (i < blk.sz && (start || blk.buf[i] != '\0')) {
345 * When finding an unescaped newline character,
346 * leave the character loop to process the line.
347 * Skip a preceding carriage return, if any.
350 if ('\r' == blk.buf[i] && i + 1 < blk.sz &&
351 '\n' == blk.buf[i + 1])
353 if ('\n' == blk.buf[i]) {
360 * Make sure we have space for the worst
361 * case of 11 bytes: "\\[u10ffff]\0"
364 if (pos + 11 > ln.sz)
365 resize_buf(&ln, 256);
368 * Encode 8-bit input.
373 if ( ! (curp->filenc && preconv_encode(
374 &blk, &i, &ln, &pos, &curp->filenc))) {
375 mandoc_vmsg(MANDOCERR_BADCHAR,
376 curp, curp->line, pos,
385 * Exclude control characters.
388 if (c == 0x7f || (c < 0x20 && c != 0x09)) {
389 mandoc_vmsg(MANDOCERR_BADCHAR, curp,
390 curp->line, pos, "0x%x", c);
396 /* Trailing backslash = a plain char. */
398 if (blk.buf[i] != '\\' || i + 1 == blk.sz) {
399 ln.buf[pos++] = blk.buf[i++];
404 * Found escape and at least one other character.
405 * When it's a newline character, skip it.
406 * When there is a carriage return in between,
407 * skip that one as well.
410 if ('\r' == blk.buf[i + 1] && i + 2 < blk.sz &&
411 '\n' == blk.buf[i + 2])
413 if ('\n' == blk.buf[i + 1]) {
419 if ('"' == blk.buf[i + 1] || '#' == blk.buf[i + 1]) {
421 /* Comment, skip to end of line */
422 for (; i < blk.sz; ++i) {
423 if ('\n' == blk.buf[i]) {
430 /* Backout trailing whitespaces */
431 for (; pos > 0; --pos) {
432 if (ln.buf[pos - 1] != ' ')
434 if (pos > 2 && ln.buf[pos - 2] == '\\')
440 /* Catch escaped bogus characters. */
442 c = (unsigned char) blk.buf[i+1];
444 if ( ! (isascii(c) &&
445 (isgraph(c) || isblank(c)))) {
446 mandoc_vmsg(MANDOCERR_BADCHAR, curp,
447 curp->line, pos, "0x%x", c);
453 /* Some other escape sequence, copy & cont. */
455 ln.buf[pos++] = blk.buf[i++];
456 ln.buf[pos++] = blk.buf[i++];
460 resize_buf(&ln, 256);
465 * A significant amount of complexity is contained by
466 * the roff preprocessor. It's line-oriented but can be
467 * expressed on one line, so we need at times to
468 * readjust our starting point and re-run it. The roff
469 * preprocessor can also readjust the buffers with new
470 * data, so we pass them in wholesale.
476 * Maintain a lookaside buffer of all parsed lines. We
477 * only do this if mparse_keep() has been invoked (the
478 * buffer may be accessed with mparse_getkeep()).
481 if (curp->secondary) {
482 curp->secondary->buf = mandoc_realloc(
483 curp->secondary->buf,
484 curp->secondary->sz + pos + 2);
485 memcpy(curp->secondary->buf +
488 curp->secondary->sz += pos;
490 [curp->secondary->sz] = '\n';
491 curp->secondary->sz++;
493 [curp->secondary->sz] = '\0';
496 rr = roff_parseln(curp->roff, curp->line, &ln, &of);
500 if (REPARSE_LIMIT >= ++curp->reparse_count)
501 mparse_buf_r(curp, ln, of, 0);
503 mandoc_msg(MANDOCERR_ROFFLOOP, curp,
504 curp->line, pos, NULL);
508 pos = strlen(ln.buf);
516 assert(MANDOCLEVEL_FATAL <= curp->file_status);
519 if ( ! (curp->options & MPARSE_SO) &&
520 (i >= blk.sz || blk.buf[i] == '\0')) {
521 curp->sodest = mandoc_strdup(ln.buf + of);
526 * We remove `so' clauses from our lookaside
527 * buffer because we're going to descend into
528 * the file recursively.
531 curp->secondary->sz -= pos + 1;
532 mparse_readfd(curp, -1, ln.buf + of);
533 if (MANDOCLEVEL_FATAL <= curp->file_status) {
534 mandoc_vmsg(MANDOCERR_SO_FAIL,
535 curp, curp->line, pos,
536 ".so %s", ln.buf + of);
546 * If we encounter errors in the recursive parse, make
547 * sure we don't continue parsing.
550 if (MANDOCLEVEL_FATAL <= curp->file_status)
554 * If input parsers have not been allocated, do so now.
555 * We keep these instanced between parsers, but set them
556 * locally per parse routine since we can use different
557 * parsers with each one.
560 if ( ! (curp->man || curp->mdoc))
564 * Lastly, push down into the parsers themselves.
565 * If libroff returns ROFF_TBL, then add it to the
566 * currently open parse. Since we only get here if
567 * there does exist data (see tbl_data.c), we're
568 * guaranteed that something's been allocated.
569 * Do the same for ROFF_EQN.
572 if (rr == ROFF_TBL) {
573 while ((span = roff_span(curp->roff)) != NULL)
574 if (curp->man == NULL)
575 mdoc_addspan(curp->mdoc, span);
577 man_addspan(curp->man, span);
578 } else if (rr == ROFF_EQN) {
579 if (curp->man == NULL)
580 mdoc_addeqn(curp->mdoc, roff_eqn(curp->roff));
582 man_addeqn(curp->man, roff_eqn(curp->roff));
583 } else if ((curp->man == NULL ?
584 mdoc_parseln(curp->mdoc, curp->line, ln.buf, of) :
585 man_parseln(curp->man, curp->line, ln.buf, of)) == 2)
588 /* Temporary buffers typically are not full. */
590 if (0 == start && '\0' == blk.buf[i])
593 /* Start the next input line. */
/*
 * Load the content of an open file descriptor into fb.
 *
 * curp:      parse handle; file_status and the mmsg callback are
 *            used for error reporting.
 * file:      file name, used in messages only.
 * fd:        descriptor to read from.
 * fb:        receives buffer pointer and size.
 * with_mmap: output flag.  NOTE(review): its assignment is not
 *            visible here; presumably nonzero when fb->buf is a
 *            mapping that the caller has to munmap().
 *
 * fstat() failure sets MANDOCLEVEL_SYSERR and reports
 * MANDOCERR_SYSSTAT.  A regular file of 2^31 bytes or more is
 * rejected with MANDOCERR_TOOLARGE at MANDOCLEVEL_FATAL; smaller
 * regular files are mapped read-only with mmap().  Other input is
 * read with read() into a buffer grown by resize_buf(fb, 65536),
 * with the same 2^31 limit; a read error sets MANDOCLEVEL_SYSERR
 * and reports MANDOCERR_SYSREAD.
 *
 * The caller, mparse_readfd(), treats a nonzero return value as
 * success.
 */
602 read_whole_file(struct mparse *curp, const char *file, int fd,
603 struct buf *fb, int *with_mmap)
610 if (-1 == fstat(fd, &st)) {
611 curp->file_status = MANDOCLEVEL_SYSERR;
613 (*curp->mmsg)(MANDOCERR_SYSSTAT, curp->file_status,
614 file, 0, 0, strerror(errno));
619 * If we're a regular file, try just reading in the whole entry
620 * via mmap(). This is faster than reading it into blocks, and
621 * since each file is only a few bytes to begin with, I'm not
622 * concerned that this is going to tank any machines.
625 if (S_ISREG(st.st_mode)) {
626 if (st.st_size >= (1U << 31)) {
627 curp->file_status = MANDOCLEVEL_FATAL;
629 (*curp->mmsg)(MANDOCERR_TOOLARGE,
630 curp->file_status, file, 0, 0, NULL);
634 fb->sz = (size_t)st.st_size;
635 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
636 if (fb->buf != MAP_FAILED)
642 * If this isn't a regular file (like, say, stdin), then we must
643 * go the old way and just read things in bit by bit.
652 if (fb->sz == (1U << 31)) {
653 curp->file_status = MANDOCLEVEL_FATAL;
655 (*curp->mmsg)(MANDOCERR_TOOLARGE,
660 resize_buf(fb, 65536);
662 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
668 curp->file_status = MANDOCLEVEL_SYSERR;
670 (*curp->mmsg)(MANDOCERR_SYSREAD,
671 curp->file_status, file, 0, 0,
/*
 * Finish the parse of the current document.
 *
 * A status of MANDOCLEVEL_FATAL or worse is checked first
 * (NOTE(review): the statement it guards is not visible,
 * presumably an early return).
 *
 * If no mdoc parser is current and curp->sodest is NULL (one more
 * condition of this test is not visible), a parser is picked
 * without looking at the input: the persistent mdoc parser when
 * MPARSE_MDOC is set in the options, else the persistent man
 * parser, which is allocated here if it does not exist yet.
 *
 * mdoc_endparse() or man_endparse() is then run on the current
 * parser; a zero result is expected to come with a fatal
 * file_status, which the assertions check.  roff_endparse() is
 * called for the roff parser.
 */
684 mparse_end(struct mparse *curp)
687 if (MANDOCLEVEL_FATAL <= curp->file_status)
690 if (curp->mdoc == NULL &&
692 curp->sodest == NULL) {
693 if (curp->options & MPARSE_MDOC)
694 curp->mdoc = curp->pmdoc;
696 if (curp->pman == NULL)
697 curp->pman = man_alloc(curp->roff, curp,
698 curp->options & MPARSE_QUICK ? 1 : 0);
699 curp->man = curp->pman;
703 if (curp->mdoc && ! mdoc_endparse(curp->mdoc)) {
704 assert(MANDOCLEVEL_FATAL <= curp->file_status);
708 if (curp->man && ! man_endparse(curp->man)) {
709 assert(MANDOCLEVEL_FATAL <= curp->file_status);
713 roff_endparse(curp->roff);
/*
 * Parse one complete input buffer that belongs to the named file.
 *
 * curp: parse handle.
 * blk:  buffer to parse, passed by value; curp->primary points at
 *       this local copy for the duration of the call and is
 *       restored from svprimary afterwards.
 * file: name of the input file.
 *
 * Nesting is tracked in the function-static recursion_depth; once
 * it exceeds 64, MANDOCERR_ROFFLOOP is reported.  Because the
 * counter is static it is shared by every parse handle in the
 * process.
 *
 * With MPARSE_UTF8 set in curp->filenc and the buffer starting
 * with the bytes ef bb bf, the MPARSE_LATIN1 bit is cleared.
 * Parsing proper is done by mparse_buf_r(curp, blk, offset, 1).
 * When the depth counter drops back to 0 and the status is below
 * MANDOCLEVEL_FATAL, a final step runs.
 * NOTE(review): that step, the depth increment and the handling
 * of curp->file and curp->line are not visible in this listing.
 */
717 mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
719 struct buf *svprimary;
722 static int recursion_depth;
724 if (64 < recursion_depth) {
725 mandoc_msg(MANDOCERR_ROFFLOOP, curp, curp->line, 0, NULL);
729 /* Line number is per-file. */
732 svprimary = curp->primary;
733 curp->primary = &blk;
737 /* Skip a UTF-8 byte order mark. */
738 if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
739 (unsigned char)blk.buf[0] == 0xef &&
740 (unsigned char)blk.buf[1] == 0xbb &&
741 (unsigned char)blk.buf[2] == 0xbf) {
743 curp->filenc &= ~MPARSE_LATIN1;
747 mparse_buf_r(curp, blk, offset, 1);
749 if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status)
752 curp->primary = svprimary;
/*
 * Parse input that already resides in memory.
 *
 * curp: parse handle.
 * buf:  start of the data; the const qualifier is removed with
 *       UNCONST() so that it fits into a struct buf.
 * len:  size of the data (its use is not visible in this listing,
 *       presumably stored in blk.sz).
 *
 * The buffer is handed to mparse_parse_buffer() and the resulting
 * curp->file_status is returned.
 */
757 mparse_readmem(struct mparse *curp, const void *buf, size_t len,
762 blk.buf = UNCONST(buf);
765 mparse_parse_buffer(curp, blk, file);
766 return(curp->file_status);
770 * If a file descriptor is given, use it and assume it points
771 * to the named file. Otherwise, open the named file.
772 * Read the whole file into memory and call the parsers.
773 * Called recursively when an .so request is encountered.
/*
 * curp: parse handle.
 * fd:   descriptor to read, or a value that makes this function
 *       open the file itself via mparse_open() (the .so code path
 *       passes -1).
 * file: name of the file.
 *
 * curp->child is saved on entry and restored before returning, so
 * a nested call does not lose track of the outer gunzip process.
 * While the buffer is parsed, curp->filenc is set to the encoding
 * bits of curp->options (MPARSE_UTF8 | MPARSE_LATIN1) and put back
 * afterwards.  The buffer is released with munmap() (the condition
 * and the non-mmap release are not visible here) and the
 * descriptor is closed unless it is STDIN_FILENO.
 *
 * Returns curp->file_status.
 */
776 mparse_readfd(struct mparse *curp, int fd, const char *file)
783 save_child = curp->child;
786 else if (mparse_open(curp, &fd, file) >= MANDOCLEVEL_SYSERR)
789 if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
790 save_filenc = curp->filenc;
791 curp->filenc = curp->options &
792 (MPARSE_UTF8 | MPARSE_LATIN1);
793 mparse_parse_buffer(curp, blk, file);
794 curp->filenc = save_filenc;
797 munmap(blk.buf, blk.sz);
803 if (fd != STDIN_FILENO && close(fd) == -1)
808 curp->child = save_child;
809 return(curp->file_status);
/*
 * Open a manual page file, decompressing it through gunzip if
 * needed.
 *
 * curp: parse handle; curp->child receives the result of fork().
 * fd:   output, descriptor to read the page from.
 * file: path of the file.
 *
 * A name that does not end in .gz is opened directly with
 * open(O_RDONLY); if that fails, the same name with .gz appended
 * is built with mandoc_asprintf().  For compressed input the file
 * is checked with access(R_OK), a pipe is created and a child is
 * forked which redirects its standard output into the pipe with
 * dup2() and executes gunzip -c on the file.
 *
 * Returns MANDOCLEVEL_OK on success.  On failure err holds the
 * matching MANDOCERR_SYS* code, curp->file_status is set to
 * MANDOCLEVEL_SYSERR, the error is reported through curp->mmsg
 * together with strerror(errno), and curp->file_status is
 * returned.
 * NOTE(review): the switch labels after fork(), the assignment of
 * *fd in the parent and the cleanup of the pipe ends and of the
 * name from mandoc_asprintf() are not visible in this listing.
 */
813 mparse_open(struct mparse *curp, int *fd, const char *file)
823 /* Unless zipped, try to just open the file. */
825 if ((cp = strrchr(file, '.')) == NULL ||
826 strcmp(cp + 1, "gz")) {
828 if ((*fd = open(file, O_RDONLY)) != -1)
829 return(MANDOCLEVEL_OK);
831 /* Open failed; try to append ".gz". */
833 mandoc_asprintf(&cp, "%s.gz", file);
838 /* Before forking, make sure the file can be read. */
841 if (access(file, R_OK) == -1) {
844 err = MANDOCERR_SYSOPEN;
850 if (pipe(pfd) == -1) {
851 err = MANDOCERR_SYSPIPE;
855 switch (curp->child = fork()) {
857 err = MANDOCERR_SYSFORK;
864 if (dup2(pfd[1], STDOUT_FILENO) == -1) {
865 err = MANDOCERR_SYSDUP;
868 execlp("gunzip", "gunzip", "-c", file, NULL);
869 err = MANDOCERR_SYSEXEC;
874 return(MANDOCLEVEL_OK);
881 curp->file_status = MANDOCLEVEL_SYSERR;
883 (*curp->mmsg)(err, curp->file_status, curp->file,
884 0, 0, strerror(errno));
887 return(curp->file_status);
/*
 * Wait for the gunzip child recorded in curp->child.
 *
 * Returns MANDOCLEVEL_OK when there is no child (curp->child is 0)
 * or when the child exited with status 0.  In every other case
 * curp->file_status is set to MANDOCLEVEL_SYSERR and returned:
 *  - waitpid() failed:        MANDOCERR_SYSWAIT
 *  - child killed by signal:  MANDOCERR_SYSSIG plus signal number
 *  - nonzero exit status:     MANDOCERR_SYSEXIT plus exit status
 */
891 mparse_wait(struct mparse *curp)
895 if (curp->child == 0)
896 return(MANDOCLEVEL_OK);
898 if (waitpid(curp->child, &status, 0) == -1) {
899 mandoc_msg(MANDOCERR_SYSWAIT, curp, 0, 0,
901 curp->file_status = MANDOCLEVEL_SYSERR;
902 return(curp->file_status);
/* Signal termination is tested first; WEXITSTATUS is only read after it. */
904 if (WIFSIGNALED(status)) {
905 mandoc_vmsg(MANDOCERR_SYSSIG, curp, 0, 0,
906 "%d", WTERMSIG(status));
907 curp->file_status = MANDOCLEVEL_SYSERR;
908 return(curp->file_status);
910 if (WEXITSTATUS(status)) {
911 mandoc_vmsg(MANDOCERR_SYSEXIT, curp, 0, 0,
912 "%d", WEXITSTATUS(status));
913 curp->file_status = MANDOCLEVEL_SYSERR;
914 return(curp->file_status);
916 return(MANDOCLEVEL_OK);
/*
 * Create a parse handle.
 *
 * options: parser option bits (MPARSE_*), stored in the handle and
 *          passed on to roff_alloc().
 * wlevel:  lowest message level to report; must not be above
 *          MANDOCLEVEL_FATAL (asserted).
 * mmsg:    message callback.
 * mchars:  character table, stored and passed to roff_alloc().
 * defos:   default operating system name.
 *
 * The handle is zero-filled by mandoc_calloc().  The roff parser
 * is always created; the persistent mdoc or man parser is created
 * up front only when MPARSE_MDOC or MPARSE_MAN is set in options,
 * with quick mode 1 if MPARSE_QUICK is set.
 * NOTE(review): the stores of mmsg and defos and the return
 * statement are not visible in this listing.
 */
920 mparse_alloc(int options, enum mandoclevel wlevel, mandocmsg mmsg,
921 const struct mchars *mchars, const char *defos)
925 assert(wlevel <= MANDOCLEVEL_FATAL);
927 curp = mandoc_calloc(1, sizeof(struct mparse));
929 curp->options = options;
930 curp->wlevel = wlevel;
934 curp->mchars = mchars;
935 curp->roff = roff_alloc(curp, curp->mchars, options);
936 if (curp->options & MPARSE_MDOC)
937 curp->pmdoc = mdoc_alloc(
938 curp->roff, curp, curp->defos,
939 curp->options & MPARSE_QUICK ? 1 : 0);
940 if (curp->options & MPARSE_MAN)
941 curp->pman = man_alloc(curp->roff, curp,
942 curp->options & MPARSE_QUICK ? 1 : 0);
/*
 * Bring a parse handle back to a clean state for the next input:
 * the roff parser and the current mdoc and man parsers are reset,
 * the lookaside buffer is emptied by setting its size to 0 (its
 * storage is kept) and file_status returns to MANDOCLEVEL_OK.
 * NOTE(review): the NULL checks that presumably guard the mdoc,
 * man and secondary accesses are not visible in this listing.
 */
948 mparse_reset(struct mparse *curp)
951 roff_reset(curp->roff);
954 mdoc_reset(curp->mdoc);
956 man_reset(curp->man);
958 curp->secondary->sz = 0;
960 curp->file_status = MANDOCLEVEL_OK;
/*
 * Release what a parse handle owns: the persistent mdoc and man
 * parsers (pmdoc and pman, not the mdoc and man aliases), the roff
 * parser, and the lookaside buffer together with its storage.
 * NOTE(review): the guarding conditions and the release of curp
 * and curp->sodest are not visible in this listing.
 */
969 mparse_free(struct mparse *curp)
973 mdoc_free(curp->pmdoc);
975 man_free(curp->pman);
977 roff_free(curp->roff);
979 free(curp->secondary->buf);
981 free(curp->secondary);
/*
 * Hand the outcome of a parse to the caller.
 *
 * mdoc, man: output pointers for the parser that was used.
 * sodest:    optional output pointer; when it is not NULL it
 *            receives curp->sodest, and a non-NULL value takes the
 *            branch below.
 * NOTE(review): the body of that branch and the stores to *mdoc
 * and *man are not visible in this listing.
 */
987 mparse_result(struct mparse *curp,
988 struct mdoc **mdoc, struct man **man, char **sodest)
991 if (sodest && NULL != (*sodest = curp->sodest)) {
/*
 * printf-style front end to mandoc_msg().
 *
 * t:       error code.
 * m:       parse handle.
 * ln, pos: line and column the message refers to.
 * fmt:     format string followed by its arguments.
 *
 * The text is formatted into a local buffer with vsnprintf(); the
 * return value is deliberately discarded, so a message longer than
 * the buffer is truncated without notice.
 */
1003 mandoc_vmsg(enum mandocerr t, struct mparse *m,
1004 int ln, int pos, const char *fmt, ...)
1010 (void)vsnprintf(buf, sizeof(buf), fmt, ap);
1013 mandoc_msg(t, m, ln, pos, buf);
/*
 * Report one message and record its severity.
 *
 * er:      error code.
 * m:       parse handle.
 * ln, col: position the message refers to.
 * msg:     additional text, may be NULL (callers in this file pass
 *          NULL).
 *
 * The level is derived from the error code: starting at
 * MANDOCLEVEL_FATAL, the code is compared against
 * mandoclimits[level].  The level is then compared with m->wlevel,
 * the callback m->mmsg is invoked with the current file name, and
 * m->file_status is raised to the level if it was lower; it is
 * never lowered here.
 * NOTE(review): the loop body, the action taken for levels below
 * m->wlevel and the guard around the callback are not visible in
 * this listing.
 */
1017 mandoc_msg(enum mandocerr er, struct mparse *m,
1018 int ln, int col, const char *msg)
1020 enum mandoclevel level;
1022 level = MANDOCLEVEL_FATAL;
1023 while (er < mandoclimits[level])
1026 if (level < m->wlevel)
1030 (*m->mmsg)(er, level, m->file, ln, col, msg);
1032 if (m->file_status < level)
1033 m->file_status = level;
/*
 * Return the message text for an error code by indexing the
 * mandocerrs table.  No range check is visible here, so er has to
 * be a valid index below MANDOCERR_MAX.
 */
1037 mparse_strerror(enum mandocerr er)
1040 return(mandocerrs[er]);
/*
 * Return the name of a message level by indexing the mandoclevels
 * table.  No range check is performed, so lvl has to be a valid
 * index below MANDOCLEVEL_MAX.
 */
1044 mparse_strlevel(enum mandoclevel lvl)
1046 return(mandoclevels[lvl]);
/*
 * Turn on the lookaside buffer (p->secondary) by allocating a
 * zero-filled struct buf for it.  It must not be enabled already;
 * this is asserted.
 */
1050 mparse_keep(struct mparse *p)
1053 assert(NULL == p->secondary);
1054 p->secondary = mandoc_calloc(1, sizeof(struct buf));
/*
 * Return the content of the lookaside buffer, or NULL while it is
 * empty (size 0).  The buffer must have been enabled with
 * mparse_keep() before; this is asserted.
 */
1058 mparse_getkeep(const struct mparse *p)
1061 assert(p->secondary);
1062 return(p->secondary->sz ? p->secondary->buf : NULL);