scdoc2mdoc

A fork of scdoc to output mdoc(7)
git clone git://git.sgregoratto.me/scdoc2mdoc
Log | Files | Refs | README | LICENSE

parser.c (13431B)


      1 #include "config.h"
      2 #include <ctype.h>
      3 #if HAVE_ERR
      4 #include <err.h>
      5 #endif
      6 #include <errno.h>
      7 #include <limits.h>
      8 #include <stdbool.h>
      9 #include <stdint.h>
     10 #include <stdio.h>
     11 #include <stdlib.h>
     12 #include <string.h>
     13 #include <time.h>
     14 #include "utf8.h"
     15 #include "string.h"
     16 #include "parser.h"
     17 
     18 void
     19 parser_fatal(struct parser *p, const char *err)
     20 {
     21 	fclose(p->input);
     22 	errx(EXIT_FAILURE, "%s:%d:%d: %s", p->fname, p->line, p->col, err);
     23 }
     24 
     25 uint32_t
     26 parser_getch(struct parser *p)
     27 {
     28 	if (p->qhead) {
     29 		return p->queue[--p->qhead];
     30 	} else if (p->str) {
     31 		uint32_t ch = utf8_decode(&p->str);
     32 		if (!ch || ch == UTF8_INVALID) {
     33 			p->str = NULL;
     34 			return UTF8_INVALID;
     35 		}
     36 		return ch;
     37 	}
     38 	uint32_t ch = utf8_fgetch(p->input);
     39 	if (ch == '\n') {
     40 		p->col = 0;
     41 		++p->line;
     42 	} else {
     43 		++p->col;
     44 	}
     45 	return ch;
     46 }
     47 
     48 void
     49 parser_pushch(struct parser *p, uint32_t ch)
     50 {
     51 	if (ch != UTF8_INVALID)
     52 		p->queue[p->qhead++] = ch;
     53 }
     54 
     55 void
     56 parser_pushstr(struct parser *p, const char *str)
     57 {
     58 	p->str = str;
     59 }
     60 
     61 int
     62 parse_section(struct parser *p)
     63 {
     64 	str_t *section = str_create();
     65 	uint32_t ch;
     66 	int sec;
     67 	const char *errstr;
     68 	while ((ch = parser_getch(p)) != UTF8_INVALID) {
     69 		if (ch < 0x80 && isdigit(ch)) {
     70 			str_append_ch(section, ch);
     71 		} else if (ch == ')') {
     72 			if (!section->str)
     73 				break;
     74 			sec = strtonum(section->str, 0, 9, &errstr);
     75 			if (errstr != NULL)
     76 				parser_fatal(p, "Expected section between 0 and 9");
     77 			str_free(section);
     78 			return sec;
     79 		} else {
     80 			parser_fatal(p, "Expected digit or ')'");
     81 		}
     82 	}
     83 
     84 	parser_fatal(p, "Expected manual section");
     85 	return -1;
     86 }
     87 
     88 void
     89 parse_preamble(struct parser *p)
     90 {
     91 	str_t *name = str_create();
     92 	int section = -1;
     93 	uint32_t ch;
     94 	time_t date_time;
     95 	char date[256];
     96 	char *offset;
     97 	char *source_date_epoch = getenv("SOURCE_DATE_EPOCH");
     98 	if (source_date_epoch != NULL) {
     99 		long long epoch;
    100 		const char *errstr;
    101 		epoch = strtonum(source_date_epoch, 1, LLONG_MAX, &errstr);
    102 		if (errstr != NULL)
    103 			errx(EXIT_FAILURE, "$SOURCE_DATE_EPOCH is %s: %s",
    104 			     errstr, source_date_epoch);
    105 		date_time = epoch;
    106 	} else {
    107 		date_time = time(NULL);
    108 	}
    109 	struct tm *date_tm = gmtime(&date_time);
    110 	strftime(date, sizeof(date), "%B %d, %Y", date_tm);
    111 
    112 	while ((ch = parser_getch(p)) != UTF8_INVALID) {
    113 		if ((ch < 0x80 && isalnum(ch)) ||
    114 		     ch == '_' || ch == '-' || ch == '.') {
    115 			str_append_ch(name, ch);
    116 		} else if (ch == '(') {
    117 			section = parse_section(p);
    118 		} else if (ch == '\n') {
    119 			if (name->len == 0)
    120 				parser_fatal(p, "Expected preamble");
    121 			if (section == -1)
    122 				parser_fatal(p, "Expected manual section");
    123 
    124 			/*
    125 			 * the date format for Dd is in the form:
    126 			 *	month date, year
    127 			 * where date is a number from 1-31.
    128 			 * strftime's %d inserts a leading '0' if the date is
    129 			 * between 1-9. So, check for this '0' and skip it.
    130 			 */
    131 			fputs(".Dd ", stdout);
    132 			for(offset = date;;) {
    133 				putchar(*offset);
    134 				if (*(offset++) == ' ') {
    135 					if (*offset == '0')
    136 						offset++;
    137 					break;
    138 				}
    139 			}
    140 			puts(offset);
    141 			str_toupper(name);
    142 			printf(".Dt %s %d\n", name->str, section);
    143 			puts(".Os");
    144 			break;
    145 		} else {
    146 			parser_fatal(p, "Expected character in [A-Za-z0-9-]");
    147 		}
    148 	}
    149 
    150 	str_free(name);
    151 }
    152 
    153 void
    154 parse_format(struct parser *p, enum formatting fmt)
    155 {
    156 	char formats[FORMAT_LAST] = {
    157 		[FORMAT_BOLD] = 'B',
    158 		[FORMAT_UNDERLINE] = 'I',
    159 	};
    160 	char error[512];
    161 	if (p->flags) {
    162 		if ((p->flags & ~fmt)) {
    163 			snprintf(error, sizeof(error),
    164 				 "Cannot nest inline formatting "
    165 				 "(began with %c at %d:%d)",
    166 				 p->flags == FORMAT_BOLD ? '*' : '_',
    167 				 p->fmt_line, p->fmt_col);
    168 			parser_fatal(p, error);
    169 		}
    170 		fputs("\\fR", stdout);
    171 	} else {
    172 		printf("\\f%c", formats[fmt]);
    173 		p->fmt_line = p->line;
    174 		p->fmt_col = p->col;
    175 	}
    176 	p->flags ^= fmt;
    177 }
    178 
    179 void
    180 parse_linebreak(struct parser *p)
    181 {
    182 	uint32_t plus = parser_getch(p);
    183 	if (plus != '+') {
    184 		putchar('+');
    185 		parser_pushch(p, plus);
    186 		return;
    187 	}
    188 	uint32_t lf = parser_getch(p);
    189 	if (lf != '\n') {
    190 		putchar('+');
    191 		parser_pushch(p, plus);
    192 		parser_pushch(p, '\n');
    193 		return;
    194 	}
    195 	uint32_t ch = parser_getch(p);
    196 	if (ch == '\n')
    197 		parser_fatal(p, "Explicit line breaks cannot be followed by a blank line");
    198 	parser_pushch(p, ch);
    199 	puts("\n.br");
    200 }
    201 
    202 void
    203 parse_text(struct parser *p)
    204 {
    205 	uint32_t ch, next, last = ' ';
    206 	int i = 0;
    207 	while ((ch = parser_getch(p)) != UTF8_INVALID) {
    208 		switch (ch) {
    209 		case '\\':
    210 			ch = parser_getch(p);
    211 			if (ch == UTF8_INVALID)
    212 				parser_fatal(p, "Unexpected EOF");
    213 			else if (ch == '\\')
    214 				fputs("\\e", stdout);
    215 			else
    216 				utf8_putch(ch);
    217 			break;
    218 		case '*':
    219 			parse_format(p, FORMAT_BOLD);
    220 			break;
    221 		case '_':
    222 			next = parser_getch(p);
    223 			if (!isalnum(last)
    224 			    || ((p->flags & FORMAT_UNDERLINE) &&
    225 				 !isalnum(next)))
    226 				parse_format(p, FORMAT_UNDERLINE);
    227 			else
    228 				utf8_putch(ch);
    229 			if (next == UTF8_INVALID)
    230 				return;
    231 			parser_pushch(p, next);
    232 			break;
    233 		case '+':
    234 			parse_linebreak(p);
    235 			break;
    236 		case '\n':
    237 			utf8_putch(ch);
    238 			return;
    239 		case '.':
    240 			if (!i) {
    241 				/* Escape lone dots */
    242 				fputs("\\&.", stdout);
    243 				break;
    244 			}
    245 			/* fallthrough */
    246 		default:
    247 			last = ch;
    248 			utf8_putch(ch);
    249 			break;
    250 		}
    251 		++i;
    252 	}
    253 }
    254 
    255 void
    256 parse_heading(struct parser *p)
    257 {
    258 	uint32_t ch;
    259 	int level = 1;
    260 	while ((ch = parser_getch(p)) != UTF8_INVALID) {
    261 		if (ch == '#')
    262 			++level;
    263 		else if (ch == ' ')
    264 			break;
    265 		else
    266 			parser_fatal(p, "Invalid start of heading (probably needs a space)");
    267 	}
    268 	switch (level) {
    269 	case 1:
    270 		fputs(".Sh ", stdout);
    271 		break;
    272 	case 2:
    273 		fputs(".Ss ", stdout);
    274 		break;
    275 	default:
    276 		parser_fatal(p, "Only headings up to two levels deep are permitted");
    277 		break;
    278 	}
    279 	while ((ch = parser_getch(p)) != UTF8_INVALID) {
    280 		utf8_putch(ch);
    281 		if (ch == '\n')
    282 			break;
    283 	}
    284 }
    285 
    286 int
    287 parse_indent(struct parser *p, int *indent, bool write)
    288 {
    289 	int i = 0;
    290 	uint32_t ch;
    291 	while ((ch = parser_getch(p)) == '\t')
    292 		++i;
    293 
    294 	parser_pushch(p, ch);
    295 	if (ch == '\n' && *indent != 0) {
    296 		/* Don't change indent when we encounter empty lines */
    297 		return *indent;
    298 	} else if (write) {
    299 		if (i < *indent)
    300 			for (int j = *indent; i < j; --j)
    301 				puts(".Ed");
    302 		else if (i == *indent + 1)
    303 			puts(".Bd -ragged -offset indent -compact");
    304 		else if (i != *indent && ch == '\t')
    305 			parser_fatal(p, "Indented by an amount greater than 1");
    306 	}
    307 	*indent = i;
    308 	return i;
    309 }
    310 
/* Kind of list handled by parse_list(); selects the .Bl list style. */
enum list_type {
	BULLETED,	/* printed as ".Bl -bullet" */
	NUMBERED,	/* printed as ".Bl -enum" */
};
    315 
/*
 * Parse a list and print it as a .Bl/.El block with one .It per entry.
 *
 * p      - parser to read from; the list marker of the first entry has
 *          already been consumed, so a space is expected next.
 * indent - current indent level; updated while following lines are
 *          examined and decremented once before returning.
 * t      - BULLETED or NUMBERED; selects "-bullet" or "-enum".
 *
 * A following line indented deeper than the list starts a nested list
 * (handled recursively); a line indented less ends this list.  A line
 * starting with two spaces continues the current entry, a line starting
 * with '-' or '.' begins a new entry, and anything else ends the list.
 */
void
parse_list(struct parser *p, int *indent, enum list_type t)
{
	uint32_t ch;
	/* Indent level this list lives at; used to detect nesting/dedent. */
	int oldindent = *indent;
	if ((ch = parser_getch(p)) != ' ')
		parser_fatal(p, "Expected space before start of list entry");

	printf(".Bl -%s -compact\n",
		t == BULLETED ? "bullet" : "enum");
	puts(".It");

	parse_text(p);
	do {
		/* Measure the next line's indent without printing any macros. */
		parse_indent(p, indent, false);
		if (*indent > oldindent) {
			/* Skip the nested list's marker character, then recurse. */
			parser_getch(p);
			parse_list(p, indent, t);
		} else if (*indent < oldindent) {
			goto ret;
		}
		if ((ch = parser_getch(p)) == UTF8_INVALID)
			break;
		switch (ch) {
		case ' ':
			/* Two leading spaces: continuation of the current entry. */
			if ((ch = parser_getch(p)) != ' ')
				parser_fatal(p, "Expected two spaces for list entry continuation");
			parse_text(p);
			break;
		case '-':
		case '.':
			/* Either marker starts a new entry, whatever the list type. */
			if ((ch = parser_getch(p)) != ' ')
				parser_fatal(p, "Expected space before start of list entry");
			puts(".It");
			parse_text(p);
			break;
		default:
			/* Not part of the list: hand the character back and stop. */
			parser_pushch(p, ch);
			goto ret;
		}
	} while (ch != UTF8_INVALID);
ret:
	puts(".El");
	/*
	 * NOTE(review): this decrement is unconditional; for a list that
	 * sits at indent 0 it appears to leave *indent at -1 for the
	 * caller -- confirm that this is intended.
	 */
	--*indent;
}
    361 
    362 void
    363 parse_literal(struct parser *p, int *indent)
    364 {
    365 	uint32_t ch;
    366 	if ((ch = parser_getch(p)) != '`' ||
    367 	    (ch = parser_getch(p)) != '`' ||
    368 	    (ch = parser_getch(p)) != '\n')
    369 		parser_fatal(p, "Expected ``` and a newline to begin literal block");
    370 
    371 	int stops = 0;
    372 	puts(".Bd -literal -offset indent");
    373 	do {
    374 		if ((ch = parser_getch(p)) == UTF8_INVALID)
    375 			break;
    376 		if (ch == '`') {
    377 			if (++stops == 3) {
    378 				if ((ch = parser_getch(p)) != '\n')
    379 					parser_fatal(p, "Expected literal block to end with newline");
    380 				puts(".Ed");
    381 				return;
    382 			}
    383 		} else {
    384 			while (stops != 0) {
    385 				putchar('`');
    386 				--stops;
    387 			}
    388 			switch (ch) {
    389 			case '.':
    390 				fputs("\\&.", stdout);
    391 				break;
    392 			case '\\':
    393 				ch = parser_getch(p);
    394 				if (ch == UTF8_INVALID)
    395 					parser_fatal(p, "Unexpected EOF");
    396 				else if (ch == '\\')
    397 					fputs("\\e", stdout);
    398 				else
    399 					utf8_putch(ch);
    400 				break;
    401 			default:
    402 				utf8_putch(ch);
    403 				break;
    404 			}
    405 		}
    406 	} while (ch != UTF8_INVALID);
    407 }
    408 
/*
 * Horizontal alignment of a table cell.  The values are used as an
 * index into the string "lcr" when the table layout is printed, so
 * their order matters.
 */
enum table_align {
	ALIGN_LEFT,	/* '[' in the source, 'l' in the layout */
	ALIGN_CENTER,	/* '-' in the source, 'c' in the layout */
	ALIGN_RIGHT,	/* ']' in the source, 'r' in the layout */
};
    414 
/* One row of a table: a singly linked list of cells, chained to the next row. */
struct table_row {
	struct table_cell *cell;	/* first cell of this row */
	struct table_row  *next;	/* following row, or NULL for the last one */
};
    419 
/* One cell of a table row. */
struct table_cell {
	enum table_align align;		/* alignment given for, or inherited by, the cell */
	str_t *contents;		/* raw cell text, formatted later by parse_text() */
	struct table_cell *next;	/* following cell in the row, or NULL */
};
    425 
/*
 * Parse a table and print it as a .TS/.TE block.
 *
 * p     - parser to read from; the character that introduced the table
 *         has already been consumed by the caller.
 * style - that introducing character: '[' prints "allbox;", ']' prints
 *         "box;", anything else prints no box option.
 *
 * Each source line starts with '|' (new row, first cell), ':' (next
 * cell in the current row) or ' ' (continuation of the current cell).
 * A row/cell marker is followed by an alignment character: '[' left,
 * '-' center, ']' right, or ' ' to reuse the alignment of the same
 * column in the previous row.  An empty line ends the table.
 *
 * The table is first collected into a linked list of rows and cells,
 * then the alignment lines are printed, then the cell contents, each
 * wrapped in T{ ... T} and run through parse_text().  Rows and cells
 * are freed as they are printed.
 *
 * If the input ends before the terminating empty line, the function
 * returns without printing anything (and without freeing what was
 * collected).
 */
void
parse_table(struct parser *p, uint32_t style)
{
	struct table_row *table = NULL;
	struct table_row *currow = NULL, *prevrow = NULL;
	struct table_cell *curcell = NULL;
	int column = 0;
	uint32_t ch;
	/* Whatever the style character was, the first line opens a row. */
	parser_pushch(p, '|');

	do {
		if ((ch = parser_getch(p)) == UTF8_INVALID)
			break;
		switch (ch) {
		case '\n':
			/* Empty line: the table is complete. */
			goto commit_table;
		case '|':
			/* New row, together with its first cell. */
			prevrow = currow;
			if ((currow = calloc(1, sizeof(struct table_row))) == NULL)
				err(EXIT_FAILURE, NULL);
			/* TODO: Verify the number of columns match */
			if (prevrow)
				prevrow->next = currow;
			if ((curcell = calloc(1, sizeof(struct table_cell))) == NULL)
				err(EXIT_FAILURE, NULL);
			currow->cell = curcell;
			column = 0;
			if (!table)
				table = currow;
			break;
		case ':':
			/* Next cell in the current row. */
			if (!currow) {
				parser_fatal(p, "Cannot start a column without "
						"starting a row first");
			} else {
				struct table_cell *prev = curcell;
				if ((curcell = calloc(1, sizeof(struct table_cell))) == NULL)
					err(EXIT_FAILURE, NULL);
				if (prev)
					prev->next = curcell;
				++column;
			}
			break;
		case ' ':
			/* Continuation line: append to the current cell. */
			goto continue_cell;
		default:
			parser_fatal(p, "Expected either '|' or ':'");
			break;
		}
		/* Alignment character following the row/cell marker. */
		if ((ch = parser_getch(p)) == UTF8_INVALID)
			break;
		switch (ch) {
		case '[':
			curcell->align = ALIGN_LEFT;
			break;
		case '-':
			curcell->align = ALIGN_CENTER;
			break;
		case ']':
			curcell->align = ALIGN_RIGHT;
			break;
		case ' ':
			/*
			 * Inherit the alignment of the same column in the
			 * previous row; if that row has fewer cells, the
			 * zero-initialised value (ALIGN_LEFT) is kept.
			 */
			if (prevrow) {
				struct table_cell *pcell = prevrow->cell;
				for (int i = 0; i <= column && pcell; ++i, pcell = pcell->next) {
					if (i == column) {
						curcell->align = pcell->align;
						break;
					}
				}
			} else {
				parser_fatal(p, "No previous row to infer alignment from");
			}
			break;
		default:
			parser_fatal(p, "Expected one of '[', '-', ']', or ' '");
			break;
		}
		curcell->contents = str_create();
continue_cell:
		switch (ch = parser_getch(p)) {
		case ' ':
			/* Read out remainder of the text */
			while ((ch = parser_getch(p)) != UTF8_INVALID) {
				switch (ch) {
				case '\n':
					goto commit_cell;
				default:
					str_append_ch(curcell->contents, ch);
					break;
				}
			}
			break;
		case '\n':
			goto commit_cell;
		default:
			parser_fatal(p, "Expected ' ' or a newline");
			break;
		}
commit_cell:
		/* T{ and T} delimit text blocks in the output, so reject them. */
		if (strstr(curcell->contents->str, "T{")
		    || strstr(curcell->contents->str, "T}"))
			parser_fatal(p, "Cells cannot contain T{ or T} "
					"due to roff limitations");
	} while (ch != UTF8_INVALID);
commit_table:

	/* Input ended before the closing empty line: print nothing. */
	if (ch == UTF8_INVALID)
		return;

	puts(".TS");

	switch (style) {
	case '[':
		fputs("allbox;", stdout);
		break;
	case ']':
		fputs("box;", stdout);
		break;
	}

	/* Print alignments first: one layout line per row, last ends in '.' */
	currow = table;
	while (currow) {
		curcell = currow->cell;
		while (curcell) {
			printf("%c%s", "lcr"[curcell->align],
				curcell->next ? " " : "");
			curcell = curcell->next;
		}
		puts(currow->next ? "" : ".");
		currow = currow->next;
	}

	/* Then contents, freeing each cell and row once it is printed */
	currow = table;
	while (currow) {
		curcell = currow->cell;
		puts("T{");
		while (curcell) {
			/* Feed the stored text back through the inline parser. */
			parser_pushstr(p, curcell->contents->str);
			parse_text(p);
			if (curcell->next)
				puts("\nT}\tT{");
			else
				fputs("\nT}", stdout);

			struct table_cell *prev = curcell;
			curcell = curcell->next;
			str_free(prev->contents);
			free(prev);
		}
		putchar('\n');
		struct table_row *prev = currow;
		currow = currow->next;
		free(prev);
	}

	puts(".TE");
	puts(".Pp");
}
    587 
    588 void
    589 parse_document(struct parser *p)
    590 {
    591 	parse_preamble(p);
    592 	uint32_t ch;
    593 	int indent = 0;
    594 	do {
    595 		parse_indent(p, &indent, true);
    596 		if ((ch = parser_getch(p)) == UTF8_INVALID)
    597 			break;
    598 		switch (ch) {
    599 		case ';':
    600 			if ((ch = parser_getch(p)) != ' ')
    601 				parser_fatal(p, "Expected space after ; to begin comment");
    602 			do
    603 				ch = parser_getch(p);
    604 			while (ch != UTF8_INVALID && ch != '\n');
    605 			break;
    606 		case '#':
    607 			if (indent != 0) {
    608 				parser_pushch(p, ch);
    609 				parse_text(p);
    610 				break;
    611 			}
    612 			parse_heading(p);
    613 			break;
    614 		case '-':
    615 			parse_list(p, &indent, BULLETED);
    616 			break;
    617 		case '.':
    618 			if ((ch = parser_getch(p)) == ' ') {
    619 				parser_pushch(p, ch);
    620 				parse_list(p, &indent, NUMBERED);
    621 			} else {
    622 				parser_pushch(p, ch);
    623 				parse_text(p);
    624 			}
    625 			break;
    626 		case '`':
    627 			parse_literal(p, &indent);
    628 			break;
    629 		case '[':
    630 		case '|':
    631 		case ']':
    632 			if (indent != 0)
    633 				parser_fatal(p, "Tables cannot be indented");
    634 			parse_table(p, ch);
    635 			break;
    636 		case ' ':
    637 			parser_fatal(p, "Tabs are required for indentation");
    638 			break;
    639 		case '\n':
    640 			if (p->flags) {
    641 				char error[512];
    642 				snprintf(error, sizeof(error), "Expected %c before starting "
    643 						"new paragraph (began with %c at %d:%d)",
    644 						p->flags == FORMAT_BOLD ? '*' : '_',
    645 						p->flags == FORMAT_BOLD ? '*' : '_',
    646 						p->fmt_line, p->fmt_col);
    647 				parser_fatal(p, error);
    648 			}
    649 			puts(".Pp");
    650 			break;
    651 		default:
    652 			parser_pushch(p, ch);
    653 			parse_text(p);
    654 			break;
    655 		}
    656 	} while (ch != UTF8_INVALID);
    657 }