parser.c (13431B)
1 #include "config.h" 2 #include <ctype.h> 3 #if HAVE_ERR 4 #include <err.h> 5 #endif 6 #include <errno.h> 7 #include <limits.h> 8 #include <stdbool.h> 9 #include <stdint.h> 10 #include <stdio.h> 11 #include <stdlib.h> 12 #include <string.h> 13 #include <time.h> 14 #include "utf8.h" 15 #include "string.h" 16 #include "parser.h" 17 18 void 19 parser_fatal(struct parser *p, const char *err) 20 { 21 fclose(p->input); 22 errx(EXIT_FAILURE, "%s:%d:%d: %s", p->fname, p->line, p->col, err); 23 } 24 25 uint32_t 26 parser_getch(struct parser *p) 27 { 28 if (p->qhead) { 29 return p->queue[--p->qhead]; 30 } else if (p->str) { 31 uint32_t ch = utf8_decode(&p->str); 32 if (!ch || ch == UTF8_INVALID) { 33 p->str = NULL; 34 return UTF8_INVALID; 35 } 36 return ch; 37 } 38 uint32_t ch = utf8_fgetch(p->input); 39 if (ch == '\n') { 40 p->col = 0; 41 ++p->line; 42 } else { 43 ++p->col; 44 } 45 return ch; 46 } 47 48 void 49 parser_pushch(struct parser *p, uint32_t ch) 50 { 51 if (ch != UTF8_INVALID) 52 p->queue[p->qhead++] = ch; 53 } 54 55 void 56 parser_pushstr(struct parser *p, const char *str) 57 { 58 p->str = str; 59 } 60 61 int 62 parse_section(struct parser *p) 63 { 64 str_t *section = str_create(); 65 uint32_t ch; 66 int sec; 67 const char *errstr; 68 while ((ch = parser_getch(p)) != UTF8_INVALID) { 69 if (ch < 0x80 && isdigit(ch)) { 70 str_append_ch(section, ch); 71 } else if (ch == ')') { 72 if (!section->str) 73 break; 74 sec = strtonum(section->str, 0, 9, &errstr); 75 if (errstr != NULL) 76 parser_fatal(p, "Expected section between 0 and 9"); 77 str_free(section); 78 return sec; 79 } else { 80 parser_fatal(p, "Expected digit or ')'"); 81 } 82 } 83 84 parser_fatal(p, "Expected manual section"); 85 return -1; 86 } 87 88 void 89 parse_preamble(struct parser *p) 90 { 91 str_t *name = str_create(); 92 int section = -1; 93 uint32_t ch; 94 time_t date_time; 95 char date[256]; 96 char *offset; 97 char *source_date_epoch = getenv("SOURCE_DATE_EPOCH"); 98 if (source_date_epoch != NULL) { 99 long long epoch; 100 const char *errstr; 101 epoch = strtonum(source_date_epoch, 1, LLONG_MAX, &errstr); 102 if (errstr != NULL) 103 errx(EXIT_FAILURE, "$SOURCE_DATE_EPOCH is %s: %s", 104 errstr, source_date_epoch); 105 date_time = epoch; 106 } else { 107 date_time = time(NULL); 108 } 109 struct tm *date_tm = gmtime(&date_time); 110 strftime(date, sizeof(date), "%B %d, %Y", date_tm); 111 112 while ((ch = parser_getch(p)) != UTF8_INVALID) { 113 if ((ch < 0x80 && isalnum(ch)) || 114 ch == '_' || ch == '-' || ch == '.') { 115 str_append_ch(name, ch); 116 } else if (ch == '(') { 117 section = parse_section(p); 118 } else if (ch == '\n') { 119 if (name->len == 0) 120 parser_fatal(p, "Expected preamble"); 121 if (section == -1) 122 parser_fatal(p, "Expected manual section"); 123 124 /* 125 * the date format for Dd is in the form: 126 * month date, year 127 * where date is a number from 1-31. 128 * strftime's %d inserts a leading '0' if the date is 129 * between 1-9. So, check for this '0' and skip it. 130 */ 131 fputs(".Dd ", stdout); 132 for(offset = date;;) { 133 putchar(*offset); 134 if (*(offset++) == ' ') { 135 if (*offset == '0') 136 offset++; 137 break; 138 } 139 } 140 puts(offset); 141 str_toupper(name); 142 printf(".Dt %s %d\n", name->str, section); 143 puts(".Os"); 144 break; 145 } else { 146 parser_fatal(p, "Expected character in [A-Za-z0-9-]"); 147 } 148 } 149 150 str_free(name); 151 } 152 153 void 154 parse_format(struct parser *p, enum formatting fmt) 155 { 156 char formats[FORMAT_LAST] = { 157 [FORMAT_BOLD] = 'B', 158 [FORMAT_UNDERLINE] = 'I', 159 }; 160 char error[512]; 161 if (p->flags) { 162 if ((p->flags & ~fmt)) { 163 snprintf(error, sizeof(error), 164 "Cannot nest inline formatting " 165 "(began with %c at %d:%d)", 166 p->flags == FORMAT_BOLD ? '*' : '_', 167 p->fmt_line, p->fmt_col); 168 parser_fatal(p, error); 169 } 170 fputs("\\fR", stdout); 171 } else { 172 printf("\\f%c", formats[fmt]); 173 p->fmt_line = p->line; 174 p->fmt_col = p->col; 175 } 176 p->flags ^= fmt; 177 } 178 179 void 180 parse_linebreak(struct parser *p) 181 { 182 uint32_t plus = parser_getch(p); 183 if (plus != '+') { 184 putchar('+'); 185 parser_pushch(p, plus); 186 return; 187 } 188 uint32_t lf = parser_getch(p); 189 if (lf != '\n') { 190 putchar('+'); 191 parser_pushch(p, plus); 192 parser_pushch(p, '\n'); 193 return; 194 } 195 uint32_t ch = parser_getch(p); 196 if (ch == '\n') 197 parser_fatal(p, "Explicit line breaks cannot be followed by a blank line"); 198 parser_pushch(p, ch); 199 puts("\n.br"); 200 } 201 202 void 203 parse_text(struct parser *p) 204 { 205 uint32_t ch, next, last = ' '; 206 int i = 0; 207 while ((ch = parser_getch(p)) != UTF8_INVALID) { 208 switch (ch) { 209 case '\\': 210 ch = parser_getch(p); 211 if (ch == UTF8_INVALID) 212 parser_fatal(p, "Unexpected EOF"); 213 else if (ch == '\\') 214 fputs("\\e", stdout); 215 else 216 utf8_putch(ch); 217 break; 218 case '*': 219 parse_format(p, FORMAT_BOLD); 220 break; 221 case '_': 222 next = parser_getch(p); 223 if (!isalnum(last) 224 || ((p->flags & FORMAT_UNDERLINE) && 225 !isalnum(next))) 226 parse_format(p, FORMAT_UNDERLINE); 227 else 228 utf8_putch(ch); 229 if (next == UTF8_INVALID) 230 return; 231 parser_pushch(p, next); 232 break; 233 case '+': 234 parse_linebreak(p); 235 break; 236 case '\n': 237 utf8_putch(ch); 238 return; 239 case '.': 240 if (!i) { 241 /* Escape lone dots */ 242 fputs("\\&.", stdout); 243 break; 244 } 245 /* fallthrough */ 246 default: 247 last = ch; 248 utf8_putch(ch); 249 break; 250 } 251 ++i; 252 } 253 } 254 255 void 256 parse_heading(struct parser *p) 257 { 258 uint32_t ch; 259 int level = 1; 260 while ((ch = parser_getch(p)) != UTF8_INVALID) { 261 if (ch == '#') 262 ++level; 263 else if (ch == ' ') 264 break; 265 else 266 parser_fatal(p, "Invalid start of heading (probably needs a space)"); 267 } 268 switch (level) { 269 case 1: 270 fputs(".Sh ", stdout); 271 break; 272 case 2: 273 fputs(".Ss ", stdout); 274 break; 275 default: 276 parser_fatal(p, "Only headings up to two levels deep are permitted"); 277 break; 278 } 279 while ((ch = parser_getch(p)) != UTF8_INVALID) { 280 utf8_putch(ch); 281 if (ch == '\n') 282 break; 283 } 284 } 285 286 int 287 parse_indent(struct parser *p, int *indent, bool write) 288 { 289 int i = 0; 290 uint32_t ch; 291 while ((ch = parser_getch(p)) == '\t') 292 ++i; 293 294 parser_pushch(p, ch); 295 if (ch == '\n' && *indent != 0) { 296 /* Don't change indent when we encounter empty lines */ 297 return *indent; 298 } else if (write) { 299 if (i < *indent) 300 for (int j = *indent; i < j; --j) 301 puts(".Ed"); 302 else if (i == *indent + 1) 303 puts(".Bd -ragged -offset indent -compact"); 304 else if (i != *indent && ch == '\t') 305 parser_fatal(p, "Indented by an amount greater than 1"); 306 } 307 *indent = i; 308 return i; 309 } 310 311 enum list_type { 312 BULLETED, 313 NUMBERED, 314 }; 315 316 void 317 parse_list(struct parser *p, int *indent, enum list_type t) 318 { 319 uint32_t ch; 320 int oldindent = *indent; 321 if ((ch = parser_getch(p)) != ' ') 322 parser_fatal(p, "Expected space before start of list entry"); 323 324 printf(".Bl -%s -compact\n", 325 t == BULLETED ? "bullet" : "enum"); 326 puts(".It"); 327 328 parse_text(p); 329 do { 330 parse_indent(p, indent, false); 331 if (*indent > oldindent) { 332 parser_getch(p); 333 parse_list(p, indent, t); 334 } else if (*indent < oldindent) { 335 goto ret; 336 } 337 if ((ch = parser_getch(p)) == UTF8_INVALID) 338 break; 339 switch (ch) { 340 case ' ': 341 if ((ch = parser_getch(p)) != ' ') 342 parser_fatal(p, "Expected two spaces for list entry continuation"); 343 parse_text(p); 344 break; 345 case '-': 346 case '.': 347 if ((ch = parser_getch(p)) != ' ') 348 parser_fatal(p, "Expected space before start of list entry"); 349 puts(".It"); 350 parse_text(p); 351 break; 352 default: 353 parser_pushch(p, ch); 354 goto ret; 355 } 356 } while (ch != UTF8_INVALID); 357 ret: 358 puts(".El"); 359 --*indent; 360 } 361 362 void 363 parse_literal(struct parser *p, int *indent) 364 { 365 uint32_t ch; 366 if ((ch = parser_getch(p)) != '`' || 367 (ch = parser_getch(p)) != '`' || 368 (ch = parser_getch(p)) != '\n') 369 parser_fatal(p, "Expected ``` and a newline to begin literal block"); 370 371 int stops = 0; 372 puts(".Bd -literal -offset indent"); 373 do { 374 if ((ch = parser_getch(p)) == UTF8_INVALID) 375 break; 376 if (ch == '`') { 377 if (++stops == 3) { 378 if ((ch = parser_getch(p)) != '\n') 379 parser_fatal(p, "Expected literal block to end with newline"); 380 puts(".Ed"); 381 return; 382 } 383 } else { 384 while (stops != 0) { 385 putchar('`'); 386 --stops; 387 } 388 switch (ch) { 389 case '.': 390 fputs("\\&.", stdout); 391 break; 392 case '\\': 393 ch = parser_getch(p); 394 if (ch == UTF8_INVALID) 395 parser_fatal(p, "Unexpected EOF"); 396 else if (ch == '\\') 397 fputs("\\e", stdout); 398 else 399 utf8_putch(ch); 400 break; 401 default: 402 utf8_putch(ch); 403 break; 404 } 405 } 406 } while (ch != UTF8_INVALID); 407 } 408 409 enum table_align { 410 ALIGN_LEFT, 411 ALIGN_CENTER, 412 ALIGN_RIGHT, 413 }; 414 415 struct table_row { 416 struct table_cell *cell; 417 struct table_row *next; 418 }; 419 420 struct table_cell { 421 enum table_align align; 422 str_t *contents; 423 struct table_cell *next; 424 }; 425 426 void 427 parse_table(struct parser *p, uint32_t style) 428 { 429 struct table_row *table = NULL; 430 struct table_row *currow = NULL, *prevrow = NULL; 431 struct table_cell *curcell = NULL; 432 int column = 0; 433 uint32_t ch; 434 parser_pushch(p, '|'); 435 436 do { 437 if ((ch = parser_getch(p)) == UTF8_INVALID) 438 break; 439 switch (ch) { 440 case '\n': 441 goto commit_table; 442 case '|': 443 prevrow = currow; 444 if ((currow = calloc(1, sizeof(struct table_row))) == NULL) 445 err(EXIT_FAILURE, NULL); 446 /* TODO: Verify the number of columns match */ 447 if (prevrow) 448 prevrow->next = currow; 449 if ((curcell = calloc(1, sizeof(struct table_cell))) == NULL) 450 err(EXIT_FAILURE, NULL); 451 currow->cell = curcell; 452 column = 0; 453 if (!table) 454 table = currow; 455 break; 456 case ':': 457 if (!currow) { 458 parser_fatal(p, "Cannot start a column without " 459 "starting a row first"); 460 } else { 461 struct table_cell *prev = curcell; 462 if ((curcell = calloc(1, sizeof(struct table_cell))) == NULL) 463 err(EXIT_FAILURE, NULL); 464 if (prev) 465 prev->next = curcell; 466 ++column; 467 } 468 break; 469 case ' ': 470 goto continue_cell; 471 default: 472 parser_fatal(p, "Expected either '|' or ':'"); 473 break; 474 } 475 if ((ch = parser_getch(p)) == UTF8_INVALID) 476 break; 477 switch (ch) { 478 case '[': 479 curcell->align = ALIGN_LEFT; 480 break; 481 case '-': 482 curcell->align = ALIGN_CENTER; 483 break; 484 case ']': 485 curcell->align = ALIGN_RIGHT; 486 break; 487 case ' ': 488 if (prevrow) { 489 struct table_cell *pcell = prevrow->cell; 490 for (int i = 0; i <= column && pcell; ++i, pcell = pcell->next) { 491 if (i == column) { 492 curcell->align = pcell->align; 493 break; 494 } 495 } 496 } else { 497 parser_fatal(p, "No previous row to infer alignment from"); 498 } 499 break; 500 default: 501 parser_fatal(p, "Expected one of '[', '-', ']', or ' '"); 502 break; 503 } 504 curcell->contents = str_create(); 505 continue_cell: 506 switch (ch = parser_getch(p)) { 507 case ' ': 508 // Read out remainder of the text 509 while ((ch = parser_getch(p)) != UTF8_INVALID) { 510 switch (ch) { 511 case '\n': 512 goto commit_cell; 513 default: 514 str_append_ch(curcell->contents, ch); 515 break; 516 } 517 } 518 break; 519 case '\n': 520 goto commit_cell; 521 default: 522 parser_fatal(p, "Expected ' ' or a newline"); 523 break; 524 } 525 commit_cell: 526 if (strstr(curcell->contents->str, "T{") 527 || strstr(curcell->contents->str, "T}")) 528 parser_fatal(p, "Cells cannot contain T{ or T} " 529 "due to roff limitations"); 530 } while (ch != UTF8_INVALID); 531 commit_table: 532 533 if (ch == UTF8_INVALID) 534 return; 535 536 puts(".TS"); 537 538 switch (style) { 539 case '[': 540 fputs("allbox;", stdout); 541 break; 542 case ']': 543 fputs("box;", stdout); 544 break; 545 } 546 547 // Print alignments first 548 currow = table; 549 while (currow) { 550 curcell = currow->cell; 551 while (curcell) { 552 printf("%c%s", "lcr"[curcell->align], 553 curcell->next ? " " : ""); 554 curcell = curcell->next; 555 } 556 puts(currow->next ? "" : "."); 557 currow = currow->next; 558 } 559 560 // Then contents 561 currow = table; 562 while (currow) { 563 curcell = currow->cell; 564 puts("T{"); 565 while (curcell) { 566 parser_pushstr(p, curcell->contents->str); 567 parse_text(p); 568 if (curcell->next) 569 puts("\nT}\tT{"); 570 else 571 fputs("\nT}", stdout); 572 573 struct table_cell *prev = curcell; 574 curcell = curcell->next; 575 str_free(prev->contents); 576 free(prev); 577 } 578 putchar('\n'); 579 struct table_row *prev = currow; 580 currow = currow->next; 581 free(prev); 582 } 583 584 puts(".TE"); 585 puts(".Pp"); 586 } 587 588 void 589 parse_document(struct parser *p) 590 { 591 parse_preamble(p); 592 uint32_t ch; 593 int indent = 0; 594 do { 595 parse_indent(p, &indent, true); 596 if ((ch = parser_getch(p)) == UTF8_INVALID) 597 break; 598 switch (ch) { 599 case ';': 600 if ((ch = parser_getch(p)) != ' ') 601 parser_fatal(p, "Expected space after ; to begin comment"); 602 do 603 ch = parser_getch(p); 604 while (ch != UTF8_INVALID && ch != '\n'); 605 break; 606 case '#': 607 if (indent != 0) { 608 parser_pushch(p, ch); 609 parse_text(p); 610 break; 611 } 612 parse_heading(p); 613 break; 614 case '-': 615 parse_list(p, &indent, BULLETED); 616 break; 617 case '.': 618 if ((ch = parser_getch(p)) == ' ') { 619 parser_pushch(p, ch); 620 parse_list(p, &indent, NUMBERED); 621 } else { 622 parser_pushch(p, ch); 623 parse_text(p); 624 } 625 break; 626 case '`': 627 parse_literal(p, &indent); 628 break; 629 case '[': 630 case '|': 631 case ']': 632 if (indent != 0) 633 parser_fatal(p, "Tables cannot be indented"); 634 parse_table(p, ch); 635 break; 636 case ' ': 637 parser_fatal(p, "Tabs are required for indentation"); 638 break; 639 case '\n': 640 if (p->flags) { 641 char error[512]; 642 snprintf(error, sizeof(error), "Expected %c before starting " 643 "new paragraph (began with %c at %d:%d)", 644 p->flags == FORMAT_BOLD ? '*' : '_', 645 p->flags == FORMAT_BOLD ? '*' : '_', 646 p->fmt_line, p->fmt_col); 647 parser_fatal(p, error); 648 } 649 puts(".Pp"); 650 break; 651 default: 652 parser_pushch(p, ch); 653 parse_text(p); 654 break; 655 } 656 } while (ch != UTF8_INVALID); 657 }