scdoc2mdoc

A fork of scdoc to output mdoc(7)
git clone git://git.sgregoratto.me/scdoc2mdoc
Log | Files | Refs | README | LICENSE

commit 9243fbbbc71594e9f41802bbb3149c22124cd3ad
parent 599f968c0110b8eebe94e7019359e966fc7a53bb
Author: Stephen Gregoratto <dev@sgregoratto.me>
Date:   Sat, 15 Jun 2019 16:08:58 +1000

major build improvements

- Move source files to main dir
- Concat all utf8_*.c into one utf8.c
- Rename unicode.h to utf8.h
- Simplify Makefile

Diffstat:
M Makefile | 25 ++++++++-----------------
A main.c | 716 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/main.c | 715 -------------------------------------------------------------------------------
D src/string.c | 45 ---------------------------------------------
D src/utf8_chsize.c | 14 --------------
D src/utf8_decode.c | 38 --------------------------------------
D src/utf8_encode.c | 30 ------------------------------
D src/utf8_fgetch.c | 27 ---------------------------
D src/utf8_fputch.c | 10 ----------
D src/utf8_size.c | 27 ---------------------------
D src/util.c | 71 -----------------------------------------------------------------------
R include/str.h -> str.h | 0
A string.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
A utf8.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R include/unicode.h -> utf8.h | 0
A util.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R include/util.h -> util.h | 0
17 files changed, 974 insertions(+), 994 deletions(-)

diff --git a/Makefile b/Makefile @@ -7,27 +7,18 @@ _INSTDIR=$(DESTDIR)$(PREFIX) BINDIR?=$(_INSTDIR)/bin MANDIR?=$(_INSTDIR)/share/man PCDIR?=$(_INSTDIR)/lib/pkgconfig -OUTDIR=.build HOST_SCDOC=./scdoc .DEFAULT_GOAL=all -OBJECTS=\ - $(OUTDIR)/main.o \ - $(OUTDIR)/string.o \ - $(OUTDIR)/utf8_chsize.o \ - $(OUTDIR)/utf8_decode.o \ - $(OUTDIR)/utf8_encode.o \ - $(OUTDIR)/utf8_fgetch.o \ - $(OUTDIR)/utf8_fputch.o \ - $(OUTDIR)/utf8_size.o \ - $(OUTDIR)/util.o +OBJS = main.o string.o utf8.o util.o -$(OUTDIR)/%.o: src/%.c - @mkdir -p $(OUTDIR) - $(CC) -std=c99 -pedantic -c -o $@ $(CFLAGS) $(INCLUDE) $< +main.o: str.h utf8.h util.h +string.o: str.h utf8.h +utf8.o: utf8.h +util.o: utf8.h util.h -scdoc: $(OBJECTS) - $(CC) $(LDFLAGS) -o $@ $^ +scdoc: $(OBJS) + $(CC) $(LDFLAGS) -o $@ $(OBJS) scdoc.1: scdoc.1.scd $(HOST_SCDOC) $(HOST_SCDOC) < $< > $@ @@ -41,7 +32,7 @@ scdoc.pc: scdoc.pc.in all: scdoc scdoc.1 scdoc.5 scdoc.pc clean: - rm -rf $(OUTDIR) scdoc scdoc.1 scdoc.5 scdoc.pc + rm -rf $(OBJS) scdoc scdoc.1 scdoc.5 scdoc.pc install: all mkdir -p $(BINDIR) $(MANDIR)/man1 $(MANDIR)/man5 $(PCDIR) diff --git a/main.c b/main.c @@ -0,0 +1,716 @@ +#define _XOPEN_SOURCE 600 +#include <assert.h> +#include <ctype.h> +#include <errno.h> +#include <limits.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#include "str.h" +#include "utf8.h" +#include "util.h" + +char *strstr(const char *haystack, const char *needle); +char *strerror(int errnum); + +static int parse_section(struct parser *p) { + str_t *section = str_create(); + uint32_t ch; + while ((ch = parser_getch(p)) != UTF8_INVALID) { + if (ch < 0x80 && isdigit(ch)) { + int ret = str_append_ch(section, ch); + assert(ret != -1); + } else if (ch == ')') { + if (!section->str) { + break; + } + int sec = strtol(section->str, NULL, 10); + if (sec < 0 || sec > 9) { + parser_fatal(p, "Expected section between 0 and 9"); + break; + } + str_free(section); + 
return sec; + } else { + parser_fatal(p, "Expected digit or )"); + break; + } + }; + parser_fatal(p, "Expected manual section"); + return -1; +} + +static str_t *parse_extra(struct parser *p) { + str_t *extra = str_create(); + int ret = str_append_ch(extra, '"'); + assert(ret != -1); + uint32_t ch; + while ((ch = parser_getch(p)) != UTF8_INVALID) { + if (ch == '"') { + ret = str_append_ch(extra, ch); + assert(ret != -1); + return extra; + } else if (ch == '\n') { + parser_fatal(p, "Unclosed extra preamble field"); + break; + } else { + ret = str_append_ch(extra, ch); + assert(ret != -1); + } + } + str_free(extra); + return NULL; +} + +static void parse_preamble(struct parser *p) { + str_t *name = str_create(); + int ex = 0; + str_t *extras[2] = { NULL }; + int section = -1; + uint32_t ch; + time_t date_time; + char date[256]; + char *source_date_epoch = getenv("SOURCE_DATE_EPOCH"); + if (source_date_epoch != NULL) { + unsigned long long epoch; + char *endptr; + errno = 0; + epoch = strtoull(source_date_epoch, &endptr, 10); + if ((errno == ERANGE && (epoch == ULLONG_MAX || epoch == 0)) + || (errno != 0 && epoch == 0)) { + fprintf(stderr, "$SOURCE_DATE_EPOCH: strtoull: %s\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + if (endptr == source_date_epoch) { + fprintf(stderr, "$SOURCE_DATE_EPOCH: No digits were found: %s\n", + endptr); + exit(EXIT_FAILURE); + } + if (*endptr != '\0') { + fprintf(stderr, "$SOURCE_DATE_EPOCH: Trailing garbage: %s\n", + endptr); + exit(EXIT_FAILURE); + } + if (epoch > ULONG_MAX) { + fprintf(stderr, "$SOURCE_DATE_EPOCH: value must be smaller than or " + "equal to %lu but was found to be: %llu \n", + ULONG_MAX, epoch); + exit(EXIT_FAILURE); + } + date_time = epoch; + } else { + date_time = time(NULL); + } + struct tm *date_tm = gmtime(&date_time); + strftime(date, sizeof(date), "%F", date_tm); + while ((ch = parser_getch(p)) != UTF8_INVALID) { + if ((ch < 0x80 && isalnum(ch)) || ch == '_' || ch == '-' || ch == '.') { + int ret = 
str_append_ch(name, ch); + assert(ret != -1); + } else if (ch == '(') { + section = parse_section(p); + } else if (ch == '"') { + if (ex == 2) { + parser_fatal(p, "Too many extra preamble fields"); + } + extras[ex++] = parse_extra(p); + } else if (ch == '\n') { + if (name->len == 0) { + parser_fatal(p, "Expected preamble"); + } + if (section == -1) { + parser_fatal(p, "Expected manual section"); + } + char sec[2] = { '0' + section, 0 }; + char *ex2 = extras[0] != NULL ? extras[0]->str : NULL; + char *ex3 = extras[1] != NULL ? extras[1]->str : NULL; + fprintf(p->output, ".TH \"%s\" \"%s\" \"%s\"", name->str, sec, date); + /* ex2 and ex3 are already double-quoted */ + if (ex2) { + fprintf(p->output, " %s", ex2); + } + if (ex3) { + fprintf(p->output, " %s", ex3); + } + fprintf(p->output, "\n"); + break; + } + } + str_free(name); + for (int i = 0; i < 2; ++i) { + if (extras[i] != NULL) { + str_free(extras[i]); + } + } +} + +static void parse_format(struct parser *p, enum formatting fmt) { + char formats[FORMAT_LAST] = { + [FORMAT_BOLD] = 'B', + [FORMAT_UNDERLINE] = 'I', + }; + char error[512]; + if (p->flags) { + if ((p->flags & ~fmt)) { + snprintf(error, sizeof(error), "Cannot nest inline formatting " + "(began with %c at %d:%d)", + p->flags == FORMAT_BOLD ? 
'*' : '_', + p->fmt_line, p->fmt_col); + parser_fatal(p, error); + } + fprintf(p->output, "\\fR"); + } else { + fprintf(p->output, "\\f%c", formats[fmt]); + p->fmt_line = p->line; + p->fmt_col = p->col; + } + p->flags ^= fmt; +} + +static void parse_linebreak(struct parser *p) { + uint32_t plus = parser_getch(p); + if (plus != '+') { + fprintf(p->output, "+"); + parser_pushch(p, plus); + return; + } + uint32_t lf = parser_getch(p); + if (lf != '\n') { + fprintf(p->output, "+"); + parser_pushch(p, plus); + parser_pushch(p, '\n'); + return; + } + uint32_t ch = parser_getch(p); + if (ch == '\n') { + parser_fatal( + p, "Explicit line breaks cannot be followed by a blank line"); + } + parser_pushch(p, ch); + fprintf(p->output, "\n.br\n"); +} + +static void parse_text(struct parser *p) { + uint32_t ch, next, last = ' '; + int i = 0; + while ((ch = parser_getch(p)) != UTF8_INVALID) { + switch (ch) { + case '\\': + ch = parser_getch(p); + if (ch == UTF8_INVALID) { + parser_fatal(p, "Unexpected EOF"); + } else if (ch == '\\') { + fprintf(p->output, "\\\\"); + } else { + utf8_fputch(p->output, ch); + } + break; + case '*': + parse_format(p, FORMAT_BOLD); + break; + case '_': + next = parser_getch(p); + if (!isalnum(last) || ((p->flags & FORMAT_UNDERLINE) && !isalnum(next))) { + parse_format(p, FORMAT_UNDERLINE); + } else { + utf8_fputch(p->output, ch); + } + if (next == UTF8_INVALID) { + return; + } + parser_pushch(p, next); + break; + case '+': + parse_linebreak(p); + break; + case '\n': + utf8_fputch(p->output, ch); + return; + case '.': + if (!i) { + // Escape . 
if it's the first character + fprintf(p->output, "\\&."); + break; + } + /* fallthrough */ + default: + last = ch; + utf8_fputch(p->output, ch); + break; + } + ++i; + } +} + +static void parse_heading(struct parser *p) { + uint32_t ch; + int level = 1; + while ((ch = parser_getch(p)) != UTF8_INVALID) { + if (ch == '#') { + ++level; + } else if (ch == ' ') { + break; + } else { + parser_fatal(p, "Invalid start of heading (probably needs a space)"); + } + } + switch (level) { + case 1: + fprintf(p->output, ".SH "); + break; + case 2: + fprintf(p->output, ".SS "); + break; + default: + parser_fatal(p, "Only headings up to two levels deep are permitted"); + break; + } + while ((ch = parser_getch(p)) != UTF8_INVALID) { + utf8_fputch(p->output, ch); + if (ch == '\n') { + break; + } + } +} + +static int parse_indent(struct parser *p, int *indent, bool write) { + int i = 0; + uint32_t ch; + while ((ch = parser_getch(p)) == '\t') { + ++i; + } + parser_pushch(p, ch); + if (ch == '\n' && *indent != 0) { + // Don't change indent when we encounter empty lines + return *indent; + } + if (write) { + if (i < *indent) { + for (int j = *indent; i < j; --j) { + roff_macro(p, "RE", NULL); + } + } else if (i == *indent + 1) { + fprintf(p->output, ".RS 4\n"); + } else if (i != *indent && ch == '\t') { + parser_fatal(p, "Indented by an amount greater than 1"); + } + } + *indent = i; + return i; +} + +static void list_header(struct parser *p, int *num) { + fprintf(p->output, ".RS 4\n"); + fprintf(p->output, ".ie n \\{\\\n"); + if (*num == -1) { + fprintf(p->output, "\\h'-0%d'%s\\h'+03'\\c\n", + *num >= 10 ? 5 : 4, "\\(bu"); + } else { + fprintf(p->output, "\\h'-0%d'%d.\\h'+03'\\c\n", + *num >= 10 ? 5 : 4, *num); + } + fprintf(p->output, ".\\}\n"); + fprintf(p->output, ".el \\{\\\n"); + if (*num == -1) { + fprintf(p->output, ".IP %s 4\n", "\\(bu"); + } else { + fprintf(p->output, ".IP %d. 
4\n", *num); + *num = *num + 1; + } + fprintf(p->output, ".\\}\n"); +} + +static void parse_list(struct parser *p, int *indent, int num) { + uint32_t ch; + if ((ch = parser_getch(p)) != ' ') { + parser_fatal(p, "Expected space before start of list entry"); + } + list_header(p, &num); + parse_text(p); + bool closed = false; + do { + parse_indent(p, indent, true); + if ((ch = parser_getch(p)) == UTF8_INVALID) { + break; + } + switch (ch) { + case ' ': + if ((ch = parser_getch(p)) != ' ') { + parser_fatal(p, "Expected two spaces for list entry continuation"); + } + parse_text(p); + break; + case '-': + case '.': + if ((ch = parser_getch(p)) != ' ') { + parser_fatal(p, "Expected space before start of list entry"); + } + if (!closed) { + roff_macro(p, "RE", NULL); + } + list_header(p, &num); + parse_text(p); + closed = false; + break; + default: + fprintf(p->output, "\n"); + parser_pushch(p, ch); + goto ret; + } + } while (ch != UTF8_INVALID); +ret: + if (!closed) { + roff_macro(p, "RE", NULL); + } +} + +static void parse_literal(struct parser *p, int *indent) { + uint32_t ch; + if ((ch = parser_getch(p)) != '`' || + (ch = parser_getch(p)) != '`' || + (ch = parser_getch(p)) != '\n') { + parser_fatal(p, "Expected ``` and a newline to begin literal block"); + } + int stops = 0; + roff_macro(p, "nf", NULL); + fprintf(p->output, ".RS 4\n"); + do { + int _indent = *indent; + parse_indent(p, &_indent, false); + if (_indent < *indent) { + parser_fatal(p, "Cannot deindent in literal block"); + } + while (_indent > *indent) { + --_indent; + fprintf(p->output, "\t"); + } + if ((ch = parser_getch(p)) == UTF8_INVALID) { + break; + } + if (ch == '`') { + if (++stops == 3) { + if ((ch = parser_getch(p)) != '\n') { + parser_fatal(p, "Expected literal block to end with newline"); + } + roff_macro(p, "fi", NULL); + roff_macro(p, "RE", NULL); + return; + } + } else { + while (stops != 0) { + fputc('`', p->output); + --stops; + } + switch (ch) { + case '.': + fprintf(p->output, "\\&."); + 
break; + case '\\': + ch = parser_getch(p); + if (ch == UTF8_INVALID) { + parser_fatal(p, "Unexpected EOF"); + } else if (ch == '\\') { + fprintf(p->output, "\\\\"); + } else { + utf8_fputch(p->output, ch); + } + break; + default: + utf8_fputch(p->output, ch); + break; + } + } + } while (ch != UTF8_INVALID); +} + +enum table_align { + ALIGN_LEFT, + ALIGN_CENTER, + ALIGN_RIGHT, +}; + +struct table_row { + struct table_cell *cell; + struct table_row *next; +}; + +struct table_cell { + enum table_align align; + str_t *contents; + struct table_cell *next; +}; + +static void parse_table(struct parser *p, uint32_t style) { + struct table_row *table = NULL; + struct table_row *currow = NULL, *prevrow = NULL; + struct table_cell *curcell = NULL; + int column = 0; + uint32_t ch; + parser_pushch(p, '|'); + + do { + if ((ch = parser_getch(p)) == UTF8_INVALID) { + break; + } + switch (ch) { + case '\n': + goto commit_table; + case '|': + prevrow = currow; + currow = calloc(1, sizeof(struct table_row)); + if (prevrow) { + // TODO: Verify the number of columns match + prevrow->next = currow; + } + curcell = calloc(1, sizeof(struct table_cell)); + currow->cell = curcell; + column = 0; + if (!table) { + table = currow; + } + break; + case ':': + if (!currow) { + parser_fatal(p, "Cannot start a column without " + "starting a row first"); + } else { + struct table_cell *prev = curcell; + curcell = calloc(1, sizeof(struct table_cell)); + if (prev) { + prev->next = curcell; + } + ++column; + } + break; + case ' ': + goto continue_cell; + default: + parser_fatal(p, "Expected either '|' or ':'"); + break; + } + if ((ch = parser_getch(p)) == UTF8_INVALID) { + break; + } + switch (ch) { + case '[': + curcell->align = ALIGN_LEFT; + break; + case '-': + curcell->align = ALIGN_CENTER; + break; + case ']': + curcell->align = ALIGN_RIGHT; + break; + case ' ': + if (prevrow) { + struct table_cell *pcell = prevrow->cell; + for (int i = 0; i <= column && pcell; ++i, pcell = pcell->next) { + if (i 
== column) { + curcell->align = pcell->align; + break; + } + } + } else { + parser_fatal(p, "No previous row to infer alignment from"); + } + break; + default: + parser_fatal(p, "Expected one of '[', '-', ']', or ' '"); + break; + } + curcell->contents = str_create(); +continue_cell: + switch (ch = parser_getch(p)) { + case ' ': + // Read out remainder of the text + while ((ch = parser_getch(p)) != UTF8_INVALID) { + switch (ch) { + case '\n': + goto commit_cell; + default:; + int ret = str_append_ch(curcell->contents, ch); + assert(ret != -1); + break; + } + } + break; + case '\n': + goto commit_cell; + default: + parser_fatal(p, "Expected ' ' or a newline"); + break; + } +commit_cell: + if (strstr(curcell->contents->str, "T{") + || strstr(curcell->contents->str, "T}")) { + parser_fatal(p, "Cells cannot contain T{ or T} " + "due to roff limitations"); + } + } while (ch != UTF8_INVALID); +commit_table: + + if (ch == UTF8_INVALID) { + return; + } + + roff_macro(p, "TS", NULL); + + switch (style) { + case '[': + fprintf(p->output, "allbox;"); + break; + case ']': + fprintf(p->output, "box;"); + break; + } + + // Print alignments first + currow = table; + while (currow) { + curcell = currow->cell; + while (curcell) { + fprintf(p->output, "%c%s", "lcr"[curcell->align], + curcell->next ? " " : ""); + curcell = curcell->next; + } + fprintf(p->output, "%s\n", currow->next ? 
"" : "."); + currow = currow->next; + } + + // Then contents + currow = table; + while (currow) { + curcell = currow->cell; + fprintf(p->output, "T{\n"); + while (curcell) { + parser_pushstr(p, curcell->contents->str); + parse_text(p); + if (curcell->next) { + fprintf(p->output, "\nT}\tT{\n"); + } else { + fprintf(p->output, "\nT}"); + } + struct table_cell *prev = curcell; + curcell = curcell->next; + str_free(prev->contents); + free(prev); + } + fprintf(p->output, "\n"); + struct table_row *prev = currow; + currow = currow->next; + free(prev); + } + + roff_macro(p, "TE", NULL); + fprintf(p->output, ".sp 1\n"); +} + +static void parse_document(struct parser *p) { + uint32_t ch; + int indent = 0; + do { + parse_indent(p, &indent, true); + if ((ch = parser_getch(p)) == UTF8_INVALID) { + break; + } + switch (ch) { + case ';': + if ((ch = parser_getch(p)) != ' ') { + parser_fatal(p, "Expected space after ; to begin comment"); + } + do { + ch = parser_getch(p); + } while (ch != UTF8_INVALID && ch != '\n'); + break; + case '#': + if (indent != 0) { + parser_pushch(p, ch); + parse_text(p); + break; + } + parse_heading(p); + break; + case '-': + parse_list(p, &indent, -1); + break; + case '.': + if ((ch = parser_getch(p)) == ' ') { + parser_pushch(p, ch); + parse_list(p, &indent, 1); + } else { + parser_pushch(p, ch); + parse_text(p); + } + break; + case '`': + parse_literal(p, &indent); + break; + case '[': + case '|': + case ']': + if (indent != 0) { + parser_fatal(p, "Tables cannot be indented"); + } + parse_table(p, ch); + break; + case ' ': + parser_fatal(p, "Tabs are required for indentation"); + break; + case '\n': + if (p->flags) { + char error[512]; + snprintf(error, sizeof(error), "Expected %c before starting " + "new paragraph (began with %c at %d:%d)", + p->flags == FORMAT_BOLD ? '*' : '_', + p->flags == FORMAT_BOLD ? 
'*' : '_', + p->fmt_line, p->fmt_col); + parser_fatal(p, error); + } + roff_macro(p, "P", NULL); + break; + default: + parser_pushch(p, ch); + parse_text(p); + break; + } + } while (ch != UTF8_INVALID); +} + +static void output_scdoc_preamble(struct parser *p) { + fprintf(p->output, ".\\\" Generated by scdoc " VERSION "\n"); + // Fix weird quotation marks + // http://bugs.debian.org/507673 + // http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html + fprintf(p->output, ".ie \\n(.g .ds Aq \\(aq\n"); + fprintf(p->output, ".el .ds Aq '\n"); + // Disable hyphenation: + roff_macro(p, "nh", NULL); + // Disable justification: + roff_macro(p, "ad l", NULL); + fprintf(p->output, ".\\\" Begin generated content:\n"); +} + +int main(int argc, char **argv) { + if (argc == 2 && strcmp(argv[1], "-v") == 0) { + printf("scdoc " VERSION "\n"); + return 0; + } else if (argc > 1) { + fprintf(stderr, "Usage: scdoc < input.scd > output.roff\n"); + return 1; + } + struct parser p = { + .input = stdin, + .output = stdout, + .line = 1, + .col = 1 + }; + output_scdoc_preamble(&p); + parse_preamble(&p); + parse_document(&p); + return 0; +} diff --git a/src/main.c b/src/main.c @@ -1,715 +0,0 @@ -#define _XOPEN_SOURCE 600 -#include <assert.h> -#include <ctype.h> -#include <errno.h> -#include <limits.h> -#include <stdbool.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <time.h> -#include <unistd.h> -#include "str.h" -#include "unicode.h" -#include "util.h" - -char *strstr(const char *haystack, const char *needle); -char *strerror(int errnum); - -static int parse_section(struct parser *p) { - str_t *section = str_create(); - uint32_t ch; - while ((ch = parser_getch(p)) != UTF8_INVALID) { - if (ch < 0x80 && isdigit(ch)) { - int ret = str_append_ch(section, ch); - assert(ret != -1); - } else if (ch == ')') { - if (!section->str) { - break; - } - int sec = strtol(section->str, NULL, 10); - if (sec < 0 || sec > 9) { - parser_fatal(p, "Expected section between 0 and 
9"); - break; - } - str_free(section); - return sec; - } else { - parser_fatal(p, "Expected digit or )"); - break; - } - }; - parser_fatal(p, "Expected manual section"); - return -1; -} - -static str_t *parse_extra(struct parser *p) { - str_t *extra = str_create(); - int ret = str_append_ch(extra, '"'); - assert(ret != -1); - uint32_t ch; - while ((ch = parser_getch(p)) != UTF8_INVALID) { - if (ch == '"') { - ret = str_append_ch(extra, ch); - assert(ret != -1); - return extra; - } else if (ch == '\n') { - parser_fatal(p, "Unclosed extra preamble field"); - break; - } else { - ret = str_append_ch(extra, ch); - assert(ret != -1); - } - } - str_free(extra); - return NULL; -} - -static void parse_preamble(struct parser *p) { - str_t *name = str_create(); - int ex = 0; - str_t *extras[2] = { NULL }; - int section = -1; - uint32_t ch; - time_t date_time; - char date[256]; - char *source_date_epoch = getenv("SOURCE_DATE_EPOCH"); - if (source_date_epoch != NULL) { - unsigned long long epoch; - char *endptr; - errno = 0; - epoch = strtoull(source_date_epoch, &endptr, 10); - if ((errno == ERANGE && (epoch == ULLONG_MAX || epoch == 0)) - || (errno != 0 && epoch == 0)) { - fprintf(stderr, "$SOURCE_DATE_EPOCH: strtoull: %s\n", - strerror(errno)); - exit(EXIT_FAILURE); - } - if (endptr == source_date_epoch) { - fprintf(stderr, "$SOURCE_DATE_EPOCH: No digits were found: %s\n", - endptr); - exit(EXIT_FAILURE); - } - if (*endptr != '\0') { - fprintf(stderr, "$SOURCE_DATE_EPOCH: Trailing garbage: %s\n", - endptr); - exit(EXIT_FAILURE); - } - if (epoch > ULONG_MAX) { - fprintf(stderr, "$SOURCE_DATE_EPOCH: value must be smaller than or " - "equal to %lu but was found to be: %llu \n", - ULONG_MAX, epoch); - exit(EXIT_FAILURE); - } - date_time = epoch; - } else { - date_time = time(NULL); - } - struct tm *date_tm = gmtime(&date_time); - strftime(date, sizeof(date), "%F", date_tm); - while ((ch = parser_getch(p)) != UTF8_INVALID) { - if ((ch < 0x80 && isalnum(ch)) || ch == '_' || ch == 
'-' || ch == '.') { - int ret = str_append_ch(name, ch); - assert(ret != -1); - } else if (ch == '(') { - section = parse_section(p); - } else if (ch == '"') { - if (ex == 2) { - parser_fatal(p, "Too many extra preamble fields"); - } - extras[ex++] = parse_extra(p); - } else if (ch == '\n') { - if (name->len == 0) { - parser_fatal(p, "Expected preamble"); - } - if (section == -1) { - parser_fatal(p, "Expected manual section"); - } - char sec[2] = { '0' + section, 0 }; - char *ex2 = extras[0] != NULL ? extras[0]->str : NULL; - char *ex3 = extras[1] != NULL ? extras[1]->str : NULL; - fprintf(p->output, ".TH \"%s\" \"%s\" \"%s\"", name->str, sec, date); - /* ex2 and ex3 are already double-quoted */ - if (ex2) { - fprintf(p->output, " %s", ex2); - } - if (ex3) { - fprintf(p->output, " %s", ex3); - } - fprintf(p->output, "\n"); - break; - } - } - str_free(name); - for (int i = 0; i < 2; ++i) { - if (extras[i] != NULL) { - str_free(extras[i]); - } - } -} - -static void parse_format(struct parser *p, enum formatting fmt) { - char formats[FORMAT_LAST] = { - [FORMAT_BOLD] = 'B', - [FORMAT_UNDERLINE] = 'I', - }; - char error[512]; - if (p->flags) { - if ((p->flags & ~fmt)) { - snprintf(error, sizeof(error), "Cannot nest inline formatting " - "(began with %c at %d:%d)", - p->flags == FORMAT_BOLD ? 
'*' : '_', - p->fmt_line, p->fmt_col); - parser_fatal(p, error); - } - fprintf(p->output, "\\fR"); - } else { - fprintf(p->output, "\\f%c", formats[fmt]); - p->fmt_line = p->line; - p->fmt_col = p->col; - } - p->flags ^= fmt; -} - -static void parse_linebreak(struct parser *p) { - uint32_t plus = parser_getch(p); - if (plus != '+') { - fprintf(p->output, "+"); - parser_pushch(p, plus); - return; - } - uint32_t lf = parser_getch(p); - if (lf != '\n') { - fprintf(p->output, "+"); - parser_pushch(p, plus); - parser_pushch(p, '\n'); - return; - } - uint32_t ch = parser_getch(p); - if (ch == '\n') { - parser_fatal( - p, "Explicit line breaks cannot be followed by a blank line"); - } - parser_pushch(p, ch); - fprintf(p->output, "\n.br\n"); -} - -static void parse_text(struct parser *p) { - uint32_t ch, next, last = ' '; - int i = 0; - while ((ch = parser_getch(p)) != UTF8_INVALID) { - switch (ch) { - case '\\': - ch = parser_getch(p); - if (ch == UTF8_INVALID) { - parser_fatal(p, "Unexpected EOF"); - } else if (ch == '\\') { - fprintf(p->output, "\\\\"); - } else { - utf8_fputch(p->output, ch); - } - break; - case '*': - parse_format(p, FORMAT_BOLD); - break; - case '_': - next = parser_getch(p); - if (!isalnum(last) || ((p->flags & FORMAT_UNDERLINE) && !isalnum(next))) { - parse_format(p, FORMAT_UNDERLINE); - } else { - utf8_fputch(p->output, ch); - } - if (next == UTF8_INVALID) { - return; - } - parser_pushch(p, next); - break; - case '+': - parse_linebreak(p); - break; - case '\n': - utf8_fputch(p->output, ch); - return; - case '.': - if (!i) { - // Escape . 
if it's the first character - fprintf(p->output, "\\&."); - break; - } - /* fallthrough */ - default: - last = ch; - utf8_fputch(p->output, ch); - break; - } - ++i; - } -} - -static void parse_heading(struct parser *p) { - uint32_t ch; - int level = 1; - while ((ch = parser_getch(p)) != UTF8_INVALID) { - if (ch == '#') { - ++level; - } else if (ch == ' ') { - break; - } else { - parser_fatal(p, "Invalid start of heading (probably needs a space)"); - } - } - switch (level) { - case 1: - fprintf(p->output, ".SH "); - break; - case 2: - fprintf(p->output, ".SS "); - break; - default: - parser_fatal(p, "Only headings up to two levels deep are permitted"); - break; - } - while ((ch = parser_getch(p)) != UTF8_INVALID) { - utf8_fputch(p->output, ch); - if (ch == '\n') { - break; - } - } -} - -static int parse_indent(struct parser *p, int *indent, bool write) { - int i = 0; - uint32_t ch; - while ((ch = parser_getch(p)) == '\t') { - ++i; - } - parser_pushch(p, ch); - if (ch == '\n' && *indent != 0) { - // Don't change indent when we encounter empty lines - return *indent; - } - if (write) { - if (i < *indent) { - for (int j = *indent; i < j; --j) { - roff_macro(p, "RE", NULL); - } - } else if (i == *indent + 1) { - fprintf(p->output, ".RS 4\n"); - } else if (i != *indent && ch == '\t') { - parser_fatal(p, "Indented by an amount greater than 1"); - } - } - *indent = i; - return i; -} - -static void list_header(struct parser *p, int *num) { - fprintf(p->output, ".RS 4\n"); - fprintf(p->output, ".ie n \\{\\\n"); - if (*num == -1) { - fprintf(p->output, "\\h'-0%d'%s\\h'+03'\\c\n", - *num >= 10 ? 5 : 4, "\\(bu"); - } else { - fprintf(p->output, "\\h'-0%d'%d.\\h'+03'\\c\n", - *num >= 10 ? 5 : 4, *num); - } - fprintf(p->output, ".\\}\n"); - fprintf(p->output, ".el \\{\\\n"); - if (*num == -1) { - fprintf(p->output, ".IP %s 4\n", "\\(bu"); - } else { - fprintf(p->output, ".IP %d. 
4\n", *num); - *num = *num + 1; - } - fprintf(p->output, ".\\}\n"); -} - -static void parse_list(struct parser *p, int *indent, int num) { - uint32_t ch; - if ((ch = parser_getch(p)) != ' ') { - parser_fatal(p, "Expected space before start of list entry"); - } - list_header(p, &num); - parse_text(p); - bool closed = false; - do { - parse_indent(p, indent, true); - if ((ch = parser_getch(p)) == UTF8_INVALID) { - break; - } - switch (ch) { - case ' ': - if ((ch = parser_getch(p)) != ' ') { - parser_fatal(p, "Expected two spaces for list entry continuation"); - } - parse_text(p); - break; - case '-': - case '.': - if ((ch = parser_getch(p)) != ' ') { - parser_fatal(p, "Expected space before start of list entry"); - } - if (!closed) { - roff_macro(p, "RE", NULL); - } - list_header(p, &num); - parse_text(p); - closed = false; - break; - default: - fprintf(p->output, "\n"); - parser_pushch(p, ch); - goto ret; - } - } while (ch != UTF8_INVALID); -ret: - if (!closed) { - roff_macro(p, "RE", NULL); - } -} - -static void parse_literal(struct parser *p, int *indent) { - uint32_t ch; - if ((ch = parser_getch(p)) != '`' || - (ch = parser_getch(p)) != '`' || - (ch = parser_getch(p)) != '\n') { - parser_fatal(p, "Expected ``` and a newline to begin literal block"); - } - int stops = 0; - roff_macro(p, "nf", NULL); - fprintf(p->output, ".RS 4\n"); - do { - int _indent = *indent; - parse_indent(p, &_indent, false); - if (_indent < *indent) { - parser_fatal(p, "Cannot deindent in literal block"); - } - while (_indent > *indent) { - --_indent; - fprintf(p->output, "\t"); - } - if ((ch = parser_getch(p)) == UTF8_INVALID) { - break; - } - if (ch == '`') { - if (++stops == 3) { - if ((ch = parser_getch(p)) != '\n') { - parser_fatal(p, "Expected literal block to end with newline"); - } - roff_macro(p, "fi", NULL); - roff_macro(p, "RE", NULL); - return; - } - } else { - while (stops != 0) { - fputc('`', p->output); - --stops; - } - switch (ch) { - case '.': - fprintf(p->output, "\\&."); - 
break; - case '\\': - ch = parser_getch(p); - if (ch == UTF8_INVALID) { - parser_fatal(p, "Unexpected EOF"); - } else if (ch == '\\') { - fprintf(p->output, "\\\\"); - } else { - utf8_fputch(p->output, ch); - } - break; - default: - utf8_fputch(p->output, ch); - break; - } - } - } while (ch != UTF8_INVALID); -} - -enum table_align { - ALIGN_LEFT, - ALIGN_CENTER, - ALIGN_RIGHT, -}; - -struct table_row { - struct table_cell *cell; - struct table_row *next; -}; - -struct table_cell { - enum table_align align; - str_t *contents; - struct table_cell *next; -}; - -static void parse_table(struct parser *p, uint32_t style) { - struct table_row *table = NULL; - struct table_row *currow = NULL, *prevrow = NULL; - struct table_cell *curcell = NULL; - int column = 0; - uint32_t ch; - parser_pushch(p, '|'); - - do { - if ((ch = parser_getch(p)) == UTF8_INVALID) { - break; - } - switch (ch) { - case '\n': - goto commit_table; - case '|': - prevrow = currow; - currow = calloc(1, sizeof(struct table_row)); - if (prevrow) { - // TODO: Verify the number of columns match - prevrow->next = currow; - } - curcell = calloc(1, sizeof(struct table_cell)); - currow->cell = curcell; - column = 0; - if (!table) { - table = currow; - } - break; - case ':': - if (!currow) { - parser_fatal(p, "Cannot start a column without " - "starting a row first"); - } else { - struct table_cell *prev = curcell; - curcell = calloc(1, sizeof(struct table_cell)); - if (prev) { - prev->next = curcell; - } - ++column; - } - break; - case ' ': - goto continue_cell; - default: - parser_fatal(p, "Expected either '|' or ':'"); - break; - } - if ((ch = parser_getch(p)) == UTF8_INVALID) { - break; - } - switch (ch) { - case '[': - curcell->align = ALIGN_LEFT; - break; - case '-': - curcell->align = ALIGN_CENTER; - break; - case ']': - curcell->align = ALIGN_RIGHT; - break; - case ' ': - if (prevrow) { - struct table_cell *pcell = prevrow->cell; - for (int i = 0; i <= column && pcell; ++i, pcell = pcell->next) { - if (i 
== column) { - curcell->align = pcell->align; - break; - } - } - } else { - parser_fatal(p, "No previous row to infer alignment from"); - } - break; - default: - parser_fatal(p, "Expected one of '[', '-', ']', or ' '"); - break; - } - curcell->contents = str_create(); -continue_cell: - switch (ch = parser_getch(p)) { - case ' ': - // Read out remainder of the text - while ((ch = parser_getch(p)) != UTF8_INVALID) { - switch (ch) { - case '\n': - goto commit_cell; - default:; - int ret = str_append_ch(curcell->contents, ch); - assert(ret != -1); - break; - } - } - break; - case '\n': - goto commit_cell; - default: - parser_fatal(p, "Expected ' ' or a newline"); - break; - } -commit_cell: - if (strstr(curcell->contents->str, "T{") - || strstr(curcell->contents->str, "T}")) { - parser_fatal(p, "Cells cannot contain T{ or T} " - "due to roff limitations"); - } - } while (ch != UTF8_INVALID); -commit_table: - - if (ch == UTF8_INVALID) { - return; - } - - roff_macro(p, "TS", NULL); - - switch (style) { - case '[': - fprintf(p->output, "allbox;"); - break; - case ']': - fprintf(p->output, "box;"); - break; - } - - // Print alignments first - currow = table; - while (currow) { - curcell = currow->cell; - while (curcell) { - fprintf(p->output, "%c%s", "lcr"[curcell->align], - curcell->next ? " " : ""); - curcell = curcell->next; - } - fprintf(p->output, "%s\n", currow->next ? 
"" : "."); - currow = currow->next; - } - - // Then contents - currow = table; - while (currow) { - curcell = currow->cell; - fprintf(p->output, "T{\n"); - while (curcell) { - parser_pushstr(p, curcell->contents->str); - parse_text(p); - if (curcell->next) { - fprintf(p->output, "\nT}\tT{\n"); - } else { - fprintf(p->output, "\nT}"); - } - struct table_cell *prev = curcell; - curcell = curcell->next; - str_free(prev->contents); - free(prev); - } - fprintf(p->output, "\n"); - struct table_row *prev = currow; - currow = currow->next; - free(prev); - } - - roff_macro(p, "TE", NULL); - fprintf(p->output, ".sp 1\n"); -} - -static void parse_document(struct parser *p) { - uint32_t ch; - int indent = 0; - do { - parse_indent(p, &indent, true); - if ((ch = parser_getch(p)) == UTF8_INVALID) { - break; - } - switch (ch) { - case ';': - if ((ch = parser_getch(p)) != ' ') { - parser_fatal(p, "Expected space after ; to begin comment"); - } - do { - ch = parser_getch(p); - } while (ch != UTF8_INVALID && ch != '\n'); - break; - case '#': - if (indent != 0) { - parser_pushch(p, ch); - parse_text(p); - break; - } - parse_heading(p); - break; - case '-': - parse_list(p, &indent, -1); - break; - case '.': - if ((ch = parser_getch(p)) == ' ') { - parser_pushch(p, ch); - parse_list(p, &indent, 1); - } else { - parser_pushch(p, ch); - parse_text(p); - } - break; - case '`': - parse_literal(p, &indent); - break; - case '[': - case '|': - case ']': - if (indent != 0) { - parser_fatal(p, "Tables cannot be indented"); - } - parse_table(p, ch); - break; - case ' ': - parser_fatal(p, "Tabs are required for indentation"); - break; - case '\n': - if (p->flags) { - char error[512]; - snprintf(error, sizeof(error), "Expected %c before starting " - "new paragraph (began with %c at %d:%d)", - p->flags == FORMAT_BOLD ? '*' : '_', - p->flags == FORMAT_BOLD ? 
'*' : '_', - p->fmt_line, p->fmt_col); - parser_fatal(p, error); - } - roff_macro(p, "P", NULL); - break; - default: - parser_pushch(p, ch); - parse_text(p); - break; - } - } while (ch != UTF8_INVALID); -} - -static void output_scdoc_preamble(struct parser *p) { - fprintf(p->output, ".\\\" Generated by scdoc " VERSION "\n"); - // Fix weird quotation marks - // http://bugs.debian.org/507673 - // http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html - fprintf(p->output, ".ie \\n(.g .ds Aq \\(aq\n"); - fprintf(p->output, ".el .ds Aq '\n"); - // Disable hyphenation: - roff_macro(p, "nh", NULL); - // Disable justification: - roff_macro(p, "ad l", NULL); - fprintf(p->output, ".\\\" Begin generated content:\n"); -} - -int main(int argc, char **argv) { - if (argc == 2 && strcmp(argv[1], "-v") == 0) { - printf("scdoc " VERSION "\n"); - return 0; - } else if (argc > 1) { - fprintf(stderr, "Usage: scdoc < input.scd > output.roff\n"); - return 1; - } - struct parser p = { - .input = stdin, - .output = stdout, - .line = 1, - .col = 1 - }; - output_scdoc_preamble(&p); - parse_preamble(&p); - parse_document(&p); - return 0; -} diff --git a/src/string.c b/src/string.c @@ -1,45 +0,0 @@ -#include <stdlib.h> -#include <stdint.h> -#include "str.h" -#include "unicode.h" - -static int ensure_capacity(str_t *str, size_t len) { - if (len + 1 >= str->size) { - char *new = realloc(str->str, str->size * 2); - if (!new) { - return 0; - } - str->str = new; - str->size *= 2; - } - return 1; -} - -str_t *str_create() { - str_t *str = calloc(sizeof(str_t), 1); - str->str = malloc(16); - str->size = 16; - str->len = 0; - str->str[0] = '\0'; - return str; -} - -void str_free(str_t *str) { - if (!str) return; - free(str->str); - free(str); -} - -int str_append_ch(str_t *str, uint32_t ch) { - int size = utf8_chsize(ch); - if (size <= 0) { - return -1; - } - if (!ensure_capacity(str, str->len + size)) { - return -1; - } - utf8_encode(&str->str[str->len], ch); - str->len += size; - 
str->str[str->len] = '\0'; - return size; -} diff --git a/src/utf8_chsize.c b/src/utf8_chsize.c @@ -1,14 +0,0 @@ -#include <stdint.h> -#include <stddef.h> -#include "unicode.h" - -size_t utf8_chsize(uint32_t ch) { - if (ch < 0x80) { - return 1; - } else if (ch < 0x800) { - return 2; - } else if (ch < 0x10000) { - return 3; - } - return 4; -} diff --git a/src/utf8_decode.c b/src/utf8_decode.c @@ -1,38 +0,0 @@ -#include <stdint.h> -#include <stddef.h> -#include "unicode.h" - -uint8_t masks[] = { - 0x7F, - 0x1F, - 0x0F, - 0x07, - 0x03, - 0x01 -}; - -uint32_t utf8_decode(const char **char_str) { - uint8_t **s = (uint8_t **)char_str; - - uint32_t cp = 0; - if (**s < 128) { - // shortcut - cp = **s; - ++*s; - return cp; - } - int size = utf8_size((char *)*s); - if (size == -1) { - ++*s; - return UTF8_INVALID; - } - uint8_t mask = masks[size - 1]; - cp = **s & mask; - ++*s; - while (--size) { - cp <<= 6; - cp |= **s & 0x3f; - ++*s; - } - return cp; -} diff --git a/src/utf8_encode.c b/src/utf8_encode.c @@ -1,30 +0,0 @@ -#include <stdint.h> -#include <stddef.h> -#include "unicode.h" - -size_t utf8_encode(char *str, uint32_t ch) { - size_t len = 0; - uint8_t first; - - if (ch < 0x80) { - first = 0; - len = 1; - } else if (ch < 0x800) { - first = 0xc0; - len = 2; - } else if (ch < 0x10000) { - first = 0xe0; - len = 3; - } else { - first = 0xf0; - len = 4; - } - - for (size_t i = len - 1; i > 0; --i) { - str[i] = (ch & 0x3f) | 0x80; - ch >>= 6; - } - - str[0] = ch | first; - return len; -} diff --git a/src/utf8_fgetch.c b/src/utf8_fgetch.c @@ -1,27 +0,0 @@ -#include <stdint.h> -#include <stdio.h> -#include "unicode.h" - -uint32_t utf8_fgetch(FILE *f) { - char buffer[UTF8_MAX_SIZE]; - int c = fgetc(f); - if (c == EOF) { - return UTF8_INVALID; - } - buffer[0] = (char)c; - int size = utf8_size(buffer); - - if (size > UTF8_MAX_SIZE) { - fseek(f, size - 1, SEEK_CUR); - return UTF8_INVALID; - } - - if (size > 1) { - int amt = fread(&buffer[1], 1, size - 1, f); - if (amt != size - 1) 
{ - return UTF8_INVALID; - } - } - const char *ptr = buffer; - return utf8_decode(&ptr); -} diff --git a/src/utf8_fputch.c b/src/utf8_fputch.c @@ -1,10 +0,0 @@ -#include <stdint.h> -#include <stdio.h> -#include "unicode.h" - -size_t utf8_fputch(FILE *f, uint32_t ch) { - char buffer[UTF8_MAX_SIZE]; - char *ptr = buffer; - size_t size = utf8_encode(ptr, ch); - return fwrite(&buffer, 1, size, f); -} diff --git a/src/utf8_size.c b/src/utf8_size.c @@ -1,27 +0,0 @@ -#include <stdint.h> -#include <stddef.h> -#include "unicode.h" - -struct { - uint8_t mask; - uint8_t result; - int octets; -} sizes[] = { - { 0x80, 0x00, 1 }, - { 0xE0, 0xC0, 2 }, - { 0xF0, 0xE0, 3 }, - { 0xF8, 0xF0, 4 }, - { 0xFC, 0xF8, 5 }, - { 0xFE, 0xF8, 6 }, - { 0x80, 0x80, -1 }, -}; - -int utf8_size(const char *s) { - uint8_t c = (uint8_t)*s; - for (size_t i = 0; i < sizeof(sizes) / 2; ++i) { - if ((c & sizes[i].mask) == sizes[i].result) { - return sizes[i].octets; - } - } - return -1; -} diff --git a/src/util.c b/src/util.c @@ -1,71 +0,0 @@ -#include <stdarg.h> -#include <stdlib.h> -#include <stdint.h> -#include <stdio.h> -#include "unicode.h" -#include "util.h" - -void parser_fatal(struct parser *parser, const char *err) { - fprintf(stderr, "Error at %d:%d: %s\n", - parser->line, parser->col, err); - fclose(parser->input); - fclose(parser->output); - exit(1); -} - -uint32_t parser_getch(struct parser *parser) { - if (parser->qhead) { - return parser->queue[--parser->qhead]; - } - if (parser->str) { - uint32_t ch = utf8_decode(&parser->str); - if (!ch || ch == UTF8_INVALID) { - parser->str = NULL; - return UTF8_INVALID; - } - return ch; - } - uint32_t ch = utf8_fgetch(parser->input); - if (ch == '\n') { - parser->col = 0; - ++parser->line; - } else { - ++parser->col; - } - return ch; -} - -void parser_pushch(struct parser *parser, uint32_t ch) { - if (ch != UTF8_INVALID) { - parser->queue[parser->qhead++] = ch; - } -} - -void parser_pushstr(struct parser *parser, const char *str) { - parser->str = str; 
-} - -int roff_macro(struct parser *p, char *cmd, ...) { - FILE *f = p->output; - int l = fprintf(f, ".%s", cmd); - va_list ap; - va_start(ap, cmd); - const char *arg; - while ((arg = va_arg(ap, const char *))) { - fputc(' ', f); - fputc('"', f); - while (*arg) { - uint32_t ch = utf8_decode(&arg); - if (ch == '"') { - fputc('\\', f); - ++l; - } - l += utf8_fputch(f, ch); - } - fputc('"', f); - l += 3; - } - va_end(ap); - fputc('\n', f); - return l + 1; -} diff --git a/include/str.h b/str.h diff --git a/string.c b/string.c @@ -0,0 +1,46 @@ +#include <stdlib.h> +#include <stdint.h> + +#include "str.h" +#include "utf8.h" + +static int ensure_capacity(str_t *str, size_t len) { + if (len + 1 >= str->size) { + char *new = realloc(str->str, str->size * 2); + if (!new) { + return 0; + } + str->str = new; + str->size *= 2; + } + return 1; +} + +str_t *str_create() { + str_t *str = calloc(sizeof(str_t), 1); + str->str = malloc(16); + str->size = 16; + str->len = 0; + str->str[0] = '\0'; + return str; +} + +void str_free(str_t *str) { + if (!str) return; + free(str->str); + free(str); +} + +int str_append_ch(str_t *str, uint32_t ch) { + int size = utf8_chsize(ch); + if (size <= 0) { + return -1; + } + if (!ensure_capacity(str, str->len + size)) { + return -1; + } + utf8_encode(&str->str[str->len], ch); + str->len += size; + str->str[str->len] = '\0'; + return size; +} diff --git a/utf8.c b/utf8.c @@ -0,0 +1,132 @@ +#include <stdint.h> +#include <stddef.h> + +#include "utf8.h" + +size_t utf8_chsize(uint32_t ch) { + if (ch < 0x80) { + return 1; + } else if (ch < 0x800) { + return 2; + } else if (ch < 0x10000) { + return 3; + } + return 4; +} + +uint8_t masks[] = { + 0x7F, + 0x1F, + 0x0F, + 0x07, + 0x03, + 0x01 +}; + +uint32_t utf8_decode(const char **char_str) { + uint8_t **s = (uint8_t **)char_str; + + uint32_t cp = 0; + if (**s < 128) { + // shortcut + cp = **s; + ++*s; + return cp; + } + int size = utf8_size((char *)*s); + if (size == -1) { + ++*s; + return UTF8_INVALID; + } 
+ uint8_t mask = masks[size - 1]; + cp = **s & mask; + ++*s; + while (--size) { + cp <<= 6; + cp |= **s & 0x3f; + ++*s; + } + return cp; +} + +size_t utf8_encode(char *str, uint32_t ch) { + size_t len = 0; + uint8_t first; + + if (ch < 0x80) { + first = 0; + len = 1; + } else if (ch < 0x800) { + first = 0xc0; + len = 2; + } else if (ch < 0x10000) { + first = 0xe0; + len = 3; + } else { + first = 0xf0; + len = 4; + } + + for (size_t i = len - 1; i > 0; --i) { + str[i] = (ch & 0x3f) | 0x80; + ch >>= 6; + } + + str[0] = ch | first; + return len; +} + +uint32_t utf8_fgetch(FILE *f) { + char buffer[UTF8_MAX_SIZE]; + int c = fgetc(f); + if (c == EOF) { + return UTF8_INVALID; + } + buffer[0] = (char)c; + int size = utf8_size(buffer); + + if (size > UTF8_MAX_SIZE) { + fseek(f, size - 1, SEEK_CUR); + return UTF8_INVALID; + } + + if (size > 1) { + int amt = fread(&buffer[1], 1, size - 1, f); + if (amt != size - 1) { + return UTF8_INVALID; + } + } + const char *ptr = buffer; + return utf8_decode(&ptr); +} + +size_t utf8_fputch(FILE *f, uint32_t ch) { + char buffer[UTF8_MAX_SIZE]; + char *ptr = buffer; + size_t size = utf8_encode(ptr, ch); + return fwrite(&buffer, 1, size, f); +} + +struct { + uint8_t mask; + uint8_t result; + int octets; +} sizes[] = { + { 0x80, 0x00, 1 }, + { 0xE0, 0xC0, 2 }, + { 0xF0, 0xE0, 3 }, + { 0xF8, 0xF0, 4 }, + { 0xFC, 0xF8, 5 }, + { 0xFE, 0xF8, 6 }, + { 0x80, 0x80, -1 }, +}; + +int utf8_size(const char *s) { + uint8_t c = (uint8_t)*s; + for (size_t i = 0; i < sizeof(sizes) / 2; ++i) { + if ((c & sizes[i].mask) == sizes[i].result) { + return sizes[i].octets; + } + } + return -1; +} diff --git a/include/unicode.h b/utf8.h diff --git a/util.c b/util.c @@ -0,0 +1,72 @@ +#include <stdarg.h> +#include <stdlib.h> +#include <stdint.h> +#include <stdio.h> + +#include "utf8.h" +#include "util.h" + +void parser_fatal(struct parser *parser, const char *err) { + fprintf(stderr, "Error at %d:%d: %s\n", + parser->line, parser->col, err); + fclose(parser->input); + 
fclose(parser->output); + exit(1); +} + +uint32_t parser_getch(struct parser *parser) { + if (parser->qhead) { + return parser->queue[--parser->qhead]; + } + if (parser->str) { + uint32_t ch = utf8_decode(&parser->str); + if (!ch || ch == UTF8_INVALID) { + parser->str = NULL; + return UTF8_INVALID; + } + return ch; + } + uint32_t ch = utf8_fgetch(parser->input); + if (ch == '\n') { + parser->col = 0; + ++parser->line; + } else { + ++parser->col; + } + return ch; +} + +void parser_pushch(struct parser *parser, uint32_t ch) { + if (ch != UTF8_INVALID) { + parser->queue[parser->qhead++] = ch; + } +} + +void parser_pushstr(struct parser *parser, const char *str) { + parser->str = str; +} + +int roff_macro(struct parser *p, char *cmd, ...) { + FILE *f = p->output; + int l = fprintf(f, ".%s", cmd); + va_list ap; + va_start(ap, cmd); + const char *arg; + while ((arg = va_arg(ap, const char *))) { + fputc(' ', f); + fputc('"', f); + while (*arg) { + uint32_t ch = utf8_decode(&arg); + if (ch == '"') { + fputc('\\', f); + ++l; + } + l += utf8_fputch(f, ch); + } + fputc('"', f); + l += 3; + } + va_end(ap); + fputc('\n', f); + return l + 1; +} diff --git a/include/util.h b/util.h