scdoc2mdoc

A fork of scdoc to output mdoc(7)
git clone git://git.sgregoratto.me/scdoc2mdoc
Log | Files | Refs | README | LICENSE

commit a4193949ec755b3848a803a9b02364eeddbb1455
Author: Drew DeVault <sir@cmpwn.com>
Date:   Sat,  9 Dec 2017 23:18:57 -0500

Initial commit

Diffstat:
A .gitignore | 1 +
A include/string.h | 17 +++++++++++++++++
A include/unicode.h | 43 +++++++++++++++++++++++++++++++++++++++++++
A include/util.h | 16 ++++++++++++++++
A meson.build | 29 +++++++++++++++++++++++++++++
A scdoc.5.scd | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/main.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/string.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/utf8_chsize.c | 14 ++++++++++++++
A src/utf8_decode.c | 38 ++++++++++++++++++++++++++++++++++++++
A src/utf8_encode.c | 30 ++++++++++++++++++++++++++++++
A src/utf8_fgetch.c | 21 +++++++++++++++++++++
A src/utf8_fputch.c | 10 ++++++++++
A src/utf8_size.c | 27 +++++++++++++++++++++++++++
A src/util.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
15 files changed, 526 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+build
diff --git a/include/string.h b/include/string.h
@@ -0,0 +1,17 @@
+#ifndef _SCDOC_STRING_H
+#define _SCDOC_STRING_H
+#include <stdint.h>
+
+struct str {
+	char *str;
+	size_t len, size;
+};
+
+typedef struct str str_t;
+
+str_t *str_create();
+void str_free(str_t *str);
+void str_reset(str_t *str);
+int str_append_ch(str_t *str, uint32_t ch);
+
+#endif
diff --git a/include/unicode.h b/include/unicode.h
@@ -0,0 +1,43 @@
+#ifndef _SCDOC_UNICODE_H
+#define _SCDOC_UNICODE_H
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// Technically UTF-8 supports up to 6 byte codepoints, but Unicode itself
+// doesn't really bother with more than 4.
+#define UTF8_MAX_SIZE 4
+
+#define UTF8_INVALID 0x80
+
+/**
+ * Grabs the next UTF-8 character and advances the string pointer
+ */
+uint32_t utf8_decode(const char **str);
+
+/**
+ * Encodes a character as UTF-8 and returns the length of that character.
+ */
+size_t utf8_encode(char *str, uint32_t ch);
+
+/**
+ * Returns the size of the next UTF-8 character
+ */
+int utf8_size(const char *str);
+
+/**
+ * Returns the size of a UTF-8 character
+ */
+size_t utf8_chsize(uint32_t ch);
+
+/**
+ * Reads and returns the next character from the file.
+ */
+uint32_t utf8_fgetch(FILE *f);
+
+/**
+ * Writes this character to the file and returns the number of bytes written.
+ */
+size_t utf8_fputch(FILE *f, uint32_t ch);
+
+#endif
diff --git a/include/util.h b/include/util.h
@@ -0,0 +1,16 @@
+#ifndef _SCDOC_PARSER_H
+#define _SCDOC_PARSER_H
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+
+struct parser {
+	FILE *input, *output;
+	int line, col;
+};
+
+void parser_fatal(struct parser *parser, const char *err);
+uint32_t parser_getch(struct parser *parser);
+int roff_macro(struct parser *p, char *cmd, ...);
+
+#endif
diff --git a/meson.build b/meson.build
@@ -0,0 +1,29 @@
+# TODO: Just use a makefile
+project(
+	'scdoc',
+	'c',
+	license: 'MIT',
+	meson_version: '>=0.43.0',
+	default_options: [
+		'c_std=c99',
+		'warning_level=2',
+		'werror=true',
+	],
+)
+
+add_project_arguments('-Wno-unused-parameter', language: 'c')
+
+executable(
+	'scdoc', [
+		'src/main.c',
+		'src/string.c',
+		'src/utf8_chsize.c',
+		'src/utf8_decode.c',
+		'src/utf8_encode.c',
+		'src/utf8_fgetch.c',
+		'src/utf8_fputch.c',
+		'src/utf8_size.c',
+		'src/util.c',
+	],
+	include_directories: include_directories('include')
+)
diff --git a/scdoc.5.scd b/scdoc.5.scd
@@ -0,0 +1,80 @@
+scdoc(5)
+
+# NAME
+
+scdoc - syntax description for scdoc markup language
+
+# DESCRIPTION
+
+scdoc is a tool designed to make the process of writing man pages more
+friendly. It converts scdoc files into roff macros, which can then be converted
+to man pages or a number of other formats. The syntax is inspired by, but not
+directly taken from, markdown. Input files *must* use the UTF-8 encoding.
+
+# PREAMBLE
+
+Each scdoc file must begin with the following preamble:
+
+	*name*(_section_)
+
+The *name* is the name of the man page you are writing, and _section_ is the
+section you're writing for (see *man*(1) for information on manual sections).
+
+# SECTION HEADERS
+
+Each section of your man page should begin with something similar to the
+following:
+
+	# HEADER NAME
+
+Subsection headers are also understood - use two hashes. Each header must have
+an empty line on either side.
+
+# PARAGRAPHS
+
+Begin a new paragraph with an empty line.
+
+# FORMATTING
+
+Text can be made *bold* or _underlined_ with asterisks and underscores: \*bold\*
+or \_underlined\_.
+
+# INDENTATION
+
+You may indent lines with tab characters ("\t") to indent them by 4 spaces in
+the output. Indented lines may not contain headers.
+
+# LISTS
+
+You may start bulleted lists with dashes, like so:
+
+```
+- Item 1
+- Item 2
+- Item 3
+```
+
+You may also use numbered lists like so:
+
+```
+1. Item 1
+2. Item 2
+3. Item 3
+```
+
+# LITERAL TEXT
+
+You may turn off scdoc formatting and output literal text with escape codes and
+literal blocks. Inserting a \\ into your source will cause the subsequent symbol
+to be treated as a literal and copied directly to the output. You may also make
+blocks of literal syntax like so:
+
+```
+\`\`\`
+_This formatting_ will *not* be interpreted by scdoc.
+\`\`\`
+```
+
+These blocks will be indented one level. Note that literal text is shown
+literally in the man viewer - that is, it's not a means for inserting your own
+roff macros into the output.
diff --git a/src/main.c b/src/main.c
@@ -0,0 +1,95 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include "string.h"
+#include "unicode.h"
+#include "util.h"
+
+char date[256];
+
+static int parse_section(struct parser *p) {
+	str_t *section = str_create();
+	uint32_t ch;
+	while ((ch = parser_getch(p)) != UTF8_INVALID) {
+		if (isdigit(ch)) {
+			assert(str_append_ch(section, ch) != -1);
+		} else if (ch == ')') {
+			if (!section->str) {
+				break;
+			}
+			int sec = strtol(section->str, NULL, 10);
+			if (sec < 1 || sec > 9) {
+				parser_fatal(p, "Expected section between 1 and 9");
+				break;
+			}
+			str_free(section);
+			return sec;
+		} else {
+			parser_fatal(p, "Expected digit or )");
+			break;
+		}
+	};
+	parser_fatal(p, "Expected manual section");
+	return -1;
+}
+
+static void parse_preamble(struct parser *p) {
+	str_t *name = str_create();
+	int section = -1;
+	uint32_t ch;
+	do {
+		ch = parser_getch(p);
+		if (isalnum(ch)) {
+			assert(str_append_ch(name, ch) != -1);
+		} else if (ch == '(') {
+			section = parse_section(p);
+		} else if (ch == '\n') {
+			if (name->len == 0) {
+				parser_fatal(p, "Expected preamble");
+			}
+			if (section == -1) {
+				parser_fatal(p, "Expected manual section");
+			}
+			char sec[2] = { '0' + section, 0 };
+			roff_macro(p, "TH", name->str, sec, date, NULL);
+			break;
+		}
+	} while (ch != UTF8_INVALID);
+	str_free(name);
+}
+
+static void output_preamble(struct parser *p) {
+	// TODO: Add version here
+	fprintf(p->output, ".\\\" Generated by scdoc\n");
+	fprintf(p->output, ".\\\" Fix weird qutation marks:\n");
+	fprintf(p->output, ".\\\" http://bugs.debian.org/507673\n");
+	fprintf(p->output, ".\\\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html\n");
+	fprintf(p->output, ".ie \\n(.g .ds Aq \\(aq\n");
+	fprintf(p->output, ".el .ds Aq '\n");
+	fprintf(p->output, ".\\\" Disable hyphenation:\n");
+	roff_macro(p, "nh", NULL);
+	fprintf(p->output, ".\\\" Generated content:\n");
+}
+
+int main(int argc, char **argv) {
+	if (argc > 1) {
+		fprintf(stderr, "Usage: scdoc < input.scd > output.roff");
+		return 1;
+	}
+	time_t now;
+	time(&now);
+	struct tm *now_tm = localtime(&now);
+	strftime(date, sizeof(date), "%F", now_tm);
+	struct parser p = {
+		.input = stdin,
+		.output = stdout,
+		.line = 1,
+		.col = 1
+	};
+	output_preamble(&p);
+	parse_preamble(&p);
+	return 0;
+}
diff --git a/src/string.c b/src/string.c
@@ -0,0 +1,55 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include "string.h"
+#include "unicode.h"
+
+static void sanity_check(str_t *str) {
+	if (str->str == NULL) {
+		str->str = malloc(16);
+		str->size = 16;
+		str->len = 0;
+		str->str[0] = '\0';
+	}
+}
+
+static int ensure_capacity(str_t *str, size_t len) {
+	if (len + 1 >= str->size) {
+		char *new = realloc(str->str, str->size * 2);
+		if (!new) {
+			return 0;
+		}
+		str->str = new;
+		str->size *= 2;
+	}
+	return 1;
+}
+
+str_t *str_create() {
+	return calloc(sizeof(str_t), 1);
+}
+
+void str_free(str_t *str) {
+	if (!str) return;
+	free(str->str);
+	free(str);
+}
+
+void str_reset(str_t *str) {
+	str->len = 0;
+	str->str[0] = '\0';
+}
+
+int str_append_ch(str_t *str, uint32_t ch) {
+	int size = utf8_chsize(ch);
+	if (size <= 0) {
+		return -1;
+	}
+	sanity_check(str);
+	if (!ensure_capacity(str, str->len + size)) {
+		return -1;
+	}
+	utf8_encode(&str->str[str->len], ch);
+	str->len += size;
+	str->str[str->len] = '\0';
+	return size;
+}
diff --git a/src/utf8_chsize.c b/src/utf8_chsize.c
@@ -0,0 +1,14 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+size_t utf8_chsize(uint32_t ch) {
+	if (ch < 0x80) {
+		return 1;
+	} else if (ch < 0x800) {
+		return 2;
+	} else if (ch < 0x10000) {
+		return 3;
+	}
+	return 4;
+}
diff --git a/src/utf8_decode.c b/src/utf8_decode.c
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+uint8_t masks[] = {
+	0x7F,
+	0x1F,
+	0x0F,
+	0x07,
+	0x03,
+	0x01
+};
+
+uint32_t utf8_decode(const char **char_str) {
+	uint8_t **s = (uint8_t **)char_str;
+
+	uint32_t cp = 0;
+	if (**s < 128) {
+		// shortcut
+		cp = **s;
+		++*s;
+		return cp;
+	}
+	int size = utf8_size((char *)*s);
+	if (size == -1) {
+		++*s;
+		return UTF8_INVALID;
+	}
+	uint8_t mask = masks[size - 1];
+	cp = **s & mask;
+	++*s;
+	while (--size) {
+		cp <<= 6;
+		cp |= **s & 0x3f;
+		++*s;
+	}
+	return cp;
+}
diff --git a/src/utf8_encode.c b/src/utf8_encode.c
@@ -0,0 +1,30 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+size_t utf8_encode(char *str, uint32_t ch) {
+	size_t len = 0;
+	uint8_t first;
+
+	if (ch < 0x80) {
+		first = 0;
+		len = 1;
+	} else if (ch < 0x800) {
+		first = 0xc0;
+		len = 2;
+	} else if (ch < 0x10000) {
+		first = 0xe0;
+		len = 3;
+	} else {
+		first = 0xf0;
+		len = 4;
+	}
+
+	for (size_t i = len - 1; i > 0; --i) {
+		str[i] = (ch & 0x3f) | 0x80;
+		ch >>= 6;
+	}
+
+	str[0] = ch | first;
+	return len;
+}
diff --git a/src/utf8_fgetch.c b/src/utf8_fgetch.c
@@ -0,0 +1,21 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+
+uint32_t utf8_fgetch(FILE *f) {
+	char buffer[UTF8_MAX_SIZE];
+	int c = fgetc(f);
+	if (c == EOF) {
+		return UTF8_INVALID;
+	}
+	buffer[0] = (char)c;
+	int size = utf8_size(buffer);
+	if (size > 1) {
+		int amt = fread(&buffer[1], 1, size - 1, f);
+		if (amt != size - 1) {
+			return UTF8_INVALID;
+		}
+	}
+	const char *ptr = buffer;
+	return utf8_decode(&ptr);
+}
diff --git a/src/utf8_fputch.c b/src/utf8_fputch.c
@@ -0,0 +1,10 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+
+size_t utf8_fputch(FILE *f, uint32_t ch) {
+	char buffer[UTF8_MAX_SIZE];
+	char *ptr = buffer;
+	size_t size = utf8_encode(ptr, ch);
+	return fwrite(&buffer, 1, size, f);
+}
diff --git a/src/utf8_size.c b/src/utf8_size.c
@@ -0,0 +1,27 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+struct {
+	uint8_t mask;
+	uint8_t result;
+	int octets;
+} sizes[] = {
+	{ 0x80, 0x00, 1 },
+	{ 0xE0, 0xC0, 2 },
+	{ 0xF0, 0xE0, 3 },
+	{ 0xF8, 0xF0, 4 },
+	{ 0xFC, 0xF8, 5 },
+	{ 0xFE, 0xF8, 6 },
+	{ 0x80, 0x80, -1 },
+};
+
+int utf8_size(const char *s) {
+	uint8_t c = (uint8_t)*s;
+	for (size_t i = 0; i < sizeof(sizes) / 2; ++i) {
+		if ((c & sizes[i].mask) == sizes[i].result) {
+			return sizes[i].octets;
+		}
+	}
+	return -1;
+}
diff --git a/src/util.c b/src/util.c
@@ -0,0 +1,50 @@
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+#include "util.h"
+
+void parser_fatal(struct parser *parser, const char *err) {
+	fprintf(stderr, "Error at %d:%d: %s\n",
+			parser->line, parser->col, err);
+	fclose(parser->input);
+	fclose(parser->output);
+	exit(1);
+}
+
+uint32_t parser_getch(struct parser *parser) {
+	uint32_t ch = utf8_fgetch(parser->input);
+	if (ch == '\n') {
+		parser->col = 0;
+		++parser->line;
+	} else {
+		++parser->col;
+	}
+	return ch;
+}
+
+int roff_macro(struct parser *p, char *cmd, ...) {
+	FILE *f = p->output;
+	int l = fprintf(f, ".%s", cmd);
+	va_list ap;
+	va_start(ap, cmd);
+	const char *arg;
+	while ((arg = va_arg(ap, const char *))) {
+		fputc(' ', f);
+		fputc('"', f);
+		while (*arg) {
+			uint32_t ch = utf8_decode(&arg);
+			if (ch == '"') {
+				fputc('\\', f);
+				++l;
+			}
+			l += utf8_fputch(f, ch);
+		}
+		fputc('"', f);
+		l += 3;
+	}
+	va_end(ap);
+	fputc('\n', f);
+	return l + 1;
+}