commit a4193949ec755b3848a803a9b02364eeddbb1455
Author: Drew DeVault <sir@cmpwn.com>
Date: Sat, 9 Dec 2017 23:18:57 -0500
Initial commit
Diffstat:
15 files changed, 526 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+build
diff --git a/include/string.h b/include/string.h
@@ -0,0 +1,17 @@
+#ifndef _SCDOC_STRING_H
+#define _SCDOC_STRING_H
+#include <stdint.h>
+
+struct str { // Growable, NUL-terminated UTF-8 byte buffer
+ char *str; // heap buffer; NULL until the first append allocates it
+ size_t len, size; // len: bytes in use, not counting the NUL; size: bytes allocated
+};
+
+typedef struct str str_t;
+
+str_t *str_create(); // allocates a zeroed str_t with no buffer yet; NULL if calloc fails
+void str_free(str_t *str); // frees the buffer and the str_t itself; NULL is accepted
+void str_reset(str_t *str); // truncates to the empty string; writes str->str[0], so the buffer must already exist
+int str_append_ch(str_t *str, uint32_t ch); // appends ch encoded as UTF-8; returns the bytes added, or -1 on failure
+
+#endif
diff --git a/include/unicode.h b/include/unicode.h
@@ -0,0 +1,43 @@
+#ifndef _SCDOC_UNICODE_H
+#define _SCDOC_UNICODE_H
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+// The original UTF-8 design allowed sequences of up to 6 bytes, but no Unicode
+// code point needs more than 4, so 4 is the largest size buffered here.
+#define UTF8_MAX_SIZE 4
+
+#define UTF8_INVALID 0x80 // returned for EOF, short reads and bad lead bytes. NOTE(review): also the decoded value of U+0080 (bytes C2 80)
+
+/**
+ * Decodes the UTF-8 character at *str and advances *str past the bytes consumed. Returns UTF8_INVALID for a bad lead byte.
+ */
+uint32_t utf8_decode(const char **str);
+
+/**
+ * Encodes ch as UTF-8 into str (no NUL is written) and returns the number of bytes written.
+ */
+size_t utf8_encode(char *str, uint32_t ch);
+
+/**
+ * Returns the size in bytes of the UTF-8 character starting at str, judged from its first byte, or -1 if that byte is not a lead byte.
+ */
+int utf8_size(const char *str);
+
+/**
+ * Returns the number of bytes needed to encode ch as UTF-8
+ */
+size_t utf8_chsize(uint32_t ch);
+
+/**
+ * Reads and returns the next character from the file, or UTF8_INVALID at EOF or when the sequence is cut short.
+ */
+uint32_t utf8_fgetch(FILE *f);
+
+/**
+ * Writes this character to the file as UTF-8 and returns the number of bytes written.
+ */
+size_t utf8_fputch(FILE *f, uint32_t ch);
+
+#endif
diff --git a/include/util.h b/include/util.h
@@ -0,0 +1,16 @@
+#ifndef _SCDOC_PARSER_H
+#define _SCDOC_PARSER_H
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+
+struct parser { // State shared by the parsing and output helpers
+ FILE *input, *output; // characters are read from input; roff is written to output
+ int line, col; // current input position, reported by parser_fatal
+};
+
+void parser_fatal(struct parser *parser, const char *err); // prints err with line:col to stderr, closes both streams, then exit(1)
+uint32_t parser_getch(struct parser *parser); // reads one character from parser->input and updates line/col
+int roff_macro(struct parser *p, char *cmd, ...); // emits .cmd plus each const char * argument in double quotes; the list must end with NULL
+
+#endif
diff --git a/meson.build b/meson.build
@@ -0,0 +1,29 @@
+# TODO: Just use a makefile
+project(
+ 'scdoc',
+ 'c',
+ license: 'MIT',
+ meson_version: '>=0.43.0',
+ default_options: [
+ 'c_std=c99',
+ 'warning_level=2',
+ 'werror=true', # any compiler warning fails the build
+ ],
+)
+
+add_project_arguments('-Wno-unused-parameter', language: 'c') # main() takes argv but never reads it
+
+executable(
+ 'scdoc', [
+ 'src/main.c',
+ 'src/string.c',
+ 'src/utf8_chsize.c',
+ 'src/utf8_decode.c',
+ 'src/utf8_encode.c',
+ 'src/utf8_fgetch.c',
+ 'src/utf8_fputch.c',
+ 'src/utf8_size.c',
+ 'src/util.c',
+ ],
+ include_directories: include_directories('include') # string.h, unicode.h and util.h live here
+)
diff --git a/scdoc.5.scd b/scdoc.5.scd
@@ -0,0 +1,80 @@
+scdoc(5)
+
+# NAME
+
+scdoc - syntax description for scdoc markup language
+
+# DESCRIPTION
+
+scdoc is a tool designed to make the process of writing man pages more
+friendly. It converts scdoc files into roff macros, which can then be converted
+to man pages or a number of other formats. The syntax is inspired by, but not
+directly taken from, markdown. Input files *must* use the UTF-8 encoding.
+
+# PREAMBLE
+
+Each scdoc file must begin with the following preamble:
+
+ *name*(_section_)
+
+The *name* is the name of the man page you are writing, and _section_ is the
+section you're writing for (see *man*(1) for information on manual sections).
+
+# SECTION HEADERS
+
+Each section of your man page should begin with something similar to the
+following:
+
+ # HEADER NAME
+
+Subsection headers are also understood - use two hashes. Each header must have
+an empty line on either side.
+
+# PARAGRAPHS
+
+Begin a new paragraph with an empty line.
+
+# FORMATTING
+
+Text can be made *bold* or _underlined_ with asterisks and underscores: \*bold\*
+or \_underlined\_.
+
+# INDENTATION
+
+You may indent lines with tab characters ("\t"); they are indented by 4 spaces in
+the output. Indented lines may not contain headers.
+
+# LISTS
+
+You may start bulleted lists with dashes, like so:
+
+```
+- Item 1
+- Item 2
+- Item 3
+```
+
+You may also use numbered lists like so:
+
+```
+1. Item 1
+2. Item 2
+3. Item 3
+```
+
+# LITERAL TEXT
+
+You may turn off scdoc formatting and output literal text with escape codes and
+literal blocks. Inserting a \\ into your source will cause the subsequent symbol
+to be treated as a literal and copied directly to the output. You may also make
+blocks of literal syntax like so:
+
+```
+\`\`\`
+_This formatting_ will *not* be interpreted by scdoc.
+\`\`\`
+```
+
+These blocks will be indented one level. Note that literal text is shown
+literally in the man viewer - that is, it's not a means for inserting your own
+roff macros into the output.
diff --git a/src/main.c b/src/main.c
@@ -0,0 +1,95 @@
+#include <assert.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include "string.h"
+#include "unicode.h"
+#include "util.h"
+
+char date[256]; // formatted by main() with strftime %F and passed as the date argument of the TH macro
+
+static int parse_section(struct parser *p) { // parses the digits and ")" that follow "(" in the preamble; returns the section, 1-9
+	str_t *section = str_create();
+	if (!section) parser_fatal(p, "Out of memory");
+	uint32_t ch;
+	while ((ch = parser_getch(p)) != UTF8_INVALID) {
+		if (ch < 0x80 && isdigit((int)ch)) { // <ctype.h> is only defined for unsigned char values and EOF
+			if (str_append_ch(section, ch) == -1) parser_fatal(p, "Out of memory"); // not assert(): NDEBUG would drop the call
+		} else if (ch == ')') {
+			if (!section->str) break; // "()" with no digits: report the missing section below
+			long sec = strtol(section->str, NULL, 10); // keep the long so an oversized number cannot wrap into range
+			if (sec < 1 || sec > 9) {
+				parser_fatal(p, "Expected section between 1 and 9");
+				break;
+			}
+			str_free(section);
+			return (int)sec;
+		} else {
+			parser_fatal(p, "Expected digit or )");
+			break;
+		}
+	}
+	str_free(section);
+	parser_fatal(p, "Expected manual section"); // exits; the return below only satisfies the compiler
+	return -1;
+}
+
+static void parse_preamble(struct parser *p) { // parses the "name(section)" line and emits the matching TH macro
+	str_t *name = str_create();
+	if (!name) parser_fatal(p, "Out of memory");
+	int section = -1;
+	uint32_t ch;
+	while ((ch = parser_getch(p)) != UTF8_INVALID) { // EOF before the newline ends the loop without emitting TH
+		if (ch < 0x80 && isalnum((int)ch)) { // <ctype.h> is only defined for unsigned char values and EOF
+			if (str_append_ch(name, ch) == -1) parser_fatal(p, "Out of memory"); // not assert(): NDEBUG would drop the call
+		} else if (ch == '(') {
+			section = parse_section(p);
+		} else if (ch == '\n') {
+			if (name->len == 0) {
+				parser_fatal(p, "Expected preamble");
+			}
+			if (section == -1) {
+				parser_fatal(p, "Expected manual section");
+			}
+			char sec[2] = { '0' + section, 0 }; // section is 1-9 here, so a single digit suffices
+			roff_macro(p, "TH", name->str, sec, date, NULL);
+			break;
+		}
+	}
+	str_free(name);
+}
+
+static void output_preamble(struct parser *p) { // writes the fixed roff header that precedes all generated content
+	// TODO: Add version here
+	fprintf(p->output, ".\\\" Generated by scdoc\n");
+	fprintf(p->output, ".\\\" Fix weird quotation marks:\n");
+	fprintf(p->output, ".\\\" http://bugs.debian.org/507673\n");
+	fprintf(p->output, ".\\\" http://lists.gnu.org/archive/html/groff/2009-02/msg00013.html\n");
+	fprintf(p->output, ".ie \\n(.g .ds Aq \\(aq\n");
+	fprintf(p->output, ".el .ds Aq '\n");
+	fprintf(p->output, ".\\\" Disable hyphenation:\n");
+	roff_macro(p, "nh", NULL);
+	fprintf(p->output, ".\\\" Generated content:\n");
+}
+
+int main(int argc, char **argv) { // filter: scdoc source on stdin, roff on stdout; takes no arguments
+	if (argc > 1) {
+		fprintf(stderr, "Usage: scdoc < input.scd > output.roff\n");
+		return 1;
+	}
+	time_t now = time(NULL);
+	struct tm *now_tm = localtime(&now);
+	// localtime() may return NULL; date is zero-initialized, so it then stays empty
+	if (now_tm) strftime(date, sizeof(date), "%F", now_tm);
+	struct parser p = {
+		.input = stdin,
+		.output = stdout,
+		.line = 1,
+		.col = 1
+	};
+	output_preamble(&p);
+	parse_preamble(&p);
+	return 0;
+}
diff --git a/src/string.c b/src/string.c
@@ -0,0 +1,55 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include "string.h"
+#include "unicode.h"
+
+// Lazily allocates the initial 16-byte buffer; on failure str->str stays NULL and size is 0.
+static void sanity_check(str_t *str) {
+	if (str->str != NULL) return;
+	str->str = malloc(16);
+	str->size = str->str ? 16 : 0;
+	str->len = 0;
+	if (str->str) str->str[0] = '\0';
+}
+
+// Grows the buffer until len bytes plus a NUL fit; returns 1 on success, 0 on allocation failure.
+static int ensure_capacity(str_t *str, size_t len) {
+	if (!str->str) return 0; // sanity_check could not allocate
+	while (len + 1 >= str->size) { // loop, because one doubling need not be enough for an arbitrary len
+		char *grown = realloc(str->str, str->size * 2);
+		if (!grown) return 0; // the old buffer is untouched and still owned by str
+		str->str = grown;
+		str->size *= 2;
+	}
+	return 1;
+}
+
+str_t *str_create(void) { // returns a zeroed str_t (its buffer is allocated on first append), or NULL if out of memory
+	return calloc(1, sizeof(str_t)); // (count, size) order; the swapped form trips GCC 14's -Wcalloc-transposed-args
+}
+
+void str_free(str_t *str) { // releases the buffer and the str_t itself
+ if (!str) return; // NULL is accepted, as with free()
+ free(str->str);
+ free(str);
+}
+
+void str_reset(str_t *str) { // truncates to the empty string, keeping any buffer already allocated
+	str->len = 0;
+	if (str->str) str->str[0] = '\0'; // the buffer is NULL until the first append
+}
+
+int str_append_ch(str_t *str, uint32_t ch) { // appends ch encoded as UTF-8; returns the bytes appended, or -1 on failure
+ int size = utf8_chsize(ch);
+ if (size <= 0) {
+ return -1;
+ }
+ sanity_check(str); // allocates the initial buffer on first use
+ if (!ensure_capacity(str, str->len + size)) {
+ return -1;
+ }
+ utf8_encode(&str->str[str->len], ch); // write the encoded bytes at the current end
+ str->len += size;
+ str->str[str->len] = '\0'; // keep the buffer NUL-terminated
+ return size;
+}
diff --git a/src/utf8_chsize.c b/src/utf8_chsize.c
@@ -0,0 +1,14 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+size_t utf8_chsize(uint32_t ch) { // byte length of ch in UTF-8, using the same thresholds as utf8_encode
+	if (ch >= 0x10000) {
+		return 4;
+	} else if (ch >= 0x800) {
+		return 3;
+	} else if (ch >= 0x80) {
+		return 2;
+	}
+	return 1;
+}
diff --git a/src/utf8_decode.c b/src/utf8_decode.c
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+uint8_t masks[] = { // payload-bit mask for the lead byte, indexed by sequence length minus 1
+ 0x7F,
+ 0x1F,
+ 0x0F,
+ 0x07,
+ 0x03,
+ 0x01
+};
+
+uint32_t utf8_decode(const char **char_str) { // decodes one character at *char_str and advances the pointer past it
+ uint8_t **s = (uint8_t **)char_str; // read the bytes as unsigned so the comparisons below see 0..255
+
+ uint32_t cp = 0;
+ if (**s < 128) {
+ // shortcut
+ cp = **s;
+ ++*s;
+ return cp;
+ }
+ int size = utf8_size((char *)*s);
+ if (size == -1) {
+ ++*s; // step over just the offending byte
+ return UTF8_INVALID;
+ }
+ uint8_t mask = masks[size - 1];
+ cp = **s & mask; // payload bits of the lead byte
+ ++*s;
+ while (--size) { // NOTE(review): continuation bytes are not validated, so a truncated sequence reads past the terminating NUL
+ cp <<= 6;
+ cp |= **s & 0x3f; // each following byte contributes its low 6 bits
+ ++*s;
+ }
+ return cp;
+}
diff --git a/src/utf8_encode.c b/src/utf8_encode.c
@@ -0,0 +1,30 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+size_t utf8_encode(char *str, uint32_t ch) { // writes ch as UTF-8 into str (no NUL) and returns the byte count, 1-4
+	uint8_t lead; // length marker bits OR-ed into the first byte
+	size_t n;
+
+	if (ch >= 0x10000) {
+		lead = 0xf0;
+		n = 4;
+	} else if (ch >= 0x800) {
+		lead = 0xe0;
+		n = 3;
+	} else if (ch >= 0x80) {
+		lead = 0xc0;
+		n = 2;
+	} else {
+		lead = 0;
+		n = 1;
+	}
+
+	for (char *out = str + n - 1; out > str; --out) { // fill continuation bytes from the last one backwards
+		*out = (ch & 0x3f) | 0x80;
+		ch >>= 6;
+	}
+
+	*str = ch | lead; // whatever bits remain belong to the lead byte
+	return n;
+}
diff --git a/src/utf8_fgetch.c b/src/utf8_fgetch.c
@@ -0,0 +1,21 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+
+uint32_t utf8_fgetch(FILE *f) { // reads one UTF-8 character from f; UTF8_INVALID at EOF or on a malformed/truncated sequence
+	char buffer[UTF8_MAX_SIZE];
+	int c = fgetc(f);
+	if (c == EOF) {
+		return UTF8_INVALID;
+	}
+	buffer[0] = (char)c;
+	int size = utf8_size(buffer); // looks at the lead byte only
+	if (size > UTF8_MAX_SIZE) { // utf8_size can report 5 or 6; reading that many would overflow buffer
+		return UTF8_INVALID;
+	}
+	if (size > 1 && fread(&buffer[1], 1, size - 1, f) != (size_t)(size - 1)) {
+		return UTF8_INVALID; // input ended in the middle of the sequence
+	}
+	const char *ptr = buffer;
+	return utf8_decode(&ptr); // also maps size == -1 (bad lead byte) to UTF8_INVALID
+}
diff --git a/src/utf8_fputch.c b/src/utf8_fputch.c
@@ -0,0 +1,10 @@
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+
+size_t utf8_fputch(FILE *f, uint32_t ch) {
+	// Encode into a scratch buffer, then write exactly the encoded bytes.
+	char encoded[UTF8_MAX_SIZE];
+	size_t n = utf8_encode(encoded, ch);
+	return fwrite(encoded, 1, n, f);
+}
diff --git a/src/utf8_size.c b/src/utf8_size.c
@@ -0,0 +1,27 @@
+#include <stdint.h>
+#include <stddef.h>
+#include "unicode.h"
+
+struct { // lead-byte patterns, tried in order: (c & mask) == result means a sequence of that many octets
+	uint8_t mask;
+	uint8_t result;
+	int octets;
+} sizes[] = {
+	{ 0x80, 0x00, 1 },
+	{ 0xE0, 0xC0, 2 },
+	{ 0xF0, 0xE0, 3 },
+	{ 0xF8, 0xF0, 4 },
+	{ 0xFC, 0xF8, 5 }, // 5 and 6 exceed UTF8_MAX_SIZE; callers that buffer a character must reject them
+	{ 0xFE, 0xFC, 6 }, // result was 0xF8, which the row above already matched, so 6 was unreachable
+	{ 0x80, 0x80, -1 }, // anything else with the high bit set: continuation byte, 0xFE or 0xFF
+};
+
+int utf8_size(const char *s) { // returns the sequence length announced by the lead byte *s, or -1 if it is not a lead byte
+	uint8_t c = (uint8_t)*s;
+	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); ++i) { // element count, not byte count / 2
+		if ((c & sizes[i].mask) == sizes[i].result) {
+			return sizes[i].octets;
+		}
+	}
+	return -1;
+}
diff --git a/src/util.c b/src/util.c
@@ -0,0 +1,50 @@
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "unicode.h"
+#include "util.h"
+
+void parser_fatal(struct parser *parser, const char *err) { // reports err with the current input position and terminates the process
+ fprintf(stderr, "Error at %d:%d: %s\n",
+ parser->line, parser->col, err);
+ fclose(parser->input);
+ fclose(parser->output); // closing also flushes whatever roff was already written
+ exit(1); // never returns
+}
+
+uint32_t parser_getch(struct parser *parser) { // reads one character from the input and keeps line/col in step
+	uint32_t ch = utf8_fgetch(parser->input);
+	if (ch != '\n') {
+		++parser->col;
+		return ch;
+	}
+	parser->col = 0; // a newline starts the next line at column 0
+	++parser->line;
+	return ch;
+}
+
+int roff_macro(struct parser *p, char *cmd, ...) { // writes .cmd, then each argument in double quotes, then a newline; returns the running byte count l + 1
+ FILE *f = p->output;
+ int l = fprintf(f, ".%s", cmd);
+ va_list ap;
+ va_start(ap, cmd);
+ const char *arg;
+ while ((arg = va_arg(ap, const char *))) { // the variadic list must be terminated with NULL
+ fputc(' ', f);
+ fputc('"', f);
+ while (*arg) {
+ uint32_t ch = utf8_decode(&arg);
+ if (ch == '"') {
+ fputc('\\', f); // NOTE(review): in roff a backslash followed by a double quote starts a comment; confirm this escapes the quote as intended
+ ++l;
+ }
+ l += utf8_fputch(f, ch);
+ }
+ fputc('"', f);
+ l += 3;
+ }
+ va_end(ap);
+ fputc('\n', f);
+ return l + 1;
+}