scdoc2mdoc

A fork of scdoc to output mdoc(7)
git clone git://git.sgregoratto.me/scdoc2mdoc
Log | Files | Refs | README | LICENSE

utf8.c (2073B)


      1 #include <stddef.h>
      2 #include <stdint.h>
      3 #include <stdio.h>
      4 #include "utf8.h"
      5 
      6 size_t
      7 utf8_chsize(uint32_t ch)
      8 {
      9 	if (ch < 0x80)
     10 		return 1;
     11 	else if (ch < 0x800)
     12 		return 2;
     13 	else if (ch < 0x10000)
     14 		return 3;
     15 	else
     16 		return 4;
     17 }
     18 
     19 uint8_t masks[] = {
     20 	0x7F,
     21 	0x1F,
     22 	0x0F,
     23 	0x07,
     24 	0x03,
     25 	0x01
     26 };
     27 
     28 uint32_t
     29 utf8_decode(const char **char_str)
     30 {
     31 	uint8_t **s = (uint8_t **)char_str;
     32 
     33 	uint32_t cp = 0;
     34 	if (**s < 128) {
     35 		// shortcut
     36 		cp = **s;
     37 		++*s;
     38 		return cp;
     39 	}
     40 	int size = utf8_size((char *)*s);
     41 	if (size == -1) {
     42 		++*s;
     43 		return UTF8_INVALID;
     44 	}
     45 	uint8_t mask = masks[size - 1];
     46 	cp = **s & mask;
     47 	++*s;
     48 	while (--size) {
     49 		cp <<= 6;
     50 		cp |= **s & 0x3F;
     51 		++*s;
     52 	}
     53 	return cp;
     54 }
     55 
     56 size_t
     57 utf8_encode(char *str, uint32_t ch)
     58 {
     59 	size_t len;
     60 	uint8_t first;
     61 	switch(len = utf8_chsize(ch)) {
     62 	case 1:
     63 		first = 0;
     64 		break;
     65 	case 2:
     66 		first = 0xC0;
     67 		break;
     68 	case 3:
     69 		first = 0xE0;
     70 		break;
     71 	default:
     72 		first = 0xF0;
     73 		break;
     74 	}
     75 
     76 	for (size_t i = len - 1; i > 0; --i) {
     77 		str[i] = (ch & 0x3F) | 0x80;
     78 		ch >>= 6;
     79 	}
     80 
     81 	str[0] = ch | first;
     82 	return len;
     83 }
     84 
     85 uint32_t
     86 utf8_fgetch(FILE *f)
     87 {
     88 	char buffer[UTF8_MAX_SIZE];
     89 	int c;
     90 	if ((c = fgetc(f)) == EOF)
     91 		return UTF8_INVALID;
     92 
     93 	buffer[0] = (char)c;
     94 	int size = utf8_size(buffer);
     95 
     96 	if (size > UTF8_MAX_SIZE) {
     97 		fseek(f, size - 1, SEEK_CUR);
     98 		return UTF8_INVALID;
     99 	} else if (size > 1) {
    100 		int amt = fread(&buffer[1], 1, size - 1, f);
    101 		if (amt != size - 1)
    102 			return UTF8_INVALID;
    103 	}
    104 
    105 	const char *ptr = buffer;
    106 	return utf8_decode(&ptr);
    107 }
    108 
    109 size_t
    110 utf8_putch(uint32_t ch)
    111 {
    112 	char buffer[UTF8_MAX_SIZE];
    113 	char *ptr = buffer;
    114 	size_t size = utf8_encode(ptr, ch);
    115 	return fwrite(&buffer, 1, size, stdout);
    116 }
    117 
    118 struct {
    119 	uint8_t mask;
    120 	uint8_t result;
    121 	int octets;
    122 } sizes[] = {
    123 	{ 0x80, 0x00,  1 },
    124 	{ 0xE0, 0xC0,  2 },
    125 	{ 0xF0, 0xE0,  3 },
    126 	{ 0xF8, 0xF0,  4 },
    127 	{ 0xFC, 0xF8,  5 },
    128 	{ 0xFE, 0xF8,  6 },
    129 	{ 0x80, 0x80, -1 },
    130 };
    131 
    132 int
    133 utf8_size(const char *s)
    134 {
    135 	uint8_t c = (uint8_t)*s;
    136 	for (size_t i = 0; i < sizeof(sizes) / 2; ++i)
    137 		if ((c & sizes[i].mask) == sizes[i].result)
    138 			return sizes[i].octets;
    139 
    140 	return -1;
    141 }