utf8.c (2073B)
1 #include <stddef.h> 2 #include <stdint.h> 3 #include <stdio.h> 4 #include "utf8.h" 5 6 size_t 7 utf8_chsize(uint32_t ch) 8 { 9 if (ch < 0x80) 10 return 1; 11 else if (ch < 0x800) 12 return 2; 13 else if (ch < 0x10000) 14 return 3; 15 else 16 return 4; 17 } 18 19 uint8_t masks[] = { 20 0x7F, 21 0x1F, 22 0x0F, 23 0x07, 24 0x03, 25 0x01 26 }; 27 28 uint32_t 29 utf8_decode(const char **char_str) 30 { 31 uint8_t **s = (uint8_t **)char_str; 32 33 uint32_t cp = 0; 34 if (**s < 128) { 35 // shortcut 36 cp = **s; 37 ++*s; 38 return cp; 39 } 40 int size = utf8_size((char *)*s); 41 if (size == -1) { 42 ++*s; 43 return UTF8_INVALID; 44 } 45 uint8_t mask = masks[size - 1]; 46 cp = **s & mask; 47 ++*s; 48 while (--size) { 49 cp <<= 6; 50 cp |= **s & 0x3F; 51 ++*s; 52 } 53 return cp; 54 } 55 56 size_t 57 utf8_encode(char *str, uint32_t ch) 58 { 59 size_t len; 60 uint8_t first; 61 switch(len = utf8_chsize(ch)) { 62 case 1: 63 first = 0; 64 break; 65 case 2: 66 first = 0xC0; 67 break; 68 case 3: 69 first = 0xE0; 70 break; 71 default: 72 first = 0xF0; 73 break; 74 } 75 76 for (size_t i = len - 1; i > 0; --i) { 77 str[i] = (ch & 0x3F) | 0x80; 78 ch >>= 6; 79 } 80 81 str[0] = ch | first; 82 return len; 83 } 84 85 uint32_t 86 utf8_fgetch(FILE *f) 87 { 88 char buffer[UTF8_MAX_SIZE]; 89 int c; 90 if ((c = fgetc(f)) == EOF) 91 return UTF8_INVALID; 92 93 buffer[0] = (char)c; 94 int size = utf8_size(buffer); 95 96 if (size > UTF8_MAX_SIZE) { 97 fseek(f, size - 1, SEEK_CUR); 98 return UTF8_INVALID; 99 } else if (size > 1) { 100 int amt = fread(&buffer[1], 1, size - 1, f); 101 if (amt != size - 1) 102 return UTF8_INVALID; 103 } 104 105 const char *ptr = buffer; 106 return utf8_decode(&ptr); 107 } 108 109 size_t 110 utf8_putch(uint32_t ch) 111 { 112 char buffer[UTF8_MAX_SIZE]; 113 char *ptr = buffer; 114 size_t size = utf8_encode(ptr, ch); 115 return fwrite(&buffer, 1, size, stdout); 116 } 117 118 struct { 119 uint8_t mask; 120 uint8_t result; 121 int octets; 122 } sizes[] = { 123 { 0x80, 0x00, 1 }, 124 { 0xE0, 0xC0, 2 }, 125 { 0xF0, 0xE0, 3 }, 126 { 0xF8, 0xF0, 4 }, 127 { 0xFC, 0xF8, 5 }, 128 { 0xFE, 0xF8, 6 }, 129 { 0x80, 0x80, -1 }, 130 }; 131 132 int 133 utf8_size(const char *s) 134 { 135 uint8_t c = (uint8_t)*s; 136 for (size_t i = 0; i < sizeof(sizes) / 2; ++i) 137 if ((c & sizes[i].mask) == sizes[i].result) 138 return sizes[i].octets; 139 140 return -1; 141 }