Freeciv-3.1
Loading...
Searching...
No Matches
fc_utf8.c
Go to the documentation of this file.
1/***********************************************************************
2 Freeciv - Copyright (C) 1996 - A Kjeldberg, L Gregersen, P Unold
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2, or (at your option)
6 any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12***********************************************************************/
13
14#ifdef HAVE_CONFIG_H
15#include <fc_config.h>
16#endif
17
18#include <stdarg.h>
19#include <string.h>
20
21/* utility */
22#include "log.h"
23#include "mem.h"
24#include "support.h"
25
26#include "fc_utf8.h"
27
28
29/* The length of a character for external use (at least 1 to avoid infinite
30 * loops). See also fc_ut8_next_char(). */
/* Lookup table indexed by the first byte of a sequence, giving how many
 * bytes to advance to reach the next character. Every entry is at least 1:
 * continuation bytes (0x80-0xBF) and lead bytes that are not supported in
 * the current configuration map to 1 instead of 0, so a caller stepping
 * through a string always makes progress. When USE_6_BYTES_CHAR is defined
 * the 5- and 6-byte lead bytes (0xF8-0xFD) are given their full length. */
31const char fc_utf8_skip[256] = {
32 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
34 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
35 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
36 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
37 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
38 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
39 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
40 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10000000 to 10001111. */
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10010000 to 10011111. */
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10100000 to 10101111. */
43 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10110000 to 10111111. */
44 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
45 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
46 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
47#ifdef USE_6_BYTES_CHAR
48 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 /* 11110000 to 11111111. */
49#else
50 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1 /* 11110000 to 11111111. */
51#endif /* USE_6_BYTES_CHAR */
52};
53
54/* The length of a character for internal use (0 means an invalid start of
55 * a character). */
/* Lookup table indexed by the first byte of a sequence, giving the total
 * byte length that lead byte announces. 0 marks a byte that cannot start a
 * character: continuation bytes (0x80-0xBF) and lead bytes not supported in
 * the current configuration. The table only classifies the lead byte; the
 * trailing bytes are checked separately by base_fc_utf8_char_validate().
 * NOTE(review): 0xC0/0xC1 are listed as 2-byte leads and 0xF5-0xF7 as 4-byte
 * leads, although RFC 3629 excludes them -- confirm whether accepting them
 * is intentional. */
56static const char fc_utf8_char_size[256] = {
57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
60 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
61 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
62 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
63 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
64 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10000000 to 10001111. */
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10010000 to 10011111. */
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10100000 to 10101111. */
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10110000 to 10111111. */
69 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
70 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
71 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
72#ifdef USE_6_BYTES_CHAR
73 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 /* 11110000 to 11111111. */
74#else
75 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 /* 11110000 to 11111111. */
76#endif /* USE_6_BYTES_CHAR */
77};
78
/* Announced byte length of the character starting at 'utf8_char' (0 when
 * that byte cannot start a character). The argument is evaluated once and
 * is parenthesized so that pointer expressions such as 'p + 1' index the
 * table with the byte at that address rather than with '*p + 1'. The byte
 * is read as unsigned char so values >= 0x80 never yield a negative index,
 * and the cast keeps the pointee const. */
#define FC_UTF8_CHAR_SIZE(utf8_char) \
  (fc_utf8_char_size[*(const unsigned char *) (utf8_char)])
81
/* UTF-8 byte sequence substituted for an invalid character. Because this is
 * a string literal, sizeof(FC_UTF8_REP_CHAR) is 4 (the three bytes below
 * plus the implicit '\0'); the character itself is 3 bytes long. */
82#define FC_UTF8_REP_CHAR "\xef\xbf\xbd" /* U+FFFD. */
83
84
85/************************************************************************/
89static inline bool base_fc_utf8_char_validate(const char *utf8_char,
90 char size)
91{
92 if (1 < size) {
93 do {
94 utf8_char++;
95 if (0x80 != (0xC0 & *(const unsigned char *)utf8_char)) {
96 /* Not a valid byte of the sequence. */
97 return FALSE;
98 }
99 size--;
100 } while (1 < size);
101 return TRUE;
102 } else {
103 return (1 == size);
104 }
105}
106
107/************************************************************************/
/*
 * Copy 'src' into 'dest' (a buffer of 'n' bytes), stopping before the first
 * invalid UTF-8 character or before the first character that would not fit.
 * The result is always '\0'-terminated when 'n' is at least 1.
 *
 * Returns strlen(src), following the strlcpy() convention, so the caller can
 * compare it with 'n' to detect that not everything was copied.
 *
 * At most 'n' - 1 bytes of 'src' are considered, which keeps one byte free
 * for the terminator. Validating the full 'n' bytes would let the copied
 * length reach 'n' and put the terminator one byte past the buffer.
 */
static inline size_t base_fc_utf8_strlcpy_trunc(char *dest, const char *src,
                                                size_t n)
{
  const char *end;
  size_t len;

  if (0 == n) {
    /* No room even for the terminator: leave 'dest' untouched. */
    return strlen(src);
  }

  /* 'end' is set whether or not validation succeeds; only the position of
   * the first byte that cannot be copied matters here. */
  (void) fc_utf8_validate_len(src, n - 1, &end);
  len = end - src;
  fc_assert(len < n);
  if (0 < len) {
    memcpy(dest, src, len);
  }
  dest[len] = '\0';
  return strlen(src);
}
125
126/************************************************************************/
129static inline size_t base_fc_utf8_strlcpy_rep(char *dest, const char *src,
130 size_t n)
131{
132 const char *end;
133 size_t src_len, len;
134
135 fc_assert_ret_val(NULL != src, 0);
136
137 src_len = strlen(src);
138 while (TRUE) {
139 if (fc_utf8_validate_len(src, n, &end)) {
140 /* Valid UTF-8. */
141 len = end - src;
142
143 fc_assert(len < n);
144
145 if (0 < len) {
146 memcpy(dest, src, len);
147 }
148 dest[len] = '\0'; /* Valid UTF-8 string part. */
149 return src_len;
150 } else {
151 /* '*end' is not a valid UTF-8 character. */
152 len = end - src;
153
154 fc_assert(len < n);
155
156 if (0 < len) {
157 memcpy(dest, src, len);
158 }
159
160 n -= len;
161 dest += len;
162
163 /* Try to insert the replacement character. */
164 len = sizeof(FC_UTF8_REP_CHAR);
165 if (n > len) {
166 memcpy(dest, FC_UTF8_REP_CHAR, len);
167 n -= len;
168 dest += len;
169 }
170
171 if (1 == n) {
172 *dest = '\0';
173 return src_len; /* End of 'dest' reached. */
174 }
175
176 /* Jump to next character in src. */
177 src = fc_utf8_find_next_char(end);
178 if (src == NULL || *src == '\0') {
179 *dest = '\0';
180 return src_len; /* End of 'src' reached. */
181 }
182 }
183 }
184 fc_assert(FALSE); /* Shouldn't occur! */
185 return src_len;
186}
187
188
189/************************************************************************/
197
198/************************************************************************/
/*
 * Return a pointer to the first byte after 'utf8_char' that can start a
 * character, i.e. whose entry in fc_utf8_char_size[] is not 0.
 *
 * The search always advances by at least one byte and does not look at the
 * byte 'utf8_char' itself points to. A '\0' terminator counts as a valid
 * start, so the search stops there when called on a non-terminator byte.
 */
const char *fc_utf8_find_next_char(const char *utf8_char)
{
  const char *next = utf8_char + 1;

  while (0 == FC_UTF8_CHAR_SIZE(next)) {
    next++;
  }

  return next;
}
212
213/************************************************************************/
/*
 * Walk backwards from 'utf8_char' and return the closest preceding byte
 * that can start a character (non-zero entry in fc_utf8_char_size[]).
 *
 * 'utf8_string' is the lower bound of the search: it is returned, without
 * being inspected, when no earlier start byte is found after it.
 */
const char *fc_utf8_find_prev_char(const char *utf8_char,
                                   const char *utf8_string)
{
  const char *cursor = utf8_char - 1;

  while (cursor > utf8_string) {
    if (0 != FC_UTF8_CHAR_SIZE(cursor)) {
      return cursor;
    }
    cursor--;
  }

  return utf8_string;
}
231
232/************************************************************************/
239bool fc_utf8_validate(const char *utf8_string, const char **end)
240{
241 char size;
242
243 while ('\0' != *utf8_string) {
244 size = FC_UTF8_CHAR_SIZE(utf8_string);
245 if (!base_fc_utf8_char_validate(utf8_string, size)) {
246 if (NULL != end) {
247 *end = utf8_string;
248 }
249 return FALSE;
250 }
251 utf8_string += size;
252 }
253 if (NULL != end) {
254 *end = utf8_string;
255 }
256
257 return TRUE;
258}
259
260/************************************************************************/
268bool fc_utf8_validate_len(const char *utf8_string, size_t byte_len,
269 const char **end)
270{
271 unsigned char size;
272
273 while ('\0' != *utf8_string) {
274 size = FC_UTF8_CHAR_SIZE(utf8_string);
275
276 if (!base_fc_utf8_char_validate(utf8_string, size)) {
277 if (end != NULL) {
278 *end = utf8_string;
279 }
280 return FALSE;
281 }
282
283 if (size > byte_len) {
284 if (end != NULL) {
285 *end = utf8_string;
286 }
287 return FALSE;
288 } else {
289 byte_len -= size;
290 }
291
292 utf8_string += size;
293 }
294
295 if (end != NULL) {
296 *end = utf8_string;
297 }
298
299 return TRUE;
300}
301
302/************************************************************************/
/*
 * Make 'utf8_string' valid in place by cutting it at the first character
 * rejected by fc_utf8_validate(). A string that already validates is left
 * untouched.
 *
 * Returns 'utf8_string'.
 */
char *fc_utf8_validate_trunc(char *utf8_string)
{
  const char *invalid;

  if (fc_utf8_validate(utf8_string, &invalid)) {
    return utf8_string;
  }

  /* Terminate the string where the first bad character begins. */
  utf8_string[invalid - utf8_string] = '\0';

  return utf8_string;
}
319
320/************************************************************************/
/*
 * Make 'utf8_string' valid in place and no longer than 'byte_len' bytes, by
 * cutting it at the position where fc_utf8_validate_len() stops. A string
 * that already passes is left untouched.
 *
 * Returns 'utf8_string'.
 *
 * When the cut is caused by the length limit, the terminator can be written
 * at offset 'byte_len', so the buffer needs room for 'byte_len' + 1 bytes.
 */
char *fc_utf8_validate_trunc_len(char *utf8_string, size_t byte_len)
{
  const char *stop;

  if (fc_utf8_validate_len(utf8_string, byte_len, &stop)) {
    return utf8_string;
  }

  utf8_string[stop - utf8_string] = '\0';

  return utf8_string;
}
337
338/************************************************************************/
/*
 * Return a newly allocated copy of the leading part of 'utf8_string' that
 * passes fc_utf8_validate(); the copy ends just before the first rejected
 * character (or is a full copy when the string is valid).
 *
 * The result is obtained with fc_malloc() and is owned by the caller.
 */
char *fc_utf8_validate_trunc_dup(const char *utf8_string)
{
  const char *stop;
  size_t valid_len;
  char *copy;

  /* Only the stop position is needed, not the verdict. */
  (void) fc_utf8_validate(utf8_string, &stop);
  valid_len = stop - utf8_string;

  copy = fc_malloc(valid_len + 1); /* One more byte for '\0'. */
  memcpy(copy, utf8_string, valid_len);
  copy[valid_len] = '\0';

  return copy;
}
359
360/************************************************************************/
/*
 * Rewrite 'utf8_string' in place, within a buffer of 'byte_len' bytes, with
 * invalid sequences substituted by base_fc_utf8_strlcpy_rep(). Nothing is
 * done when 'byte_len' is 0.
 *
 * Returns 'utf8_string'.
 *
 * The string is first copied to a temporary array because the rewritten
 * text can be longer than the original and would otherwise overwrite
 * bytes not yet read.
 * NOTE(review): the temporary is a variable-length array of 'byte_len'
 * bytes on the stack -- confirm callers never pass large sizes.
 */
char *fc_utf8_validate_rep_len(char *utf8_string, size_t byte_len)
{
  if (0 == byte_len) {
    return utf8_string;
  }

  {
    char scratch[byte_len];

    fc_strlcpy(scratch, utf8_string, byte_len);
    base_fc_utf8_strlcpy_rep(utf8_string, scratch, byte_len);
  }

  return utf8_string;
}
379
380/************************************************************************/
386char *fc_utf8_validate_rep_dup(const char *utf8_string)
387{
388 char *ret;
389 const char *utf8_char;
390 size_t size = 1; /* '\0'. */
391 char char_size;
392
393 /* Check needed size. */
394 utf8_char = utf8_string;
395 while ('\0' != *utf8_char) {
396 char_size = FC_UTF8_CHAR_SIZE(utf8_char);
397 if (base_fc_utf8_char_validate(utf8_char, char_size)) {
398 /* Normal valid character. */
399 size += char_size;
400 utf8_char += char_size;
401 } else {
402 /* Replacement character. */
403 size += sizeof(FC_UTF8_REP_CHAR);
404 /* Find next character. */
405 do {
406 utf8_char++;
407 } while (0 == FC_UTF8_CHAR_SIZE(utf8_char));
408 }
409 }
410
411 /* Do the allocation. */
412 ret = fc_malloc(size);
413 base_fc_utf8_strlcpy_rep(ret, utf8_string, size);
414
415 return ret;
416}
417
418/************************************************************************/
/*
 * Count the characters (not bytes) of 'utf8_string' by stepping through it
 * with fc_ut8_next_char() until the terminator is reached.
 *
 * NOTE(review): nothing is validated here; presumably the string must
 * already be valid UTF-8, since a truncated multi-byte character at the
 * end could step past the terminator -- confirm against the header.
 */
size_t fc_utf8_strlen(const char *utf8_string)
{
  const char *cursor = utf8_string;
  size_t count = 0;

  while ('\0' != *cursor) {
    cursor = fc_ut8_next_char(cursor);
    count++;
  }

  return count;
}
435
436/************************************************************************/
/*
 * strlcpy()-like copy of 'src' into 'dest' (size 'n') that stops at the
 * first invalid UTF-8 character; see base_fc_utf8_strlcpy_trunc().
 *
 * Returns the value of the base helper. When 'n' is 0 the assertion macro
 * returns -1, which becomes the largest size_t value.
 */
size_t fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
{
  size_t src_len;

  fc_assert_ret_val(0 < n, -1);

  src_len = base_fc_utf8_strlcpy_trunc(dest, src, n);

  return src_len;
}
449
450/************************************************************************/
/*
 * strlcpy()-like copy of 'src' into 'dest' (size 'n') that substitutes
 * invalid UTF-8 sequences; see base_fc_utf8_strlcpy_rep().
 *
 * Returns the value of the base helper. When 'n' is 0 the assertion macro
 * returns -1, which becomes the largest size_t value.
 */
size_t fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
{
  size_t src_len;

  fc_assert_ret_val(0 < n, -1);

  src_len = base_fc_utf8_strlcpy_rep(dest, src, n);

  return src_len;
}
463
464/************************************************************************/
/*
 * strlcat()-like append of 'src' to the string already in 'dest', where 'n'
 * is the total size of 'dest'. The appended part stops at the first invalid
 * UTF-8 character; see base_fc_utf8_strlcpy_trunc().
 *
 * Returns the current length of 'dest' plus the value of the base helper.
 * When 'n' is 0, or 'dest' already holds 'n' or more bytes before its
 * terminator, the assertion macro returns -1 (the largest size_t value).
 */
size_t fc_utf8_strlcat_trunc(char *dest, const char *src, size_t n)
{
  size_t len;

  fc_assert_ret_val(0 < n, -1);

  len = strlen(dest);
  fc_assert_ret_val(len < n, -1);

  /* Continue writing at the existing terminator, with what is left. */
  dest += len;
  n -= len;

  return len + base_fc_utf8_strlcpy_trunc(dest, src, n);
}
485
486/************************************************************************/
/*
 * strlcat()-like append of 'src' to the string already in 'dest', where 'n'
 * is the total size of 'dest'. Invalid UTF-8 sequences of 'src' are
 * substituted; see base_fc_utf8_strlcpy_rep().
 *
 * Returns the current length of 'dest' plus the value of the base helper.
 * When 'n' is 0, or 'dest' already holds 'n' or more bytes before its
 * terminator, the assertion macro returns -1 (the largest size_t value).
 */
size_t fc_utf8_strlcat_rep(char *dest, const char *src, size_t n)
{
  size_t len;

  fc_assert_ret_val(0 < n, -1);

  len = strlen(dest);
  fc_assert_ret_val(len < n, -1);

  /* Continue writing at the existing terminator, with what is left. */
  dest += len;
  n -= len;

  return len + base_fc_utf8_strlcpy_rep(dest, src, n);
}
507
508/************************************************************************/
/*
 * Variadic front end of fc_utf8_vsnprintf_trunc(): format into 'str' (size
 * 'n') and cut the result at the first invalid UTF-8 character.
 *
 * Returns whatever fc_utf8_vsnprintf_trunc() returns.
 */
int fc_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  int written;

  va_start(ap, format);
  written = fc_utf8_vsnprintf_trunc(str, n, format, ap);
  va_end(ap);

  return written;
}
526
527/************************************************************************/
/*
 * Variadic front end of fc_utf8_vsnprintf_rep(): format into 'str' (size
 * 'n') and substitute invalid UTF-8 sequences in the result.
 *
 * Returns whatever fc_utf8_vsnprintf_rep() returns.
 */
int fc_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  int written;

  va_start(ap, format);
  written = fc_utf8_vsnprintf_rep(str, n, format, ap);
  va_end(ap);

  return written;
}
544
545/************************************************************************/
/*
 * Format into 'str' (size 'n') with fc_vsnprintf(), then cut the result at
 * the first character rejected by fc_utf8_validate().
 *
 * Returns the fc_vsnprintf() result when the output is already valid.
 * Otherwise returns the length kept after the cut, except that a
 * fc_vsnprintf() result of -1 is passed through unchanged. When 'n' is 0
 * the assertion macro returns -1.
 */
int fc_utf8_vsnprintf_trunc(char *str, size_t n, const char *format,
                            va_list args)
{
  const char *invalid;
  int ret;

  fc_assert_ret_val(0 < n, -1);

  ret = fc_vsnprintf(str, n, format, args);

  if (!fc_utf8_validate(str, &invalid)) {
    /* Truncate at last valid UTF-8 character. */
    str[invalid - str] = '\0';
    if (-1 != ret) {
      ret = (int) (invalid - str);
    }
  }

  return ret;
}
570
571/************************************************************************/
/*
 * Format into 'str' (size 'n') with fc_vsnprintf(), then substitute invalid
 * UTF-8 sequences from the first rejected character onwards, using
 * fc_utf8_validate_rep_len() on the remaining part of the buffer.
 *
 * Returns the fc_vsnprintf() result when the output is already valid.
 * Otherwise returns the new strlen(str), except that a fc_vsnprintf()
 * result of -1 is passed through unchanged. When 'n' is 0 the assertion
 * macro returns -1.
 */
int fc_utf8_vsnprintf_rep(char *str, size_t n, const char *format,
                          va_list args)
{
  const char *invalid;
  int ret;

  fc_assert_ret_val(0 < n, -1);

  ret = fc_vsnprintf(str, n, format, args);

  if (fc_utf8_validate(str, &invalid)) {
    /* Already valid UTF-8. */
    return ret;
  }

  /* The valid prefix stays as it is; only the tail is rewritten, within
   * the space that follows the prefix. */
  (void) fc_utf8_validate_rep_len(str + (invalid - str), n - (invalid - str));

  if (-1 == ret) {
    return -1;
  }
  return (int) strlen(str);
}
595
596/************************************************************************/
/*
 * Append formatted text to the string already in 'str', where 'n' is the
 * total size of 'str'. The appended part is produced by
 * fc_utf8_vsnprintf_trunc(), i.e. cut at the first invalid UTF-8 character.
 *
 * Returns -1 if fc_utf8_vsnprintf_trunc() returned -1, otherwise its result
 * plus the previous length of 'str'. When 'n' is 0, or 'str' already holds
 * 'n' or more bytes before its terminator, the assertion macro returns -1.
 */
int cat_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  size_t len;
  int ret;

  fc_assert_ret_val(0 < n, -1);

  len = strlen(str);
  fc_assert_ret_val(len < n, -1);

  /* Format right at the existing terminator, into the space left. */
  va_start(ap, format);
  ret = fc_utf8_vsnprintf_trunc(str + len, n - len, format, ap);
  va_end(ap);

  if (-1 == ret) {
    return -1;
  }
  return (int) (ret + len);
}
623
624/************************************************************************/
/*
 * Append formatted text to the string already in 'str', where 'n' is the
 * total size of 'str'. The appended part is produced by
 * fc_utf8_vsnprintf_rep(), i.e. with invalid UTF-8 sequences substituted.
 *
 * Returns -1 if fc_utf8_vsnprintf_rep() returned -1, otherwise its result
 * plus the previous length of 'str'. When 'n' is 0, or 'str' already holds
 * 'n' or more bytes before its terminator, the assertion macro returns -1.
 */
int cat_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  size_t len;
  int ret;

  fc_assert_ret_val(0 < n, -1);

  len = strlen(str);
  fc_assert_ret_val(len < n, -1);

  /* Format right at the existing terminator, into the space left. */
  va_start(ap, format);
  ret = fc_utf8_vsnprintf_rep(str + len, n - len, format, ap);
  va_end(ap);

  if (-1 == ret) {
    return -1;
  }
  return (int) (ret + len);
}
#define str
Definition astring.c:76
#define n
Definition astring.c:77
#define FC_UTF8_CHAR_SIZE(utf8_char)
Definition fc_utf8.c:79
int fc_utf8_snprintf_rep(char *str, size_t n, const char *format,...)
Definition fc_utf8.c:534
size_t fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
Definition fc_utf8.c:457
size_t fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
Definition fc_utf8.c:443
static size_t base_fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
Definition fc_utf8.c:110
static bool base_fc_utf8_char_validate(const char *utf8_char, char size)
Definition fc_utf8.c:89
#define FC_UTF8_REP_CHAR
Definition fc_utf8.c:82
char * fc_utf8_validate_rep_len(char *utf8_string, size_t byte_len)
Definition fc_utf8.c:368
bool fc_utf8_char_validate(const char *utf8_char)
Definition fc_utf8.c:193
size_t fc_utf8_strlcat_rep(char *dest, const char *src, size_t n)
Definition fc_utf8.c:496
const char * fc_utf8_find_prev_char(const char *utf8_char, const char *utf8_string)
Definition fc_utf8.c:220
static const char fc_utf8_char_size[256]
Definition fc_utf8.c:56
char * fc_utf8_validate_trunc(char *utf8_string)
Definition fc_utf8.c:309
char * fc_utf8_validate_trunc_dup(const char *utf8_string)
Definition fc_utf8.c:345
int fc_utf8_vsnprintf_trunc(char *str, size_t n, const char *format, va_list args)
Definition fc_utf8.c:552
int fc_utf8_vsnprintf_rep(char *str, size_t n, const char *format, va_list args)
Definition fc_utf8.c:578
size_t fc_utf8_strlcat_trunc(char *dest, const char *src, size_t n)
Definition fc_utf8.c:474
char * fc_utf8_validate_rep_dup(const char *utf8_string)
Definition fc_utf8.c:386
const char * fc_utf8_find_next_char(const char *utf8_char)
Definition fc_utf8.c:204
bool fc_utf8_validate_len(const char *utf8_string, size_t byte_len, const char **end)
Definition fc_utf8.c:268
static size_t base_fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
Definition fc_utf8.c:129
size_t fc_utf8_strlen(const char *utf8_string)
Definition fc_utf8.c:425
int fc_utf8_snprintf_trunc(char *str, size_t n, const char *format,...)
Definition fc_utf8.c:515
int cat_utf8_snprintf_rep(char *str, size_t n, const char *format,...)
Definition fc_utf8.c:634
char * fc_utf8_validate_trunc_len(char *utf8_string, size_t byte_len)
Definition fc_utf8.c:327
const char fc_utf8_skip[256]
Definition fc_utf8.c:31
int cat_utf8_snprintf_trunc(char *str, size_t n, const char *format,...)
Definition fc_utf8.c:606
bool fc_utf8_validate(const char *utf8_string, const char **end)
Definition fc_utf8.c:239
#define fc_ut8_next_char(utf8_char)
Definition fc_utf8.h:42
#define fc_assert(condition)
Definition log.h:176
#define fc_assert_ret_val(condition, val)
Definition log.h:194
#define fc_malloc(sz)
Definition mem.h:34
int len
Definition packhand.c:125
size_t size
Definition specvec.h:72
size_t fc_strlcpy(char *dest, const char *src, size_t n)
Definition support.c:787
int fc_vsnprintf(char *str, size_t n, const char *format, va_list ap)
Definition support.c:896
#define TRUE
Definition support.h:46
#define FALSE
Definition support.h:47