Freeciv-3.3
Loading...
Searching...
No Matches
fc_utf8.c
Go to the documentation of this file.
1/***********************************************************************
2 Freeciv - Copyright (C) 1996 - A Kjeldberg, L Gregersen, P Unold
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2, or (at your option)
6 any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12***********************************************************************/
13
14#ifdef HAVE_CONFIG_H
15#include <fc_config.h>
16#endif
17
18#include <stdarg.h>
19#include <string.h>
20
21/* utility */
22#include "log.h"
23#include "mem.h"
24#include "support.h"
25
26#include "fc_utf8.h"
27
28
29/* The length of a character for external use (at least 1 to avoid infinite
30 * loops). See also fc_ut8_next_char().
 *
 * One entry per possible byte value; the comment on each row gives the bit
 * patterns that row covers. Unlike fc_utf8_char_size[], the bytes that
 * cannot start a character (10xxxxxx, and the highest values of the last
 * row, whose extent depends on USE_6_BYTES_CHAR) get a length of 1 here
 * instead of 0. */
31const char fc_utf8_skip[256] = {
32 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
34 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
35 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
36 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
37 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
38 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
39 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
40 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10000000 to 10001111. */
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10010000 to 10011111. */
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10100000 to 10101111. */
43 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10110000 to 10111111. */
44 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
45 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
46 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
47#ifdef USE_6_BYTES_CHAR
48 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 /* 11110000 to 11111111. */
49#else
50 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1 /* 11110000 to 11111111. */
51#endif /* USE_6_BYTES_CHAR */
52};
53
54/* The length of a character for internal use (0 means an invalid start of
55 * a character).
 *
 * One entry per possible byte value, read through FC_UTF8_CHAR_SIZE() with
 * the first byte of a character. 10xxxxxx bytes are 0 because they are the
 * continuation bytes checked by base_fc_utf8_char_validate() and never
 * begin a character; the highest values of the last row are 0 as well,
 * their extent depending on USE_6_BYTES_CHAR. */
56static const char fc_utf8_char_size[256] = {
57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
60 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
61 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
62 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
63 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
64 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10000000 to 10001111. */
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10010000 to 10011111. */
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10100000 to 10101111. */
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10110000 to 10111111. */
69 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
70 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
71 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
72#ifdef USE_6_BYTES_CHAR
73 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 /* 11110000 to 11111111. */
74#else
75 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 /* 11110000 to 11111111. */
76#endif /* USE_6_BYTES_CHAR */
77};
78
/* Byte length of the character whose first byte is at 'utf8_char', looked
 * up in fc_utf8_char_size[] (0 means the byte cannot start a character).
 * The argument and the whole expansion are parenthesized so that pointer
 * expressions such as 'p + 1' index with the byte at that address rather
 * than with '*p + 1'. The argument is evaluated exactly once. */
#define FC_UTF8_CHAR_SIZE(utf8_char) \
  (fc_utf8_char_size[*(const unsigned char *) (utf8_char)])
81
82#define FC_UTF8_REP_CHAR "\xef\xbf\xbd" /* U+FFFD. */
83
84
85/************************************************************************/
89static inline bool base_fc_utf8_char_validate(const char *utf8_char,
90 char size)
91{
92 if (1 < size) {
93 do {
94 utf8_char++;
95 if (0x80 != (0xC0 & *(const unsigned char *)utf8_char)) {
96 /* Not a valid byte of the sequence. */
97 return FALSE;
98 }
99 size--;
100 } while (1 < size);
101 return TRUE;
102 } else {
103 return (1 == size);
104 }
105}
106
/************************************************************************//**
  Copy the beginning of 'src' into 'dest', a buffer of 'n' bytes including
  the terminating '\0'. The copy stops where fc_utf8_validate_len() stops:
  at the end of 'src', or before the first character it rejects or that
  would not fit entirely. 'dest' is always '\0'-terminated when 'n' is
  greater than 0.

  Returns strlen(src), so a result of 'n' or more tells the caller that
  the source was longer than the buffer.
****************************************************************************/
static inline size_t base_fc_utf8_strlcpy_trunc(char *dest, const char *src,
                                                size_t n)
{
  const char *end;
  size_t len;

  if (0 == n) {
    /* No room at all, not even for the terminator. */
    return strlen(src);
  }

  /* Only n - 1 bytes may hold characters: the last byte of 'dest' is
   * reserved for the terminator. Validating against 'n' itself would let
   * 'len' reach 'n' and put the '\0' one byte past the buffer. */
  (void) fc_utf8_validate_len(src, n - 1, &end);
  len = end - src;
  fc_assert(len < n);
  if (0 < len) {
    memcpy(dest, src, len);
  }
  dest[len] = '\0';

  return strlen(src);
}
125
126/************************************************************************/
129static inline size_t base_fc_utf8_strlcpy_rep(char *dest, const char *src,
130 size_t n)
131{
132 const char *end;
133 size_t src_len, len;
134
135 fc_assert_ret_val(src != nullptr, 0);
136
137 src_len = strlen(src);
138 while (TRUE) {
139 if (fc_utf8_validate_len(src, n, &end)) {
140 /* Valid UTF-8. */
141 len = end - src;
142
143 fc_assert(len < n);
144
145 if (0 < len) {
146 memcpy(dest, src, len);
147 }
148 dest[len] = '\0'; /* Valid UTF-8 string part. */
149 return src_len;
150 } else {
151 /* '*end' is not a valid UTF-8 character. */
152 len = end - src;
153
154 fc_assert(len < n);
155
156 if (0 < len) {
157 memcpy(dest, src, len);
158 }
159
160 n -= len;
161 dest += len;
162
163 /* Try to insert the replacement character. */
164 len = sizeof(FC_UTF8_REP_CHAR);
165 if (n > len) {
167 n -= len;
168 dest += len;
169 }
170
171 if (1 == n) {
172 *dest = '\0';
173 return src_len; /* End of 'dest' reached. */
174 }
175
176 /* Jump to next character in src. */
177 src = fc_utf8_find_next_char(end);
178 if (src == nullptr || *src == '\0') {
179 *dest = '\0';
180 return src_len; /* End of 'src' reached. */
181 }
182 }
183 }
184 fc_assert(FALSE); /* Shouldn't occur! */
185 return src_len;
186}
187
188
189/************************************************************************/
197
/************************************************************************//**
  Return a pointer to the first byte after 'utf8_char' that
  fc_utf8_char_size[] accepts as the start of a character. The search
  always moves forward by at least one byte, and it stops on a '\0'
  because that byte has a size of 1 in the table.

  The character found is not validated any further.
****************************************************************************/
const char *fc_utf8_find_next_char(const char *utf8_char)
{
  const char *next = utf8_char + 1;

  /* Skip the bytes that cannot start a character. */
  while (0 == FC_UTF8_CHAR_SIZE(next)) {
    next++;
  }

  return next;
}
212
213/************************************************************************/
220const char *fc_utf8_find_prev_char(const char *utf8_char,
221 const char *utf8_string)
222{
224 if (0 != FC_UTF8_CHAR_SIZE(utf8_char)) {
225 return utf8_char;
226 }
227 }
228
229 return utf8_string;
230}
231
232/************************************************************************/
239bool fc_utf8_validate(const char *utf8_string, const char **end)
240{
241 char size;
242
243 while ('\0' != *utf8_string) {
246 if (end != nullptr) {
247 *end = utf8_string;
248 }
249 return FALSE;
250 }
251 utf8_string += size;
252 }
253
254 if (end != nullptr) {
255 *end = utf8_string;
256 }
257
258 return TRUE;
259}
260
261/************************************************************************/
270 const char **end)
271{
272 unsigned char size;
273
274 while ('\0' != *utf8_string) {
276
278 if (end != nullptr) {
279 *end = utf8_string;
280 }
281 return FALSE;
282 }
283
284 if (size > byte_len) {
285 if (end != nullptr) {
286 *end = utf8_string;
287 }
288 return FALSE;
289 } else {
290 byte_len -= size;
291 }
292
293 utf8_string += size;
294 }
295
296 if (end != nullptr) {
297 *end = utf8_string;
298 }
299
300 return TRUE;
301}
302
303/************************************************************************/
311{
312 char *end;
313
314 if (!fc_utf8_validate(utf8_string, (const char **) &end)) {
315 *end = '\0';
316 }
317
318 return utf8_string;
319}
320
321/************************************************************************/
329{
330 char *end;
331
332 if (!fc_utf8_validate_len(utf8_string, byte_len, (const char **) &end)) {
333 *end = '\0';
334 }
335
336 return utf8_string;
337}
338
339/************************************************************************/
347{
348 const char *end;
349 size_t size;
350 char *ret;
351
353 size = end - utf8_string;
354 ret = fc_malloc(size + 1); /* Keep a spot for '\0'. */
356 ret[size] = '\0';
357
358 return ret;
359}
360
361/************************************************************************/
370{
371 if (0 < byte_len) {
372 char copy[byte_len];
373
376 }
377
378 return utf8_string;
379}
380
381/************************************************************************/
388{
389 char *ret;
390 const char *utf8_char;
391 size_t size = 1; /* '\0'. */
392 char char_size;
393
394 /* Check needed size. */
396 while ('\0' != *utf8_char) {
399 /* Normal valid character. */
400 size += char_size;
402 } else {
403 /* Replacement character. */
404 size += sizeof(FC_UTF8_REP_CHAR);
405 /* Find next character. */
406 do {
407 utf8_char++;
408 } while (0 == FC_UTF8_CHAR_SIZE(utf8_char));
409 }
410 }
411
412 /* Do the allocation. */
413 ret = fc_malloc(size);
415
416 return ret;
417}
418
419/************************************************************************/
426size_t fc_utf8_strlen(const char *utf8_string)
427{
428 size_t len;
429
430 for (len = 0; '\0' != *utf8_string; len++) {
432 }
433
434 return len;
435}
436
/************************************************************************//**
  Copy 'src' into the 'n'-byte buffer 'dest' with
  base_fc_utf8_strlcpy_trunc(), after asserting that 'n' is greater
  than 0. The value of that helper is returned unchanged; -1 is the value
  handed to fc_assert_ret_val() for the failed assertion.
****************************************************************************/
size_t fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
{
  size_t result;

  fc_assert_ret_val(0 < n, -1);
  result = base_fc_utf8_strlcpy_trunc(dest, src, n);

  return result;
}
450
/************************************************************************//**
  Copy 'src' into the 'n'-byte buffer 'dest' with
  base_fc_utf8_strlcpy_rep(), after asserting that 'n' is greater than 0.
  The value of that helper is returned unchanged; -1 is the value handed
  to fc_assert_ret_val() for the failed assertion.
****************************************************************************/
size_t fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
{
  size_t result;

  fc_assert_ret_val(0 < n, -1);
  result = base_fc_utf8_strlcpy_rep(dest, src, n);

  return result;
}
464
/************************************************************************//**
  Append 'src' to the string already stored in 'dest', where 'n' is the
  total size of the 'dest' buffer. The copy itself is done by
  base_fc_utf8_strlcpy_trunc() on the space following the current
  content.

  Asserts that 'n' is greater than 0 and that the current length of 'dest'
  is below 'n' (-1 being the value handed to fc_assert_ret_val()).
  Returns the initial length of 'dest' plus the value of the helper.
****************************************************************************/
size_t fc_utf8_strlcat_trunc(char *dest, const char *src, size_t n)
{
  size_t len;
  char *tail;

  fc_assert_ret_val(0 < n, -1);

  len = strlen(dest);
  fc_assert_ret_val(len < n, -1);

  /* Continue right on the current terminator of 'dest'. */
  tail = dest + len;

  return len + base_fc_utf8_strlcpy_trunc(tail, src, n - len);
}
486
/************************************************************************//**
  Append 'src' to the string already stored in 'dest', where 'n' is the
  total size of the 'dest' buffer. The copy itself is done by
  base_fc_utf8_strlcpy_rep() on the space following the current content.

  Asserts that 'n' is greater than 0 and that the current length of 'dest'
  is below 'n' (-1 being the value handed to fc_assert_ret_val()).
  Returns the initial length of 'dest' plus the value of the helper.
****************************************************************************/
size_t fc_utf8_strlcat_rep(char *dest, const char *src, size_t n)
{
  size_t len;
  char *tail;

  fc_assert_ret_val(0 < n, -1);

  len = strlen(dest);
  fc_assert_ret_val(len < n, -1);

  /* Continue right on the current terminator of 'dest'. */
  tail = dest + len;

  return len + base_fc_utf8_strlcpy_rep(tail, src, n - len);
}
508
/************************************************************************//**
  Variadic front end of fc_utf8_vsnprintf_trunc(): the arguments following
  'format' are collected in a va_list and forwarded together with 'str',
  'n' and 'format'. Returns what fc_utf8_vsnprintf_trunc() returns.
****************************************************************************/
int fc_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  int written;

  va_start(ap, format);
  written = fc_utf8_vsnprintf_trunc(str, n, format, ap);
  va_end(ap);

  return written;
}
527
/************************************************************************//**
  Variadic front end of fc_utf8_vsnprintf_rep(): the arguments following
  'format' are collected in a va_list and forwarded together with 'str',
  'n' and 'format'. Returns what fc_utf8_vsnprintf_rep() returns.
****************************************************************************/
int fc_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  int written;

  va_start(ap, format);
  written = fc_utf8_vsnprintf_rep(str, n, format, ap);
  va_end(ap);

  return written;
}
546
/************************************************************************//**
  Format into 'str' (a buffer of 'n' bytes, 'n' greater than 0) with
  fc_vsnprintf(), then cut the result at the position where
  fc_utf8_validate() stops if it does not accept the whole string.

  Returns the value of fc_vsnprintf() when the string was accepted.
  Otherwise returns -1 if fc_vsnprintf() returned -1, else the byte length
  of the part that was kept.
****************************************************************************/
int fc_utf8_vsnprintf_trunc(char *str, size_t n, const char *format,
                            va_list args)
{
  const char *stop;
  int ret;

  fc_assert_ret_val(0 < n, -1);

  ret = fc_vsnprintf(str, n, format, args);
  if (!fc_utf8_validate(str, &stop)) {
    /* Truncate at last valid UTF-8 character. */
    str[stop - str] = '\0';
    return (-1 == ret ? -1 : stop - str);
  }

  /* Already valid UTF-8. */
  return ret;
}
572
/************************************************************************//**
  Format into 'str' (a buffer of 'n' bytes, 'n' greater than 0) with
  fc_vsnprintf(). If fc_utf8_validate() does not accept the whole result,
  the part starting where it stopped is handed to
  fc_utf8_validate_rep_len() together with the buffer space left from
  that position.

  Returns the value of fc_vsnprintf() when the string was accepted.
  Otherwise returns -1 if fc_vsnprintf() returned -1, else strlen(str)
  after the repair.
****************************************************************************/
int fc_utf8_vsnprintf_rep(char *str, size_t n, const char *format,
                          va_list args)
{
  const char *stop;
  size_t kept;
  int ret;

  fc_assert_ret_val(0 < n, -1);

  ret = fc_vsnprintf(str, n, format, args);
  if (fc_utf8_validate(str, &stop)) {
    /* Already valid UTF-8. */
    return ret;
  }

  /* Repair everything from the first rejected position onwards. */
  kept = stop - str;
  (void) fc_utf8_validate_rep_len(str + kept, n - kept);

  return ((-1 == ret) ? -1 : (int)strlen(str));
}
597
/************************************************************************//**
  Append formatted text to the string already stored in 'str', where 'n'
  is the total size of the 'str' buffer. The formatting is done by
  fc_utf8_vsnprintf_trunc() on the space following the current content.

  Asserts that 'n' is greater than 0 and that the current length of 'str'
  is below 'n' (-1 being the value handed to fc_assert_ret_val()).
  Returns -1 if the helper returned -1, else the helper's value plus the
  initial length of 'str'.
****************************************************************************/
int cat_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  size_t len;
  int ret;

  fc_assert_ret_val(0 < n, -1);

  len = strlen(str);
  fc_assert_ret_val(len < n, -1);

  va_start(ap, format);
  ret = fc_utf8_vsnprintf_trunc(str + len, n - len, format, ap);
  va_end(ap);

  if (-1 == ret) {
    return -1;
  }

  return (int)(ret + len);
}
625
/************************************************************************//**
  Append formatted text to the string already stored in 'str', where 'n'
  is the total size of the 'str' buffer. The formatting is done by
  fc_utf8_vsnprintf_rep() on the space following the current content.

  Asserts that 'n' is greater than 0 and that the current length of 'str'
  is below 'n' (-1 being the value handed to fc_assert_ret_val()).
  Returns -1 if the helper returned -1, else the helper's value plus the
  initial length of 'str'.
****************************************************************************/
int cat_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  size_t len;
  int ret;

  fc_assert_ret_val(0 < n, -1);

  len = strlen(str);
  fc_assert_ret_val(len < n, -1);

  va_start(ap, format);
  ret = fc_utf8_vsnprintf_rep(str + len, n - len, format, ap);
  va_end(ap);

  if (-1 == ret) {
    return -1;
  }

  return (int)(ret + len);
}
#define str
Definition astring.c:76
#define n
Definition astring.c:77
char * incite_cost
Definition comments.c:76
#define FC_UTF8_CHAR_SIZE(utf8_char)
Definition fc_utf8.c:79
int fc_utf8_snprintf_rep(char *str, size_t n, const char *format,...)
Definition fc_utf8.c:535
size_t fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
Definition fc_utf8.c:458
size_t fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
Definition fc_utf8.c:444
static size_t base_fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
Definition fc_utf8.c:110
static bool base_fc_utf8_char_validate(const char *utf8_char, char size)
Definition fc_utf8.c:89
#define FC_UTF8_REP_CHAR
Definition fc_utf8.c:82
char * fc_utf8_validate_rep_len(char *utf8_string, size_t byte_len)
Definition fc_utf8.c:369
bool fc_utf8_char_validate(const char *utf8_char)
Definition fc_utf8.c:193
size_t fc_utf8_strlcat_rep(char *dest, const char *src, size_t n)
Definition fc_utf8.c:497
const char * fc_utf8_find_prev_char(const char *utf8_char, const char *utf8_string)
Definition fc_utf8.c:220
static const char fc_utf8_char_size[256]
Definition fc_utf8.c:56
char * fc_utf8_validate_trunc(char *utf8_string)
Definition fc_utf8.c:310
char * fc_utf8_validate_trunc_dup(const char *utf8_string)
Definition fc_utf8.c:346
int fc_utf8_vsnprintf_trunc(char *str, size_t n, const char *format, va_list args)
Definition fc_utf8.c:554
int fc_utf8_vsnprintf_rep(char *str, size_t n, const char *format, va_list args)
Definition fc_utf8.c:580
size_t fc_utf8_strlcat_trunc(char *dest, const char *src, size_t n)
Definition fc_utf8.c:475
char * fc_utf8_validate_rep_dup(const char *utf8_string)
Definition fc_utf8.c:387
const char * fc_utf8_find_next_char(const char *utf8_char)
Definition fc_utf8.c:204
bool fc_utf8_validate_len(const char *utf8_string, size_t byte_len, const char **end)
Definition fc_utf8.c:269
static size_t base_fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
Definition fc_utf8.c:129
size_t fc_utf8_strlen(const char *utf8_string)
Definition fc_utf8.c:426
int fc_utf8_snprintf_trunc(char *str, size_t n, const char *format,...)
Definition fc_utf8.c:516
int cat_utf8_snprintf_rep(char *str, size_t n, const char *format,...)
Definition fc_utf8.c:636
char * fc_utf8_validate_trunc_len(char *utf8_string, size_t byte_len)
Definition fc_utf8.c:328
const char fc_utf8_skip[256]
Definition fc_utf8.c:31
int cat_utf8_snprintf_trunc(char *str, size_t n, const char *format,...)
Definition fc_utf8.c:608
bool fc_utf8_validate(const char *utf8_string, const char **end)
Definition fc_utf8.c:239
#define fc_ut8_next_char(utf8_char)
Definition fc_utf8.h:42
#define fc_assert(condition)
Definition log.h:177
#define fc_assert_ret_val(condition, val)
Definition log.h:195
#define fc_malloc(sz)
Definition mem.h:34
int len
Definition packhand.c:127
size_t size
Definition specvec.h:72
size_t fc_strlcpy(char *dest, const char *src, size_t n)
Definition support.c:777
int fc_vsnprintf(char *str, size_t n, const char *format, va_list ap)
Definition support.c:886
#define TRUE
Definition support.h:46
#define FALSE
Definition support.h:47