2 Unix SMB/CIFS implementation.
3 minimal iconv implementation
4 Copyright (C) Andrew Tridgell 2001
5 Copyright (C) Jelmer Vernooij 2002
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
30 #ifdef static_decl_charset
37 * @brief Samba wrapper/stub for iconv character set conversion.
39 * iconv is the XPG2 interface for converting between character
40 * encodings. This file provides a Samba wrapper around it, and also
41 * a simple reimplementation that is used if the system does not
44 * Samba only works with encodings that are supersets of ASCII: ascii
45 * characters like whitespace can be tested for directly, multibyte
46 * sequences start with a byte with the high bit set, and strings are
47 * terminated by a nul byte.
49 * Note that the only function provided by iconv is conversion between
50 * characters. It doesn't directly support operations like
51 * uppercasing or comparison. We have to convert to UTF-16LE and
54 * @sa Samba Developers Guide
57 static size_t ascii_pull (void *,const char **, size_t *, char **, size_t *);
58 static size_t ascii_push (void *,const char **, size_t *, char **, size_t *);
59 static size_t latin1_pull(void *,const char **, size_t *, char **, size_t *);
60 static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
61 static size_t utf8_pull (void *,const char **, size_t *, char **, size_t *);
62 static size_t utf8_push (void *,const char **, size_t *, char **, size_t *);
63 static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
64 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
65 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
66 static size_t iconv_copy (void *,const char **, size_t *, char **, size_t *);
67 static size_t iconv_swab (void *,const char **, size_t *, char **, size_t *);
69 static const struct charset_functions builtin_functions[] = {
70 /* windows is closest to UTF-16 */
71 {"UCS-2LE", iconv_copy, iconv_copy},
72 {"UTF-16LE", iconv_copy, iconv_copy},
73 {"UCS-2BE", iconv_swab, iconv_swab},
74 {"UTF-16BE", iconv_swab, iconv_swab},
76 /* we include the UTF-8 alias to cope with differing locale settings */
77 {"UTF8", utf8_pull, utf8_push},
78 {"UTF-8", utf8_pull, utf8_push},
80 /* this handles the munging needed for String2Key */
81 {"UTF16_MUNGED", utf16_munged_pull, iconv_copy},
83 {"ASCII", ascii_pull, ascii_push},
84 {"646", ascii_pull, ascii_push},
85 {"ISO-8859-1", latin1_pull, latin1_push},
86 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
89 static struct charset_functions *charsets = NULL;
91 static struct charset_functions *find_charset_functions(const char *name)
93 struct charset_functions *c;
95 /* Check whether we already have this charset... */
96 for (c = charsets; c != NULL; c = c->next) {
97 if(strcasecmp(c->name, name) == 0) {
106 bool smb_register_charset(const struct charset_functions *funcs_in)
108 struct charset_functions *funcs;
110 DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name));
111 /* Check whether we already have this charset... */
112 if (find_charset_functions(funcs_in->name)) {
113 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name));
117 funcs = talloc(NULL, struct charset_functions);
119 DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name));
124 funcs->next = funcs->prev = NULL;
125 DEBUG(5, ("Registered charset %s\n", funcs->name));
126 DLIST_ADD(charsets, funcs);
130 static void lazy_initialize_iconv(void)
132 #ifdef static_init_charset
133 static bool initialized = false;
142 #ifdef HAVE_NATIVE_ICONV
143 /* if there was an error then reset the internal state,
144 this ensures that we don't have a shift state remaining for
145 character sets like SJIS */
146 static size_t sys_iconv(void *cd,
147 const char **inbuf, size_t *inbytesleft,
148 char **outbuf, size_t *outbytesleft)
150 size_t ret = iconv((iconv_t)cd,
151 discard_const_p(char *, inbuf), inbytesleft,
152 outbuf, outbytesleft);
153 if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
159 * This is a simple portable iconv() implementation.
161 * It only knows about a very small number of character sets - just
162 * enough that Samba works on systems that don't have iconv.
164 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
165 const char **inbuf, size_t *inbytesleft,
166 char **outbuf, size_t *outbytesleft)
168 /* in many cases we can go direct */
170 return cd->direct(cd->cd_direct,
171 inbuf, inbytesleft, outbuf, outbytesleft);
174 /* otherwise we have to do it chunks at a time */
176 #ifndef SMB_ICONV_BUFSIZE
177 #define SMB_ICONV_BUFSIZE 2048
183 #if _SAMBA_BUILD_ == 3
184 mem_ctx = talloc_tos();
188 cvtbuf = talloc_array(mem_ctx, char, SMB_ICONV_BUFSIZE);
194 while (*inbytesleft > 0) {
195 char *bufp1 = cvtbuf;
196 const char *bufp2 = cvtbuf;
197 int saved_errno = errno;
198 bool pull_failed = false;
199 bufsize = SMB_ICONV_BUFSIZE;
201 if (cd->pull(cd->cd_pull,
202 inbuf, inbytesleft, &bufp1, &bufsize) == -1
208 bufsize = SMB_ICONV_BUFSIZE - bufsize;
210 if (cd->push(cd->cd_push,
212 outbuf, outbytesleft) == -1) {
215 } else if (pull_failed) {
216 /* We want the pull errno if possible */
227 static bool is_utf16(const char *name)
229 return strcasecmp(name, "UCS-2LE") == 0 ||
230 strcasecmp(name, "UTF-16LE") == 0;
233 static int smb_iconv_t_destructor(smb_iconv_t hwd)
235 #ifdef HAVE_NATIVE_ICONV
236 if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
237 iconv_close(hwd->cd_pull);
238 if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
239 iconv_close(hwd->cd_push);
240 if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
241 iconv_close(hwd->cd_direct);
247 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
248 const char *fromcode, bool native_iconv)
251 const struct charset_functions *from=NULL, *to=NULL;
254 lazy_initialize_iconv();
256 ret = (smb_iconv_t)talloc_named(mem_ctx,
258 "iconv(%s,%s)", tocode, fromcode);
261 return (smb_iconv_t)-1;
263 memset(ret, 0, sizeof(*ret));
264 talloc_set_destructor(ret, smb_iconv_t_destructor);
266 /* check for the simplest null conversion */
267 if (strcmp(fromcode, tocode) == 0) {
268 ret->direct = iconv_copy;
272 for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
273 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
274 from = &builtin_functions[i];
276 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
277 to = &builtin_functions[i];
282 for (from=charsets; from; from=from->next) {
283 if (strcasecmp(from->name, fromcode) == 0) break;
288 for (to=charsets; to; to=to->next) {
289 if (strcasecmp(to->name, tocode) == 0) break;
293 #ifdef HAVE_NATIVE_ICONV
294 if ((!from || !to) && !native_iconv) {
298 ret->pull = sys_iconv;
299 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
300 if (ret->cd_pull == (iconv_t)-1)
301 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
302 if (ret->cd_pull == (iconv_t)-1) goto failed;
306 ret->push = sys_iconv;
307 ret->cd_push = iconv_open(tocode, "UTF-16LE");
308 if (ret->cd_push == (iconv_t)-1)
309 ret->cd_push = iconv_open(tocode, "UCS-2LE");
310 if (ret->cd_push == (iconv_t)-1) goto failed;
318 /* check for conversion to/from ucs2 */
319 if (is_utf16(fromcode) && to) {
320 ret->direct = to->push;
323 if (is_utf16(tocode) && from) {
324 ret->direct = from->pull;
328 #ifdef HAVE_NATIVE_ICONV
329 if (is_utf16(fromcode)) {
330 ret->direct = sys_iconv;
331 ret->cd_direct = ret->cd_push;
335 if (is_utf16(tocode)) {
336 ret->direct = sys_iconv;
337 ret->cd_direct = ret->cd_pull;
343 /* the general case has to go via a buffer */
344 if (!ret->pull) ret->pull = from->pull;
345 if (!ret->push) ret->push = to->push;
351 return (smb_iconv_t)-1;
355 simple iconv_open() wrapper
357 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
359 return smb_iconv_open_ex(NULL, tocode, fromcode, true);
363 simple iconv_close() wrapper
365 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
372 /**********************************************************************
373 the following functions implement the builtin character sets in Samba
374 and also the "test" character sets that are designed to test
375 multi-byte character set support for english users
376 ***********************************************************************/
379 this takes an ASCII sequence and produces a UTF16 sequence
381 The first 128 codepoints of ASCII match the first 128 codepoints
382 of unicode, and so can be put into the first byte of UTF16LE
386 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
387 char **outbuf, size_t *outbytesleft)
389 while (*inbytesleft >= 1 && *outbytesleft >= 2) {
390 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0]) {
391 /* If this is multi-byte, then it isn't legal ASCII */
395 (*outbuf)[0] = (*inbuf)[0];
398 (*outbytesleft) -= 2;
403 if (*inbytesleft > 0) {
412 this takes a UTF16 sequence and produces an ASCII sequence
414 The first 128 codepoints of ASCII match the first 128 codepoints
415 of unicode, and so can be read directly from the first byte of UTF16LE
418 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
419 char **outbuf, size_t *outbytesleft)
423 while (*inbytesleft >= 2 && *outbytesleft >= 1) {
424 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0] ||
426 /* If this is multi-byte, then it isn't legal ASCII */
430 (*outbuf)[0] = (*inbuf)[0];
432 (*outbytesleft) -= 1;
437 if (*inbytesleft == 1) {
442 if (*inbytesleft > 1) {
451 this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence
453 The first 256 codepoints of latin1 matches the first 256 codepoints
454 of unicode, and so can be put into the first byte of UTF16LE
457 static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
458 char **outbuf, size_t *outbytesleft)
460 while (*inbytesleft >= 1 && *outbytesleft >= 2) {
461 (*outbuf)[0] = (*inbuf)[0];
464 (*outbytesleft) -= 2;
469 if (*inbytesleft > 0) {
478 this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence
480 The first 256 codepoints of latin1 matches the first 256 codepoints
481 of unicode, and so can be read directly from the first byte of UTF16LE
484 static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
485 char **outbuf, size_t *outbytesleft)
489 while (*inbytesleft >= 2 && *outbytesleft >= 1) {
490 (*outbuf)[0] = (*inbuf)[0];
491 if ((*inbuf)[1] != 0) {
492 /* If this is multi-byte, then it isn't legal latin1 */
497 (*outbytesleft) -= 1;
502 if (*inbytesleft == 1) {
507 if (*inbytesleft > 1) {
515 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
516 char **outbuf, size_t *outbytesleft)
518 while (*inbytesleft >= 1 && *outbytesleft >= 2) {
521 if ((*inbuf)[0] != '@') {
522 /* seven bit ascii case */
523 (*outbuf)[0] = (*inbuf)[0];
526 (*outbytesleft) -= 2;
531 /* it's a hex character */
532 if (*inbytesleft < 5) {
537 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) {
542 (*outbuf)[0] = v&0xff;
545 (*outbytesleft) -= 2;
550 if (*inbytesleft > 0) {
558 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
559 char **outbuf, size_t *outbytesleft)
561 while (*inbytesleft >= 2 && *outbytesleft >= 1) {
564 if ((*inbuf)[1] == 0 &&
565 ((*inbuf)[0] & 0x80) == 0 &&
566 (*inbuf)[0] != '@') {
567 (*outbuf)[0] = (*inbuf)[0];
569 (*outbytesleft) -= 1;
574 if (*outbytesleft < 5) {
578 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
579 memcpy(*outbuf, buf, 5);
581 (*outbytesleft) -= 5;
586 if (*inbytesleft == 1) {
591 if (*inbytesleft > 1) {
599 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
600 char **outbuf, size_t *outbytesleft)
604 n = MIN(*inbytesleft, *outbytesleft);
606 swab(*inbuf, *outbuf, (n&~1));
612 (*outbytesleft) -= n;
616 if (*inbytesleft > 0) {
625 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
626 char **outbuf, size_t *outbytesleft)
630 n = MIN(*inbytesleft, *outbytesleft);
632 memmove(*outbuf, *inbuf, n);
635 (*outbytesleft) -= n;
639 if (*inbytesleft > 0) {
648 this takes a UTF8 sequence and produces a UTF16 sequence
650 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
651 char **outbuf, size_t *outbytesleft)
653 size_t in_left=*inbytesleft, out_left=*outbytesleft;
654 const uint8_t *c = (const uint8_t *)*inbuf;
655 uint8_t *uc = (uint8_t *)*outbuf;
657 while (in_left >= 1 && out_left >= 2) {
658 if ((c[0] & 0x80) == 0) {
668 if ((c[0] & 0xe0) == 0xc0) {
670 (c[1] & 0xc0) != 0x80) {
674 uc[1] = (c[0]>>2) & 0x7;
675 uc[0] = (c[0]<<6) | (c[1]&0x3f);
683 if ((c[0] & 0xf0) == 0xe0) {
685 (c[1] & 0xc0) != 0x80 ||
686 (c[2] & 0xc0) != 0x80) {
690 uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
691 uc[0] = (c[1]<<6) | (c[2]&0x3f);
699 if ((c[0] & 0xf8) == 0xf0) {
700 unsigned int codepoint;
702 (c[1] & 0xc0) != 0x80 ||
703 (c[2] & 0xc0) != 0x80 ||
704 (c[3] & 0xc0) != 0x80) {
713 if (codepoint < 0x10000) {
714 /* accept UTF-8 characters that are not
715 minimally packed, but pack the result */
716 uc[0] = (codepoint & 0xFF);
717 uc[1] = (codepoint >> 8);
725 codepoint -= 0x10000;
732 uc[0] = (codepoint>>10) & 0xFF;
733 uc[1] = (codepoint>>18) | 0xd8;
734 uc[2] = codepoint & 0xFF;
735 uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
743 /* we don't handle 5 byte sequences */
753 *inbytesleft = in_left;
754 *outbytesleft = out_left;
755 *inbuf = (const char *)c;
756 *outbuf = (char *)uc;
760 *inbytesleft = in_left;
761 *outbytesleft = out_left;
762 *inbuf = (const char *)c;
763 *outbuf = (char *)uc;
769 this takes a UTF16 sequence and produces a UTF8 sequence
771 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
772 char **outbuf, size_t *outbytesleft)
774 size_t in_left=*inbytesleft, out_left=*outbytesleft;
775 uint8_t *c = (uint8_t *)*outbuf;
776 const uint8_t *uc = (const uint8_t *)*inbuf;
778 while (in_left >= 2 && out_left >= 1) {
779 unsigned int codepoint;
781 if (uc[1] == 0 && !(uc[0] & 0x80)) {
791 if ((uc[1]&0xf8) == 0) {
792 /* next simplest case */
797 c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
798 c[1] = 0x80 | (uc[0] & 0x3f);
806 if ((uc[1] & 0xfc) == 0xdc) {
807 /* it's the second part of a 4 byte sequence. Illegal */
816 if ((uc[1] & 0xfc) != 0xd8) {
817 codepoint = uc[0] | (uc[1]<<8);
822 c[0] = 0xe0 | (codepoint >> 12);
823 c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
824 c[2] = 0x80 | (codepoint & 0x3f);
833 /* it's the first part of a 4 byte sequence */
838 if ((uc[3] & 0xfc) != 0xdc) {
842 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
843 (uc[0]<<10) | ((uc[1] & 0x3)<<18));
849 c[0] = 0xf0 | (codepoint >> 18);
850 c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
851 c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
852 c[3] = 0x80 | (codepoint & 0x3f);
870 *inbytesleft = in_left;
871 *outbytesleft = out_left;
872 *inbuf = (const char *)uc;
878 *inbytesleft = in_left;
879 *outbytesleft = out_left;
880 *inbuf = (const char *)uc;
887 this takes a UTF16 munged sequence, modifies it according to the
888 string2key rules, and produces a UTF16 sequence
892 1) any 0x0000 characters are mapped to 0x0001
894 2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
895 without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
896 U+FFFD (REPLACEMENT CHARACTER).
898 3) the same for any low surrogate that was not preceded by a high surrogate.
901 static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
902 char **outbuf, size_t *outbytesleft)
904 size_t in_left=*inbytesleft, out_left=*outbytesleft;
905 uint8_t *c = (uint8_t *)*outbuf;
906 const uint8_t *uc = (const uint8_t *)*inbuf;
908 while (in_left >= 2 && out_left >= 2) {
909 unsigned int codepoint = uc[0] | (uc[1]<<8);
911 if (codepoint == 0) {
915 if ((codepoint & 0xfc00) == 0xd800) {
916 /* a high surrogate */
917 unsigned int codepoint2;
922 codepoint2 = uc[2] | (uc[3]<<8);
923 if ((codepoint2 & 0xfc00) != 0xdc00) {
924 /* high surrogate not followed by low
925 surrogate: convert to 0xfffd */
941 if ((codepoint & 0xfc00) == 0xdc00) {
942 /* low surrogate not preceded by high
943 surrogate: convert to 0xfffd */
948 c[0] = codepoint & 0xFF;
949 c[1] = (codepoint>>8) & 0xFF;
968 *inbytesleft = in_left;
969 *outbytesleft = out_left;
970 *inbuf = (const char *)uc;
976 *inbytesleft = in_left;
977 *outbytesleft = out_left;
978 *inbuf = (const char *)uc;