2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "system/iconv.h"
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
41 struct smb_iconv_convenience {
/* Conversion context: the configurable charset names plus a cache of iconv handles. */
/* name that charset_name() reports for CH_UNIX */
42 const char *unix_charset;
/* name that charset_name() reports for CH_DOS */
43 const char *dos_charset;
/* handle cache indexed as [from][to]; slots are filled on demand by get_conv_handle() */
45 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
50 * Return the name of a charset to give to iconv().
52 static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
/* CH_UTF16 maps to the little-endian encoding name */
55 case CH_UTF16: return "UTF-16LE";
/* the unix and dos names are read from the context rather than being fixed strings */
56 case CH_UNIX: return ic->unix_charset;
57 case CH_DOS: return ic->dos_charset;
58 case CH_UTF8: return "UTF8";
59 case CH_UTF16BE: return "UTF-16BE";
66 close all open iconv conversion descriptors and reset their slots to NULL
68 static int close_iconv_convenience(struct smb_iconv_convenience *data)
/* Registered as the talloc destructor by smb_iconv_convenience_init(); visits every [c1][c2] slot. */
71 for (c1=0;c1<NUM_CHARSETS;c1++) {
72 for (c2=0;c2<NUM_CHARSETS;c2++) {
73 if (data->conv_handles[c1][c2] != NULL) {
/* (smb_iconv_t)-1 is the failed-open marker stored by get_conv_handle(); it is not passed to smb_iconv_close() */
74 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
75 smb_iconv_close(data->conv_handles[c1][c2]);
/* back to NULL, the "not opened yet" state that get_conv_handle() tests for */
77 data->conv_handles[c1][c2] = NULL;
85 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx,
86 const char *dos_charset,
87 const char *unix_charset,
/* Builds a conversion context on mem_ctx. talloc_zero gives zeroed memory, so the handle cache starts empty. */
90 struct smb_iconv_convenience *ret = talloc_zero(mem_ctx,
91 struct smb_iconv_convenience);
/* close_iconv_convenience() is attached as the destructor of ret */
97 talloc_set_destructor(ret, close_iconv_convenience);
/* private copies of both names, allocated as children of ret */
/* NOTE(review): no NULL check on these talloc_strdup results appears here - confirm */
99 ret->dos_charset = talloc_strdup(ret, dos_charset);
100 ret->unix_charset = talloc_strdup(ret, unix_charset);
/* later handed to smb_iconv_open_ex() by get_conv_handle() */
101 ret->native_iconv = native_iconv;
107 on-demand initialisation of conversion handles
109 static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
110 charset_t from, charset_t to)
/* function-local static flag gating the locale setup below; it is shared by all contexts */
113 static bool initialised;
115 if (initialised == false) {
119 /* we set back the locale to C to get ASCII-compatible
120 toupper/lower functions. For now we do not need
121 any other POSIX localisations anyway. When we
122 should really need localized string functions one
123 day we need to write our own ascii_tolower etc.
125 setlocale(LC_ALL, "C");
/* Cache hit: any non-zero slot is returned as-is. That includes a stored
   (smb_iconv_t)-1, so a failed open is not retried on later calls. */
129 if (ic->conv_handles[from][to]) {
130 return ic->conv_handles[from][to];
/* n1 is the source charset name, n2 the destination charset name */
133 n1 = charset_name(ic, from);
134 n2 = charset_name(ic, to);
/* argument order: destination name (n2) comes before source name (n1) */
136 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
139 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
/* fallback applies only when the DOS charset is involved and is not already ASCII */
140 if ((from == CH_DOS || to == CH_DOS) &&
141 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
142 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
143 charset_name(ic, CH_DOS)));
/* the override is stored in the context, so every later CH_DOS lookup sees "ASCII" too */
144 ic->dos_charset = "ASCII";
/* re-resolve both names now that dos_charset has changed, then try the open once more */
146 n1 = charset_name(ic, from);
147 n2 = charset_name(ic, to);
149 ic->conv_handles[from][to] =
150 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
/* may still be (smb_iconv_t)-1; callers in this file check for that value */
154 return ic->conv_handles[from][to];
159 * Convert string from one encoding to another, making error checking etc
161 * @param src pointer to source string (multibyte or singlebyte)
162 * @param srclen length of the source string in bytes
163 * @param dest pointer to destination string (multibyte or singlebyte)
164 * @param destlen maximal length allowed for string
165 * @returns the number of bytes occupied in the destination
167 _PUBLIC_ ssize_t convert_string(struct smb_iconv_convenience *ic,
168 charset_t from, charset_t to,
169 void const *src, size_t srclen,
170 void *dest, size_t destlen)
174 const char* inbuf = (const char*)src;
175 char* outbuf = (char*)dest;
176 smb_iconv_t descriptor;
/* srclen of -1 means src is NUL-terminated; the terminator is included in the computed length */
178 if (srclen == (size_t)-1)
179 srclen = strlen(inbuf)+1;
181 descriptor = get_conv_handle(ic, from, to);
/* no usable handle: the bytes are copied unconverted, limited to the smaller of the two lengths */
183 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
184 /* conversion not supported, use as is */
185 size_t len = MIN(srclen,destlen);
186 memcpy(dest,src,len);
/* smb_iconv advances inbuf/outbuf and updates the remaining-length counters i_len/o_len */
192 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
193 if(retval==(size_t)-1) {
/* reason is a human-readable label for the kind of failure */
197 reason="Incomplete multibyte sequence";
200 reason="No more room";
/* the CH_UNIX log line carries one extra '%s' field that the other variant omits */
201 if (from == CH_UNIX) {
202 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
203 charset_name(ic, from), charset_name(ic, to),
204 (int)srclen, (int)destlen,
207 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
208 charset_name(ic, from), charset_name(ic, to),
209 (int)srclen, (int)destlen));
213 reason="Illegal multibyte sequence";
216 /* smb_panic(reason); */
/* result: destlen minus o_len, the output-space counter that was handed to smb_iconv() */
218 return destlen-o_len;
221 _PUBLIC_ ssize_t convert_string_talloc_descriptor(TALLOC_CTX *ctx, smb_iconv_t descriptor, void const *src, size_t srclen, void **dest)
/* Converts src through an already-open descriptor into a buffer obtained from talloc_realloc on ctx. */
223 size_t i_len, o_len, destlen;
225 const char *inbuf = (const char *)src;
230 /* it is _very_ rare that a conversion increases the size by
235 destlen = 2 + (destlen*3);
/* resize the output buffer to the new size estimate */
236 ob = talloc_realloc(ctx, outbuf, char, destlen);
/* NOTE(review): the message names convert_string_talloc although it is emitted from this function */
238 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
245 /* we give iconv 2 less bytes to allow us to terminate at the
249 retval = smb_iconv(descriptor,
252 if(retval == (size_t)-1) {
253 const char *reason="unknown error";
256 reason="Incomplete multibyte sequence";
261 reason="Illegal multibyte sequence";
/* NOTE(review): inbuf is printed with %s, which presumes NUL-terminated single-byte text - confirm for UTF-16 sources */
264 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
/* converted length: the space offered to iconv (destlen-2) minus what it left unused (o_len) */
269 destlen = (destlen-2) - o_len;
271 /* guarantee null termination in all charsets */
/* SSVAL presumably stores a 16-bit zero here, filling the two bytes held back from iconv - confirm */
272 SSVAL(ob, destlen, 0);
280 * Convert between character sets, allocating a new buffer using talloc for the result.
282 * @param srclen length of source buffer.
283 * @param dest always set at least to NULL
284 * @note -1 is not accepted for srclen.
286 * @returns Size in bytes of the converted string; or -1 in case of error.
289 _PUBLIC_ ssize_t convert_string_talloc(TALLOC_CTX *ctx,
290 struct smb_iconv_convenience *ic,
291 charset_t from, charset_t to,
292 void const *src, size_t srclen,
295 smb_iconv_t descriptor;
/* input screening: a NULL source, the (size_t)-1 length marker and a zero length are all caught here */
299 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
302 descriptor = get_conv_handle(ic, from, to);
/* unlike convert_string(), a missing handle is reported instead of falling back to a raw copy */
304 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
305 /* conversion not supported, return -1*/
306 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
307 charset_name(ic, from),
308 charset_name(ic, to)));
/* allocation and the conversion itself are done by the descriptor-based variant */
312 return convert_string_talloc_descriptor(ctx, descriptor, src, srclen, dest);
316 * Copy a string from a char* unix src to a dos codepage string destination.
318 * @return the number of bytes occupied by the string in the destination.
320 * @param flags can include
322 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
323 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
326 * @param dest_len the maximum length in bytes allowed in the
327 * destination. If @p dest_len is -1 then no maximum is used.
329 static ssize_t push_ascii(struct smb_iconv_convenience *ic,
330 void *dest, const char *src, size_t dest_len, int flags)
/* STR_UPPER: upper-case into a temporary talloc copy, then recurse with that flag cleared */
335 if (flags & STR_UPPER) {
336 char *tmpbuf = strupper_talloc(NULL, src);
337 if (tmpbuf == NULL) {
340 ret = push_ascii(ic, dest, tmpbuf, dest_len, flags & ~STR_UPPER);
/* strlen gives the length without the terminating NUL */
345 src_len = strlen(src);
/* either terminate flag triggers the adjustment that follows this test */
347 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
350 return convert_string(ic, CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
354 * Copy a string from a unix char* src to an ASCII destination,
355 * allocating a buffer using talloc().
357 * @param dest always set at least to NULL
359 * @returns The number of bytes occupied by the string in the destination
360 * or -1 in case of error.
362 _PUBLIC_ ssize_t push_ascii_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src)
/* +1 so that the terminating NUL is converted together with the text */
364 size_t src_len = strlen(src)+1;
/* CH_UNIX to CH_DOS; ctx is passed on as the allocation context */
366 return convert_string_talloc(ctx, ic, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
371 * Copy a string from a dos codepage source to a unix char* destination.
373 * The resulting string in "dest" is always null terminated.
375 * @param flags can have:
377 * <dt>STR_TERMINATE</dt>
378 * <dd>STR_TERMINATE means the string in @p src
379 * is null terminated, and src_len is ignored.</dd>
382 * @param src_len is the length of the source area in bytes.
383 * @returns the number of bytes occupied by the string in @p src.
385 static ssize_t pull_ascii(struct smb_iconv_convenience *ic, char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
/* with a terminate flag the source length is derived from the data itself */
389 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
/* no bound given: measure up to and including the NUL */
390 if (src_len == (size_t)-1) {
391 src_len = strlen((const char *)src) + 1;
/* bound given: look for a NUL only within the first src_len bytes */
393 size_t len = strnlen((const char *)src, src_len);
400 ret = convert_string(ic, CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
/* write the terminator at ret, or at the last byte of dest if the output filled the buffer */
403 dest[MIN(ret, dest_len-1)] = 0;
409 * Copy a string from a char* src to a unicode destination.
411 * @returns the number of bytes occupied by the string in the destination.
413 * @param flags can have:
416 * <dt>STR_TERMINATE <dd>means include the null termination.
417 * <dt>STR_UPPER <dd>means uppercase in the destination.
418 * <dt>STR_NOALIGN <dd>means don't do alignment.
421 * @param dest_len is the maximum length allowed in the
422 * destination. If dest_len is -1 then no maximum is used.
424 static ssize_t push_ucs2(struct smb_iconv_convenience *ic,
425 void *dest, const char *src, size_t dest_len, int flags)
/* length without the terminating NUL */
428 size_t src_len = strlen(src);
/* STR_UPPER: upper-case into a temporary talloc copy, then recurse with that flag cleared */
431 if (flags & STR_UPPER) {
432 char *tmpbuf = strupper_talloc(NULL, src);
433 if (tmpbuf == NULL) {
436 ret = push_ucs2(ic, dest, tmpbuf, dest_len, flags & ~STR_UPPER);
441 if (flags & STR_TERMINATE)
/* alignment case: advance dest by one byte and shrink the remaining space to match */
444 if (ucs2_align(NULL, dest, flags)) {
446 dest = (void *)((char *)dest + 1);
447 if (dest_len) dest_len--;
451 /* ucs2 is always a multiple of 2 bytes */
454 ret = convert_string(ic, CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
/* error check on the convert_string() result */
455 if (ret == (size_t)-1) {
466 * Copy a string from a unix char* src to a UCS2 destination,
467 * allocating a buffer using talloc().
469 * @param dest always set at least to NULL
471 * @returns The number of bytes occupied by the string in the destination
472 * or -1 in case of error.
474 _PUBLIC_ ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, void **dest, const char *src)
/* +1 so that the terminating NUL is converted together with the text */
476 size_t src_len = strlen(src)+1;
/* CH_UNIX to CH_UTF16; ctx is passed on as the allocation context */
478 return convert_string_talloc(ctx, ic, CH_UNIX, CH_UTF16, src, src_len, dest);
483 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
485 * @param dest always set at least to NULL
487 * @returns The number of bytes occupied by the string in the destination
490 _PUBLIC_ ssize_t push_utf8_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src)
/* +1 so that the terminating NUL is converted together with the text */
492 size_t src_len = strlen(src)+1;
/* CH_UNIX to CH_UTF8; ctx is passed on as the allocation context */
494 return convert_string_talloc(ctx, ic, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
498 Copy a string from a ucs2 source to a unix char* destination.
500 STR_TERMINATE means the string in src is null terminated.
501 STR_NOALIGN means don't try to align.
502 if STR_TERMINATE is set then src_len is ignored if it is -1.
503 src_len is the length of the source area in bytes
504 Return the number of bytes occupied by the string in src.
505 The resulting string in "dest" is always null terminated.
508 static size_t pull_ucs2(struct smb_iconv_convenience *ic, char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
/* NOTE(review): the return type is size_t here, while pull_ascii() and pull_string() use ssize_t */
/* alignment case: step over one byte at the start of the source */
512 if (ucs2_align(NULL, src, flags)) {
513 src = (const void *)((const char *)src + 1);
518 if (flags & STR_TERMINATE) {
/* the length comes from the data: unbounded scan when src_len is -1, bounded scan otherwise */
519 if (src_len == (size_t)-1) {
520 src_len = utf16_len(src);
522 src_len = utf16_len_n(src, src_len);
526 /* ucs2 is always a multiple of 2 bytes */
527 if (src_len != (size_t)-1)
530 ret = convert_string(ic, CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
/* write the terminator at ret, or at the last byte of dest if the output filled the buffer */
532 dest[MIN(ret, dest_len-1)] = 0;
538 * Copy a string from an ASCII src to a unix char * destination, allocating a buffer using talloc
540 * @param dest always set at least to NULL
542 * @returns The number of bytes occupied by the string in the destination
545 _PUBLIC_ ssize_t pull_ascii_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src)
/* +1 so that the terminating NUL is converted together with the text */
547 size_t src_len = strlen(src)+1;
/* CH_DOS to CH_UNIX; ctx is passed on as the allocation context */
549 return convert_string_talloc(ctx, ic, CH_DOS, CH_UNIX, src, src_len, (void **)dest);
553 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
555 * @param dest always set at least to NULL
557 * @returns The number of bytes occupied by the string in the destination
560 _PUBLIC_ ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const void *src)
/* no +1 here, unlike the strlen-based siblings; presumably utf16_len() already counts the terminator - confirm */
562 size_t src_len = utf16_len(src);
/* CH_UTF16 to CH_UNIX; ctx is passed on as the allocation context */
564 return convert_string_talloc(ctx, ic, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
568 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
570 * @param dest always set at least to NULL
572 * @returns The number of bytes occupied by the string in the destination
575 _PUBLIC_ ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src)
/* +1 so that the terminating NUL is converted together with the text */
577 size_t src_len = strlen(src)+1;
/* CH_UTF8 to CH_UNIX; ctx is passed on as the allocation context */
579 return convert_string_talloc(ctx, ic, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
583 Copy a string from a char* src to a unicode or ascii
584 dos codepage destination choosing unicode or ascii based on the
585 flags in the SMB buffer starting at base_ptr.
586 Return the number of bytes occupied by the string in the destination.
588 STR_TERMINATE means include the null termination.
589 STR_UPPER means uppercase in the destination.
590 STR_ASCII use ascii even with unicode packet.
591 STR_NOALIGN means don't do alignment.
592 dest_len is the maximum length allowed in the destination. If dest_len
593 is -1 then no maximum is used.
596 _PUBLIC_ ssize_t push_string(struct smb_iconv_convenience *ic,
597 void *dest, const char *src, size_t dest_len, int flags)
/* STR_ASCII is tested first, so it takes precedence when both flags are set */
599 if (flags & STR_ASCII) {
600 return push_ascii(ic, dest, src, dest_len, flags);
601 } else if (flags & STR_UNICODE) {
602 return push_ucs2(ic, dest, src, dest_len, flags);
/* reached when neither encoding flag was supplied */
604 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
611 Copy a string from a unicode or ascii source (depending on
612 the packet flags) to a char* destination.
614 STR_TERMINATE means the string in src is null terminated.
615 STR_UNICODE means to force as unicode.
616 STR_ASCII use ascii even with unicode packet.
617 STR_NOALIGN means don't do alignment.
618 if STR_TERMINATE is set then src_len is ignored if it is -1
619 src_len is the length of the source area in bytes.
620 Return the number of bytes occupied by the string in src.
621 The resulting string in "dest" is always null terminated.
624 _PUBLIC_ ssize_t pull_string(struct smb_iconv_convenience *ic,
625 char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
/* STR_ASCII is tested first, so it takes precedence when both flags are set */
627 if (flags & STR_ASCII) {
628 return pull_ascii(ic, dest, src, dest_len, src_len, flags);
629 } else if (flags & STR_UNICODE) {
630 return pull_ucs2(ic, dest, src, dest_len, src_len, flags);
/* reached when neither encoding flag was supplied */
632 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
639 return the unicode codepoint for the next multi-byte CH_UNIX character
642 also return the number of bytes consumed (which tells the caller
643 how many bytes to skip to get to the next CH_UNIX character)
645 return INVALID_CODEPOINT if the next character cannot be converted
647 _PUBLIC_ codepoint_t next_codepoint(struct smb_iconv_convenience *ic,
648 const char *str, size_t *size)
650 /* it cannot occupy more than 4 bytes in UTF16 format */
652 smb_iconv_t descriptor;
/* fast path: a byte with the high bit clear is returned directly as the codepoint, without iconv */
658 if ((str[0] & 0x80) == 0) {
660 return (codepoint_t)str[0];
663 /* we assume that no multi-byte character can take
664 more than 5 bytes. This is OK as we only
665 support codepoints up to 1M */
/* strnlen caps the input at 5 bytes and also stops at an earlier NUL */
666 ilen_orig = strnlen(str, 5);
669 descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16);
/* NOTE(review): only -1 is tested here, whereas convert_string() also treats a 0 descriptor as unusable */
670 if (descriptor == (smb_iconv_t)-1) {
672 return INVALID_CODEPOINT;
675 /* this looks a little strange, but it is needed to cope
676 with codepoints above 64k */
678 outbuf = (char *)buf;
679 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
/* second attempt, again writing from the start of buf */
682 outbuf = (char *)buf;
683 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
685 /* we didn't convert any bytes */
687 return INVALID_CODEPOINT;
/* input bytes consumed: the capped starting length minus what iconv left unread */
694 *size = ilen_orig - ilen;
/* single 16-bit unit: its value is returned as the codepoint */
697 return (codepoint_t)SVAL(buf, 0);
700 /* decode a 4 byte UTF16 character manually */
/* buf[0..1] form the first 16-bit unit and buf[2..3] the second, low byte first;
   the low 10 bits of each unit are combined (first unit shifted up by 10) and 0x10000 is added */
701 return (codepoint_t)0x10000 +
702 (buf[2] | ((buf[3] & 0x3)<<8) |
703 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
706 /* no other length is valid */
707 return INVALID_CODEPOINT;
711 push a single codepoint into a CH_UNIX string the target string must
712 be able to hold the full character, which is guaranteed if it is at
713 least 5 bytes in size. The caller may pass less than 5 bytes if they
714 are sure the character will fit (for example, you can assume that
715 uppercase/lowercase of a character will not add more than 1 byte)
717 return the number of bytes occupied by the CH_UNIX character, or
720 _PUBLIC_ ssize_t push_codepoint(struct smb_iconv_convenience *ic,
721 char *str, codepoint_t c)
723 smb_iconv_t descriptor;
733 descriptor = get_conv_handle(ic,
735 if (descriptor == (smb_iconv_t)-1) {
744 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
753 buf[0] = (c>>10) & 0xFF;
754 buf[1] = (c>>18) | 0xd8;
756 buf[3] = ((c>>8) & 0x3) | 0xdc;
762 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);