2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001-2011
6 Copyright (C) Andrew Bartlett 2011
7 Copyright (C) Simo Sorce 2001
8 Copyright (C) Martin Pool 2003
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation; either version 3 of the License, or
13 (at your option) any later version.
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 GNU General Public License for more details.
20 You should have received a copy of the GNU General Public License
21 along with this program. If not, see <http://www.gnu.org/licenses/>.
25 #include "system/iconv.h"
30 * @brief Character-set conversion routines built on our iconv.
32 * @note Samba's internal character set (at least in the 3.0 series)
33 * is always the same as the one for the Unix filesystem. It is
34 * <b>not</b> necessarily UTF-8 and may be different on machines that
35 * need i18n filenames to be compatible with Unix software. It does
36 * have to be a superset of ASCII. All multibyte sequences must start
37 * with a byte with the high bit set.
44 * Convert string from one encoding to another, making error checking etc
45 * Slow path version - uses (slow) iconv.
47 * @param src pointer to source string (multibyte or singlebyte)
48 * @param srclen length of the source string in bytes
49 * @param dest pointer to destination string (multibyte or singlebyte)
50 * @param destlen maximal length allowed for string
51 * @param converted_size the number of bytes occupied in the destination
53 * @returns false and sets errno on fail, true on success.
55 * Ensure the srclen contains the terminating zero.
/*
 * Slow-path conversion: hands the buffer to smb_iconv() using the
 * descriptor that get_conv_handle() returns for the (from, to) pair.
 *
 * srclen may be (size_t)-1 (NUL-terminated input); in that case the length
 * is measured here and includes the terminator.
 *
 * *converted_size is written with destlen - o_len after the smb_iconv()
 * call, and the function returns true unless smb_iconv() gave (size_t)-1.
 *
 * NOTE(review): i_len, o_len and retval are used below, but their
 * declarations and initialisation are not visible in this listing, and
 * neither is the body of the unusable-descriptor branch. Confirm against
 * the complete file.
 */
59 static bool convert_string_internal(struct smb_iconv_handle *ic,
60 charset_t from, charset_t to,
61 void const *src, size_t srclen,
62 void *dest, size_t destlen, size_t *converted_size)
66 const char* inbuf = (const char*)src;
67 char* outbuf = (char*)dest;
68 smb_iconv_t descriptor;
70 descriptor = get_conv_handle(ic, from, to);
/* (size_t)-1 means the caller did not supply a length: measure it here. */
72 if (srclen == (size_t)-1) {
73 if (from == CH_UTF16LE || from == CH_UTF16BE) {
/* UTF-16 source: count 16-bit units, add the terminator, two bytes each. */
74 srclen = (strlen_w((const smb_ucs2_t *)src)+1) * 2;
/* Byte-oriented source (presumably the else branch): strlen plus the NUL. */
76 srclen = strlen((const char *)src)+1;
/* Both (smb_iconv_t)-1 and (smb_iconv_t)0 are treated as an unusable descriptor. */
81 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
89 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
/* Presumably the number of bytes produced; o_len setup is not visible here. */
90 *converted_size = destlen-o_len;
/* smb_iconv() signals failure with (size_t)-1. */
92 return (retval != (size_t)-1);
96 * Convert string from one encoding to another, making error checking etc
97 * Fast path version - handles ASCII first.
99 * @param src pointer to source string (multibyte or singlebyte)
100 * @param srclen length of the source string in bytes, or -1 for nul terminated.
101 * @param dest pointer to destination string (multibyte or singlebyte)
102 * @param destlen maximal length allowed for string - *NEVER* -1.
103 * @param converted_size the number of bytes occupied in the destination
105 * @returns false and sets errno on fail, true on success.
107 * Ensure the srclen contains the terminating zero.
109 * This function has been hand-tuned to provide a fast path.
110 * Don't change unless you really know what you are doing. JRA.
/*
 * Fast-path front end for convert_string_internal().
 *
 * destlen must never be (size_t)-1; this is asserted with SMB_ASSERT.
 * srclen may be (size_t)-1: it is deliberately not resolved with strlen
 * here, the loops carry it as a sentinel and use lastp to spot the
 * terminator.
 *
 * Three fast paths are selected by the charset pair:
 *   1. neither side is UTF-16LE/BE: bytes <= 0x7f are handled directly;
 *   2. UTF-16LE source, destination not UTF-16LE: handles units whose
 *      first byte is <= 0x7f and whose second byte is 0;
 *   3. source not UTF-16, destination UTF-16LE: handles bytes <= 0x7F
 *      while at least 2 bytes of destination room remain.
 * Input that a fast path cannot handle is passed to
 * convert_string_internal() from the current position (p, slen, q, dlen),
 * after which retval is added to *converted_size. Every other charset pair
 * goes straight to convert_string_internal() with the original arguments.
 *
 * Each fast path ends with a check for leftover input (slen non-zero, or
 * lastp non-zero when slen is the sentinel) so that running out of room is
 * noted even when no slow-path call was needed.
 *
 * NOTE(review): this listing is missing lines. The declaration of retval,
 * the loop bodies that copy characters and update p, q, slen, dlen and
 * retval, the slow_path label, the bodies of the out-of-room checks, the
 * contents of the BROKEN_UNICODE_COMPOSE_CHARACTERS sections and the
 * return statements are not visible. Confirm against the complete file
 * before changing anything here.
 */
113 bool convert_string_error_handle(struct smb_iconv_handle *ic,
114 charset_t from, charset_t to,
115 void const *src, size_t srclen,
116 void *dest, size_t destlen,
117 size_t *converted_size)
120 * NB. We deliberately don't do a strlen here if srclen == -1.
121 * This is very expensive over millions of calls and is taken
122 * care of in the slow path in convert_string_internal. JRA.
126 SMB_ASSERT(destlen != (size_t)-1);
134 if (from != CH_UTF16LE && from != CH_UTF16BE && to != CH_UTF16LE && to != CH_UTF16BE) {
135 const unsigned char *p = (const unsigned char *)src;
136 unsigned char *q = (unsigned char *)dest;
137 size_t slen = srclen;
138 size_t dlen = destlen;
139 unsigned char lastp = '\0';
142 /* If all characters are ascii, fast path here. */
143 while (slen && dlen) {
144 if ((lastp = *p) <= 0x7f) {
146 if (slen != (size_t)-1) {
154 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
157 bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
158 *converted_size += retval;
164 *converted_size = retval;
167 /* Even if we fast path we should note if we ran out of room. */
168 if (((slen != (size_t)-1) && slen) ||
169 ((slen == (size_t)-1) && lastp)) {
175 } else if (from == CH_UTF16LE && to != CH_UTF16LE) {
176 const unsigned char *p = (const unsigned char *)src;
177 unsigned char *q = (unsigned char *)dest;
179 size_t slen = srclen;
180 size_t dlen = destlen;
181 unsigned char lastp = '\0';
184 if (slen == (size_t)-1) {
186 ((lastp = *p) <= 0x7f) && (p[1] == 0)) {
194 if (lastp != 0) goto slow_path;
196 while (slen >= 2 && dlen &&
197 (*p <= 0x7f) && (p[1] == 0)) {
204 if (slen != 0) goto slow_path;
207 *converted_size = retval;
210 /* Even if we fast path we should note if we ran out of room. */
211 if (((slen != (size_t)-1) && slen) ||
212 ((slen == (size_t)-1) && lastp)) {
220 /* come here when we hit a character we can't deal
221 * with in the fast path
223 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
226 ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
227 *converted_size += retval;
231 } else if (from != CH_UTF16LE && from != CH_UTF16BE && to == CH_UTF16LE) {
232 const unsigned char *p = (const unsigned char *)src;
233 unsigned char *q = (unsigned char *)dest;
235 size_t slen = srclen;
236 size_t dlen = destlen;
237 unsigned char lastp = '\0';
239 /* If all characters are ascii, fast path here. */
240 while (slen && (dlen >= 1)) {
241 if (dlen >=2 && (lastp = *p) <= 0x7F) {
244 if (slen != (size_t)-1) {
252 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
255 bool ret = convert_string_internal(ic, from, to, p, slen, q, dlen, converted_size);
256 *converted_size += retval;
262 *converted_size = retval;
265 /* Even if we fast path we should note if we ran out of room. */
266 if (((slen != (size_t)-1) && slen) ||
267 ((slen == (size_t)-1) && lastp)) {
275 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
278 return convert_string_internal(ic, from, to, src, srclen, dest, destlen, converted_size);
/*
 * Logging wrapper around convert_string_error_handle(): performs the
 * conversion with the same arguments and reports failures through DEBUG().
 *
 * The visible messages cover:
 *   - an incomplete multibyte sequence (level 3);
 *   - no more room, tagged E2BIG (level 3), where the source text is only
 *     printed when from == CH_UNIX and the other variant prints just the
 *     charset names and lengths;
 *   - an illegal multibyte sequence (level 3);
 *   - a level 0 message carrying reason (presumably the default case).
 *
 * NOTE(review): the test of ret, the errno switch with its case labels and
 * the final return are not visible in this listing. Confirm against the
 * complete file.
 * NOTE(review): several messages print src with %s. That is only safe when
 * src is NUL-terminated byte text; confirm what happens for UTF-16 sources
 * or for callers that pass an explicit srclen without a terminator.
 */
281 bool convert_string_handle(struct smb_iconv_handle *ic,
282 charset_t from, charset_t to,
283 void const *src, size_t srclen,
284 void *dest, size_t destlen,
285 size_t *converted_size)
287 bool ret = convert_string_error_handle(ic, from, to, src, srclen, dest, destlen, converted_size);
290 const char *reason="unknown error";
293 reason="Incomplete multibyte sequence";
294 DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",
295 reason, (const char *)src));
299 reason="No more room";
300 if (from == CH_UNIX) {
301 DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u - '%s'\n",
302 charset_name(ic, from), charset_name(ic, to),
303 (unsigned int)srclen, (unsigned int)destlen, (const char *)src));
305 DEBUG(3,("E2BIG: convert_string(%s,%s): srclen=%u destlen=%u\n",
306 charset_name(ic, from), charset_name(ic, to),
307 (unsigned int)srclen, (unsigned int)destlen));
312 reason="Illegal multibyte sequence";
313 DEBUG(3,("convert_string_internal: Conversion error: %s(%s)\n",
314 reason, (const char *)src));
317 DEBUG(0,("convert_string_internal: Conversion error: %s(%s)\n",
318 reason, (const char *)src));
321 /* smb_panic(reason); */
328 * Convert between character sets, allocating a new buffer using talloc for the result.
330 * @param srclen length of source buffer.
331 * @param dest always set at least to NULL
332 * @param converted_size set to the number of bytes occupied by the string in
333 * the destination on success.
334 * @note -1 is not accepted for srclen.
336 * @return true if new buffer was correctly allocated, and string was converted.
339 * Ensure the srclen contains the terminating zero.
341 * I hate the goto's in this function. It's embarrassing.....
342 * There has to be a cleaner way to do this. JRA.
/*
 * Converts src (srclen bytes) from charset from to charset to into a
 * buffer allocated with talloc on ctx. dst is used as a void ** (see the
 * dest declaration) and presumably receives the new buffer.
 *
 * Visible behaviour:
 *   - src == NULL or srclen == (size_t)-1 is tested up front;
 *   - one early branch (its condition is not visible) allocates a zeroed
 *     buffer with talloc_zero_array() and reports destlen through
 *     *converted_size when that pointer is non-NULL;
 *   - the descriptor comes from get_conv_handle(); (smb_iconv_t)-1 and
 *     (smb_iconv_t)0 are both reported as an unsupported conversion;
 *   - the output estimate starts at (srclen * 3) / 2 and is doubled, with
 *     an explicit wrap check on (destlen * 2) + 2, before talloc_realloc()
 *     of destlen + 2 bytes; the two extra bytes are for a UCS-2 terminator;
 *   - after smb_iconv() succeeds destlen becomes destlen - o_len, the
 *     buffer may be shrunk with talloc_realloc(), ob[destlen+1] is zeroed
 *     and *converted_size (if non-NULL) receives destlen.
 *
 * NOTE(review): this listing is missing lines. The declaration of retval,
 * the retry label and gotos, the handling of the out-of-room case, the
 * assignments through dest, the condition guarding the shrink and every
 * return statement are not visible. Confirm against the complete file.
 * NOTE(review): the initial estimate multiplies srclen by 3 without an
 * overflow check; verify that callers bound srclen.
 * NOTE(review): talloc_realloc() results are assigned straight back to ob;
 * check in the complete file how the previous buffer is released when the
 * call fails.
 */
344 bool convert_string_talloc_handle(TALLOC_CTX *ctx, struct smb_iconv_handle *ic,
345 charset_t from, charset_t to,
346 void const *src, size_t srclen, void *dst,
347 size_t *converted_size)
350 size_t i_len, o_len, destlen = (srclen * 3) / 2;
352 const char *inbuf = (const char *)src;
353 char *outbuf = NULL, *ob = NULL;
354 smb_iconv_t descriptor;
355 void **dest = (void **)dst;
359 if (src == NULL || srclen == (size_t)-1) {
365 /* We really should treat this as an error, but
366 there are too many callers that need this to
367 return a NULL terminated string in the correct
369 if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) {
374 ob = talloc_zero_array(ctx, char, destlen);
379 if (converted_size != NULL) {
380 *converted_size = destlen;
386 descriptor = get_conv_handle(ic, from, to);
388 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
389 DEBUG(0,("convert_string_talloc: Conversion not supported.\n"));
396 /* +2 is for ucs2 null termination. */
397 if ((destlen*2)+2 < destlen) {
398 /* wrapped ! abort. */
399 DEBUG(0, ("convert_string_talloc: destlen wrapped !\n"));
404 destlen = destlen * 2;
407 /* +2 is for ucs2 null termination. */
408 ob = talloc_realloc(ctx, ob, char, destlen + 2);
411 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
419 retval = smb_iconv(descriptor,
422 if(retval == (size_t)-1) {
423 const char *reason="unknown error";
426 reason="Incomplete multibyte sequence";
427 DEBUG(3,("convert_string_talloc: Conversion error: %s(%s)\n",reason,inbuf));
432 reason="Illegal multibyte sequence";
433 DEBUG(3,("convert_string_talloc: Conversion error: %s(%s)\n",reason,inbuf));
436 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
437 /* smb_panic(reason); */
442 destlen = destlen - o_len;
443 /* Don't shrink unless we're reclaiming a lot of
444 * space. This is in the hot codepath and these
445 * reallocs *cost*. JRA.
448 /* We're shrinking here so we know the +2 is safe from wrap. */
449 ob = talloc_realloc(ctx,ob, char, destlen + 2);
452 if (destlen && !ob) {
453 DEBUG(0, ("convert_string_talloc: out of memory!\n"));
460 /* Must ucs2 null terminate in the extra space we allocated. */
462 ob[destlen+1] = '\0';
464 /* Ensure we can never return a *converted_size of zero. */
466 /* As we're now returning false on a bad smb_iconv call,
467 this should never happen. But be safe anyway. */
468 if (to == CH_UTF16LE|| to == CH_UTF16BE || to == CH_UTF16MUNGED) {
475 if (converted_size != NULL) {
476 *converted_size = destlen;
482 * Convert string from one encoding to another, making error checking etc
484 * @param src pointer to source string (multibyte or singlebyte)
485 * @param srclen length of the source string in bytes
486 * @param dest pointer to destination string (multibyte or singlebyte)
487 * @param destlen maximal length allowed for string
488 * @param converted_size the number of bytes occupied in the destination
490 * @returns true on success, false on fail.
/*
 * Public entry point: forwards to convert_string_handle() using the
 * conversion handle returned by get_iconv_handle(), so failures are
 * reported by that function.
 *
 * NOTE(review): the argument line between the two visible lines of the
 * call (presumably src and srclen) is missing from this listing.
 */
492 _PUBLIC_ bool convert_string(charset_t from, charset_t to,
493 void const *src, size_t srclen,
494 void *dest, size_t destlen,
495 size_t *converted_size)
497 return convert_string_handle(get_iconv_handle(), from, to,
499 dest, destlen, converted_size);
503 * Convert string from one encoding to another, making error checking etc
505 * @param src pointer to source string (multibyte or singlebyte)
506 * @param srclen length of the source string in bytes
507 * @param dest pointer to destination string (multibyte or singlebyte)
508 * @param destlen maximal length allowed for string
509 * @param converted_size the number of bytes occupied in the destination
511 * @returns true on success, false on fail.
/*
 * Public entry point: forwards to convert_string_error_handle() using the
 * conversion handle returned by get_iconv_handle(). Unlike
 * convert_string(), the call does not go through convert_string_handle().
 *
 * NOTE(review): the argument line between the two visible lines of the
 * call (presumably src and srclen) is missing from this listing.
 */
513 _PUBLIC_ bool convert_string_error(charset_t from, charset_t to,
514 void const *src, size_t srclen,
515 void *dest, size_t destlen,
516 size_t *converted_size)
518 return convert_string_error_handle(get_iconv_handle(), from, to,
520 dest, destlen, converted_size);
524 * Convert between character sets, allocating a new buffer using talloc for the result.
526 * @param srclen length of source buffer.
527 * @param dest always set at least to NULL
528 * @param converted_size Size in bytes of the converted string
529 * @note -1 is not accepted for srclen.
531 * @returns boolean indication whether the conversion succeeded
534 _PUBLIC_ bool convert_string_talloc(TALLOC_CTX *ctx,
535 charset_t from, charset_t to,
536 void const *src, size_t srclen,
537 void *dest, size_t *converted_size)
539 return convert_string_talloc_handle(ctx, get_iconv_handle(),
540 from, to, src, srclen, dest,