2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
23 #include "system/iconv.h"
24 #include "param/param.h"
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
/*
 * Names of the character sets that charset_name() reports for CH_UNIX,
 * CH_DOS and CH_DISPLAY. All three start out as NULL; the code that
 * assigns the configured names is not part of this listing.
 */
41 char *unix_charset = NULL;
42 char *dos_charset = NULL;
43 char *display_charset = NULL;
46 * Return the name of a charset to give to iconv().
/*
 * Map a charset_t selector to the encoding name string that is handed to
 * smb_iconv_open(). The UTF-16 and UTF-8 selectors map to fixed names;
 * CH_UNIX, CH_DOS and CH_DISPLAY return the current value of the matching
 * global, which is NULL until something assigns it.
 * NOTE(review): the switch header and any default/fallback return are not
 * visible in this listing - confirm what is returned for other values.
 */
48 static const char *charset_name(charset_t ch)
/* CH_UTF16 is the little-endian flavour; the big-endian one is CH_UTF16BE */
51 case CH_UTF16: return "UTF-16LE";
52 case CH_UNIX: return unix_charset;
53 case CH_DOS: return dos_charset;
54 case CH_DISPLAY: return display_charset;
55 case CH_UTF8: return "UTF8";
56 case CH_UTF16BE: return "UTF-16BE";
/*
 * Cache of conversion descriptors, indexed as [from][to]. A NULL slot has
 * not been opened yet; a slot holding (smb_iconv_t)-1 records an open that
 * failed. Slots are filled by get_conv_handle() and cleared by init_iconv().
 */
62 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
65 re-initialize iconv conversion descriptors
/*
 * Drop every cached conversion descriptor so that the next
 * get_conv_handle() call for a pair has to open it again.
 * Walks all NUM_CHARSETS x NUM_CHARSETS slots of conv_handles, closes the
 * descriptors that were really opened and resets the slot to NULL.
 * NOTE(review): the declaration of c1/c2 and the closing braces are
 * missing from this listing.
 */
67 _PUBLIC_ void init_iconv(void)
70 for (c1=0;c1<NUM_CHARSETS;c1++) {
71 for (c2=0;c2<NUM_CHARSETS;c2++) {
72 if (conv_handles[c1][c2] != NULL) {
/* (smb_iconv_t)-1 marks a failed open, so there is nothing to close */
73 if (conv_handles[c1][c2] != (smb_iconv_t)-1) {
74 smb_iconv_close(conv_handles[c1][c2]);
76 conv_handles[c1][c2] = NULL;
84 on-demand initialisation of conversion handles
/*
 * Return the conversion descriptor for the pair (from, to), opening and
 * caching it in conv_handles[from][to] on first use.
 *
 * - The first call performs one-time setup guarded by the static flag
 *   initialised; the visible part of that setup is setlocale(LC_ALL, "C").
 * - A non-NULL cached slot is returned as is, including a cached
 *   (smb_iconv_t)-1 from an earlier failed open.
 * - smb_iconv_open() is called with the target name first and the source
 *   name second.
 * - If the open fails, one side is CH_DOS, and the dos charset name is not
 *   "ASCII" (compared case-insensitively), the failure is logged, the
 *   "dos charset" parameter is forced to ASCII through lp_set_cmdline(),
 *   the names are looked up again and the open is retried once.
 *
 * The result may still be (smb_iconv_t)-1; callers must check for it.
 * NOTE(review): the declaration of n1/n2, the statement that sets
 * initialised, and several braces are missing from this listing.
 */
86 static smb_iconv_t get_conv_handle(charset_t from, charset_t to)
89 static int initialised;
90 /* auto-free iconv memory on exit so valgrind reports are easier
92 if (initialised == 0) {
96 /* we set back the locale to C to get ASCII-compatible
97 toupper/lower functions. For now we do not need
98 any other POSIX localisations anyway. When we
99 should really need localized string functions one
100 day we need to write our own ascii_tolower etc.
102 setlocale(LC_ALL, "C");
108 if (conv_handles[from][to]) {
109 return conv_handles[from][to];
112 n1 = charset_name(from);
113 n2 = charset_name(to);
115 conv_handles[from][to] = smb_iconv_open(n2,n1);
117 if (conv_handles[from][to] == (smb_iconv_t)-1) {
118 if ((from == CH_DOS || to == CH_DOS) &&
119 strcasecmp(charset_name(CH_DOS), "ASCII") != 0) {
120 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
121 charset_name(CH_DOS)));
122 lp_set_cmdline("dos charset", "ASCII");
124 n1 = charset_name(from);
125 n2 = charset_name(to);
127 conv_handles[from][to] = smb_iconv_open(n2,n1);
131 return conv_handles[from][to];
136 * Convert string from one encoding to another, making error checking etc
138 * @param src pointer to source string (multibyte or singlebyte)
139 * @param srclen length of the source string in bytes
140 * @param dest pointer to destination string (multibyte or singlebyte)
141 * @param destlen maximal length allowed for string
142 * @returns the number of bytes occupied in the destination
/*
 * Convert srclen bytes at src from charset "from" to charset "to",
 * writing at most destlen bytes into the caller-supplied buffer dest.
 *
 * - srclen == (size_t)-1 means src is NUL-terminated; the length is then
 *   taken as strlen + 1, so the terminator is converted too.
 * - If no usable descriptor exists, the first MIN(srclen, destlen) bytes
 *   are copied unchanged.
 * - Otherwise smb_iconv() does the conversion. On failure a reason string
 *   is selected and, for the out-of-space case, a level 0 message is
 *   logged; the panic call is commented out.
 * - The final visible return is destlen - o_len, i.e. the number of bytes
 *   written to dest.
 *
 * NOTE(review): the declarations and initial values of i_len, o_len,
 * retval and reason, the switch/case labels that pick the reason, and the
 * return of the pass-through branch are missing from this listing.
 */
144 _PUBLIC_ ssize_t convert_string(charset_t from, charset_t to,
145 void const *src, size_t srclen,
146 void *dest, size_t destlen)
150 const char* inbuf = (const char*)src;
151 char* outbuf = (char*)dest;
152 smb_iconv_t descriptor;
/* (size_t)-1 is the sentinel for a NUL-terminated source */
154 if (srclen == (size_t)-1)
155 srclen = strlen(inbuf)+1;
157 descriptor = get_conv_handle(from, to);
/* both -1 (open failed) and 0 (no handle) count as unsupported */
159 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
160 /* conversion not supported, use as is */
161 size_t len = MIN(srclen,destlen);
162 memcpy(dest,src,len);
168 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
169 if(retval==(size_t)-1) {
173 reason="Incomplete multibyte sequence";
/* destination too small; the log text labels this case as E2BIG */
176 reason="No more room";
/* only a CH_UNIX source gets the variant with an extra string argument */
177 if (from == CH_UNIX) {
178 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
179 charset_name(from), charset_name(to),
180 (int)srclen, (int)destlen,
183 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
184 charset_name(from), charset_name(to),
185 (int)srclen, (int)destlen));
189 reason="Illegal multibyte sequence";
192 /* smb_panic(reason); */
/* o_len is what smb_iconv left unused, so this is the byte count written */
194 return destlen-o_len;
198 * Convert between character sets, allocating a new buffer using talloc for the result.
200 * @param srclen length of source buffer.
201 * @param dest always set at least to NULL
202 * @note -1 is not accepted for srclen.
204 * @returns Size in bytes of the converted string; or -1 in case of error.
/*
 * Convert srclen bytes at src from charset "from" to charset "to" into a
 * buffer obtained with talloc_realloc() on ctx.
 *
 * - A NULL src, or an srclen of (size_t)-1 or 0, is rejected up front.
 * - If no usable descriptor exists, a level 3 message is logged.
 * - The output size is computed as 2 + destlen * 3; the two extra bytes
 *   are withheld from smb_iconv() so the result can be terminated.
 * - After the conversion, destlen becomes (destlen - 2) - o_len, the
 *   number of bytes produced, and SSVAL(ob, destlen, 0) stores a 16-bit
 *   zero at that offset so the result is terminated for one-byte and
 *   two-byte charsets alike.
 *
 * NOTE(review): the initial value of destlen, the retry structure around
 * the resize, the error-path returns and the final assignment to *dest
 * and return are missing from this listing.
 * NOTE(review): the conversion error message prints inbuf with %s, but
 * inbuf is length-delimited source data in the "from" charset - confirm
 * it is always NUL-terminated and safe to log.
 */
207 _PUBLIC_ ssize_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
208 void const *src, size_t srclen, void **dest)
210 size_t i_len, o_len, destlen;
212 const char *inbuf = (const char *)src;
214 smb_iconv_t descriptor;
218 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
221 descriptor = get_conv_handle(from, to);
223 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
224 /* conversion not supported, return -1*/
225 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
226 charset_name(from), charset_name(to)));
230 /* it is _very_ rare that a conversion increases the size by
235 destlen = 2 + (destlen*3);
236 ob = talloc_realloc(ctx, outbuf, char, destlen);
238 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
245 /* we give iconv 2 less bytes to allow us to terminate at the
249 retval = smb_iconv(descriptor,
252 if(retval == (size_t)-1) {
253 const char *reason="unknown error";
256 reason="Incomplete multibyte sequence";
261 reason="Illegal multibyte sequence";
264 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
269 destlen = (destlen-2) - o_len;
271 /* guarantee null termination in all charsets */
272 SSVAL(ob, destlen, 0);
280 * Copy a string from a char* unix src to a dos codepage string destination.
282 * @return the number of bytes occupied by the string in the destination.
284 * @param flags can include
286 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
287 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
290 * @param dest_len the maximum length in bytes allowed in the
291 * destination. If @p dest_len is -1 then no maximum is used.
/*
 * Convert the NUL-terminated CH_UNIX string src to CH_DOS in dest,
 * writing at most dest_len bytes, and return what convert_string() returns.
 *
 * With STR_UPPER the string is first upper-cased into a temporary copy and
 * the function calls itself once with STR_UPPER cleared. Without it the
 * source length is strlen(src), adjusted when STR_TERMINATE or
 * STR_TERMINATE_ASCII is set.
 * NOTE(review): the adjustment statement (presumably it adds the
 * terminator to src_len), the failure return for a NULL tmpbuf, and the
 * release of tmpbuf are missing from this listing - confirm.
 */
293 static ssize_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
298 if (flags & STR_UPPER) {
/* upper-case a copy allocated on the NULL talloc context */
299 char *tmpbuf = strupper_talloc(NULL, src);
300 if (tmpbuf == NULL) {
/* recurse exactly once: STR_UPPER is masked out of the flags */
303 ret = push_ascii(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
308 src_len = strlen(src);
310 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
313 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
317 * Copy a string from a unix char* src to an ASCII destination,
318 * allocating a buffer using talloc().
320 * @param dest always set at least to NULL
322 * @returns The number of bytes occupied by the string in the destination
323 * or -1 in case of error.
/*
 * Convert the NUL-terminated CH_UNIX string src to CH_DOS in a buffer
 * allocated on ctx by convert_string_talloc(), and return its result.
 * The source length passed on is strlen(src) + 1, so the terminator is
 * part of the converted data.
 * NOTE(review): one statement between the two visible ones is missing
 * from this listing (presumably the initialisation of *dest) - confirm.
 */
325 _PUBLIC_ ssize_t push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
327 size_t src_len = strlen(src)+1;
329 return convert_string_talloc(ctx, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
334 * Copy a string from a dos codepage source to a unix char* destination.
336 * The resulting string in "dest" is always null terminated.
338 * @param flags can have:
340 * <dt>STR_TERMINATE</dt>
341 * <dd>STR_TERMINATE means the string in @p src
342 * is null terminated, and src_len is ignored.</dd>
345 * @param src_len is the length of the source area in bytes.
346 * @returns the number of bytes occupied by the string in @p src.
/*
 * Convert a CH_DOS string at src to CH_UNIX in dest (at most dest_len
 * bytes) and NUL-terminate dest.
 *
 * With STR_TERMINATE or STR_TERMINATE_ASCII the source is treated as
 * NUL-terminated: if src_len is (size_t)-1 the length becomes
 * strlen + 1, otherwise strnlen() bounds the scan to src_len bytes.
 * The terminator is written at index MIN(ret, dest_len - 1).
 * NOTE(review): the statements that use the strnlen() result, any guard
 * for dest_len == 0 around the terminator store, and the return statement
 * are missing from this listing.
 */
348 static ssize_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
352 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
353 if (src_len == (size_t)-1) {
/* unbounded source: measure it and include the terminator */
354 src_len = strlen((const char *)src) + 1;
/* bounded source: never read past src_len bytes while looking for NUL */
356 size_t len = strnlen((const char *)src, src_len);
363 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
/* clamp the index so the terminator stays inside dest */
366 dest[MIN(ret, dest_len-1)] = 0;
372 * Copy a string from a char* src to a unicode destination.
374 * @returns the number of bytes occupied by the string in the destination.
376 * @param flags can have:
379 * <dt>STR_TERMINATE <dd>means include the null termination.
380 * <dt>STR_UPPER <dd>means uppercase in the destination.
381 * <dt>STR_NOALIGN <dd>means don't do alignment.
384 * @param dest_len is the maximum length allowed in the
385 * destination. If dest_len is -1 then no maximum is used.
/*
 * Convert the NUL-terminated CH_UNIX string src to CH_UTF16 in dest,
 * writing at most dest_len bytes.
 *
 * - With STR_UPPER the string is first upper-cased into a temporary copy
 *   and the function calls itself once with STR_UPPER cleared.
 * - STR_TERMINATE adjusts the source length computed by strlen(src).
 * - When ucs2_align() reports that dest needs alignment, dest is advanced
 *   by one byte and dest_len is reduced by one (if it is non-zero).
 * - The conversion itself is delegated to convert_string().
 *
 * NOTE(review): the STR_TERMINATE adjustment, the zero pad byte written
 * for alignment (if any), the rounding of dest_len to an even value, the
 * error return and the final return are missing from this listing.
 * NOTE(review): ret is compared against (size_t)-1; if ret is declared as
 * ssize_t this relies on the implicit conversion - confirm the type.
 */
387 static ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags)
390 size_t src_len = strlen(src);
393 if (flags & STR_UPPER) {
/* upper-case a copy allocated on the NULL talloc context */
394 char *tmpbuf = strupper_talloc(NULL, src);
395 if (tmpbuf == NULL) {
/* recurse exactly once: STR_UPPER is masked out of the flags */
398 ret = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
403 if (flags & STR_TERMINATE)
406 if (ucs2_align(NULL, dest, flags)) {
/* skip one byte so the UTF-16 data starts on the aligned address */
408 dest = (void *)((char *)dest + 1);
409 if (dest_len) dest_len--;
413 /* ucs2 is always a multiple of 2 bytes */
416 ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
417 if (ret == (size_t)-1) {
428 * Copy a string from a unix char* src to a UCS2 destination,
429 * allocating a buffer using talloc().
431 * @param dest always set at least to NULL
433 * @returns The number of bytes occupied by the string in the destination
434 * or -1 in case of error.
/*
 * Convert the NUL-terminated CH_UNIX string src to CH_UTF16 in a buffer
 * allocated on ctx by convert_string_talloc(), and return its result.
 * The source length passed on is strlen(src) + 1, so the terminator is
 * part of the converted data.
 * NOTE(review): one statement between the two visible ones is missing
 * from this listing (presumably the initialisation of *dest) - confirm.
 */
436 _PUBLIC_ ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, void **dest, const char *src)
438 size_t src_len = strlen(src)+1;
440 return convert_string_talloc(ctx, CH_UNIX, CH_UTF16, src, src_len, dest);
445 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
447 * @param dest always set at least to NULL
449 * @returns The number of bytes occupied by the string in the destination
/*
 * Convert the NUL-terminated CH_UNIX string src to CH_UTF8 in a buffer
 * allocated on ctx by convert_string_talloc(), and return its result.
 * The source length passed on is strlen(src) + 1, so the terminator is
 * part of the converted data.
 * NOTE(review): one statement between the two visible ones is missing
 * from this listing (presumably the initialisation of *dest) - confirm.
 */
452 _PUBLIC_ ssize_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
454 size_t src_len = strlen(src)+1;
456 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
460 Copy a string from a ucs2 source to a unix char* destination.
462 STR_TERMINATE means the string in src is null terminated.
463 STR_NOALIGN means don't try to align.
464 if STR_TERMINATE is set then src_len is ignored if it is -1.
465 src_len is the length of the source area in bytes
466 Return the number of bytes occupied by the string in src.
467 The resulting string in "dest" is always null terminated.
/*
 * Convert a CH_UTF16 string at src to CH_UNIX in dest (at most dest_len
 * bytes) and NUL-terminate dest.
 *
 * - When ucs2_align() reports that src needs alignment, src is advanced
 *   by one byte before anything is read.
 * - With STR_TERMINATE the source length is measured: utf16_len() when
 *   src_len is (size_t)-1, otherwise utf16_len_n() bounded by src_len.
 * - The terminator is written at index MIN(ret, dest_len - 1).
 *
 * NOTE(review): the matching src_len adjustment for the alignment skip,
 * the statement that rounds src_len to an even value, any guard for
 * dest_len == 0 and the return statement are missing from this listing.
 * NOTE(review): this function is declared to return size_t while
 * pull_ascii() and pull_string() return ssize_t - confirm that a
 * convert_string() failure is handled as intended.
 */
470 static size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
474 if (ucs2_align(NULL, src, flags)) {
/* skip one byte so the UTF-16 data is read from the aligned address */
475 src = (const void *)((const char *)src + 1);
480 if (flags & STR_TERMINATE) {
481 if (src_len == (size_t)-1) {
482 src_len = utf16_len(src);
/* bounded source: never scan past src_len bytes for the terminator */
484 src_len = utf16_len_n(src, src_len);
488 /* ucs2 is always a multiple of 2 bytes */
489 if (src_len != (size_t)-1)
492 ret = convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
/* clamp the index so the terminator stays inside dest */
494 dest[MIN(ret, dest_len-1)] = 0;
500 * Copy a string from an ASCII src to a unix char * destination, allocating a buffer using talloc
502 * @param dest always set at least to NULL
504 * @returns The number of bytes occupied by the string in the destination
/*
 * Convert the NUL-terminated CH_DOS string src to CH_UNIX in a buffer
 * allocated on ctx by convert_string_talloc(), and return its result.
 * The source length passed on is strlen(src) + 1, so the terminator is
 * part of the converted data.
 * NOTE(review): one statement between the two visible ones is missing
 * from this listing (presumably the initialisation of *dest) - confirm.
 */
507 _PUBLIC_ ssize_t pull_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
509 size_t src_len = strlen(src)+1;
511 return convert_string_talloc(ctx, CH_DOS, CH_UNIX, src, src_len, (void **)dest);
515 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
517 * @param dest always set at least to NULL
519 * @returns The number of bytes occupied by the string in the destination
/*
 * Convert the CH_UTF16 string src to CH_UNIX in a buffer allocated on ctx
 * by convert_string_talloc(), and return its result. The source length is
 * whatever utf16_len(src) reports.
 * NOTE(review): unlike the strlen-based siblings there is no +1 here;
 * whether utf16_len() already counts the terminator is not visible in
 * this listing - confirm. convert_string_talloc() rejects a length of 0.
 * NOTE(review): one statement between the two visible ones is missing
 * from this listing (presumably the initialisation of *dest) - confirm.
 */
522 _PUBLIC_ ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const void *src)
524 size_t src_len = utf16_len(src);
526 return convert_string_talloc(ctx, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
530 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
532 * @param dest always set at least to NULL
534 * @returns The number of bytes occupied by the string in the destination
/*
 * Convert the NUL-terminated CH_UTF8 string src to CH_UNIX in a buffer
 * allocated on ctx by convert_string_talloc(), and return its result.
 * The source length passed on is strlen(src) + 1, so the terminator is
 * part of the converted data.
 * NOTE(review): one statement between the two visible ones is missing
 * from this listing (presumably the initialisation of *dest) - confirm.
 */
537 _PUBLIC_ ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
539 size_t src_len = strlen(src)+1;
541 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
545 Copy a string from a char* src to a unicode or ascii
546 dos codepage destination choosing unicode or ascii based on the
547 flags in the SMB buffer starting at base_ptr.
548 Return the number of bytes occupied by the string in the destination.
550 STR_TERMINATE means include the null termination.
551 STR_UPPER means uppercase in the destination.
552 STR_ASCII use ascii even with unicode packet.
553 STR_NOALIGN means don't do alignment.
554 dest_len is the maximum length allowed in the destination. If dest_len
555 is -1 then no maximum is used.
/*
 * Dispatch a push to the encoder selected by flags: push_ascii() when
 * STR_ASCII is set, otherwise push_ucs2() when STR_UNICODE is set.
 * STR_ASCII is tested first, so it wins if both flags are present.
 * With neither flag set the function calls smb_panic().
 * NOTE(review): the closing braces and whatever follows the panic call
 * are missing from this listing.
 */
558 _PUBLIC_ ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags)
560 if (flags & STR_ASCII) {
561 return push_ascii(dest, src, dest_len, flags);
562 } else if (flags & STR_UNICODE) {
563 return push_ucs2(dest, src, dest_len, flags);
/* caller error: one of the two encoding flags is mandatory */
565 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
572 Copy a string from a unicode or ascii source (depending on
573 the packet flags) to a char* destination.
575 STR_TERMINATE means the string in src is null terminated.
576 STR_UNICODE means to force as unicode.
577 STR_ASCII use ascii even with unicode packet.
578 STR_NOALIGN means don't do alignment.
579 if STR_TERMINATE is set then src_len is ignored if it is -1
580 src_len is the length of the source area in bytes.
581 Return the number of bytes occupied by the string in src.
582 The resulting string in "dest" is always null terminated.
/*
 * Dispatch a pull to the decoder selected by flags: pull_ascii() when
 * STR_ASCII is set, otherwise pull_ucs2() when STR_UNICODE is set.
 * STR_ASCII is tested first, so it wins if both flags are present.
 * With neither flag set the function calls smb_panic().
 * NOTE(review): the closing braces and whatever follows the panic call
 * are missing from this listing.
 */
585 _PUBLIC_ ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
587 if (flags & STR_ASCII) {
588 return pull_ascii(dest, src, dest_len, src_len, flags);
589 } else if (flags & STR_UNICODE) {
590 return pull_ucs2(dest, src, dest_len, src_len, flags);
/* caller error: one of the two encoding flags is mandatory */
592 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
599 return the unicode codepoint for the next multi-byte CH_UNIX character
602 also return the number of bytes consumed (which tells the caller
603 how many bytes to skip to get to the next CH_UNIX character)
605 return INVALID_CODEPOINT if the next character cannot be converted
/*
 * Decode the next character of the CH_UNIX string str and return its
 * Unicode codepoint, reporting through *size how many source bytes it
 * occupied.
 *
 * - A first byte with the high bit clear is returned directly as its own
 *   codepoint, without touching iconv.
 * - Otherwise up to 5 source bytes are offered to the CH_UNIX to CH_UTF16
 *   descriptor. smb_iconv() is called twice on the same output buffer so
 *   that characters needing 4 bytes of UTF-16 can also be decoded.
 * - *size is set to the number of input bytes consumed
 *   (ilen_orig - ilen).
 * - A 2-byte result is read with SVAL(); a 4-byte result is decoded as a
 *   surrogate pair; anything else yields INVALID_CODEPOINT.
 *
 * NOTE(review): the declarations of buf, ilen, ilen_orig, olen and
 * outbuf, the olen values used for the two attempts, the conditions
 * guarding the second attempt and the result-length branches, and the
 * *size assignments on the early-return paths are missing from this
 * listing.
 */
607 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
609 /* it cannot occupy more than 4 bytes in UTF16 format */
611 smb_iconv_t descriptor;
/* fast path: 7-bit bytes map to themselves */
617 if ((str[0] & 0x80) == 0) {
619 return (codepoint_t)str[0];
622 /* we assume that no multi-byte character can take
623 more than 5 bytes. This is OK as we only
624 support codepoints up to 1M */
/* strnlen keeps the window from running past the end of the string */
625 ilen_orig = strnlen(str, 5);
628 descriptor = get_conv_handle(CH_UNIX, CH_UTF16);
629 if (descriptor == (smb_iconv_t)-1) {
631 return INVALID_CODEPOINT;
634 /* this looks a little strange, but it is needed to cope
635 with codepoints above 64k */
637 outbuf = (char *)buf;
638 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
/* second attempt restarts at the beginning of the output buffer */
641 outbuf = (char *)buf;
642 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
644 /* we didn't convert any bytes */
646 return INVALID_CODEPOINT;
/* bytes consumed = bytes offered minus bytes iconv left unread */
653 *size = ilen_orig - ilen;
656 return (codepoint_t)SVAL(buf, 0);
659 /* decode a 4 byte UTF16 character manually */
/* low 10 bits come from buf[2..3], high 10 bits from buf[0..1] */
660 return (codepoint_t)0x10000 +
661 (buf[2] | ((buf[3] & 0x3)<<8) |
662 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
665 /* no other length is valid */
666 return INVALID_CODEPOINT;
670 push a single codepoint into a CH_UNIX string the target string must
671 be able to hold the full character, which is guaranteed if it is at
672 least 5 bytes in size. The caller may pass less than 5 bytes if they
673 are sure the character will fit (for example, you can assume that
674 uppercase/lowercase of a character will not add more than 1 byte)
676 return the number of bytes occupied by the CH_UNIX character, or
/*
 * Encode the single codepoint c as a CH_UNIX character at str.
 *
 * Visible steps: the CH_UTF16 to CH_UNIX descriptor is fetched and a
 * descriptor of (smb_iconv_t)-1 is treated as a failure; the codepoint is
 * placed in a small UTF-16 buffer and converted into str with
 * smb_iconv(). There are two conversion calls; the second follows code
 * that builds a surrogate pair in buf (lead unit tagged with 0xd8, trail
 * unit tagged with 0xdc).
 *
 * NOTE(review): the declarations of buf, inbuf, ilen and olen, the ASCII
 * fast path (if any), the test that selects the surrogate path, any
 * adjustment of c before the shifts, the assignment of buf[2], all return
 * statements and the end of the function are missing from this listing.
 */
679 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
681 smb_iconv_t descriptor;
691 descriptor = get_conv_handle(CH_UTF16, CH_UNIX);
692 if (descriptor == (smb_iconv_t)-1) {
701 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
710 buf[0] = (c>>10) & 0xFF;
711 buf[1] = (c>>18) | 0xd8;
713 buf[3] = ((c>>8) & 0x3) | 0xdc;
719 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);