s4:dsdb: Fix stack use after scope in gkdi_create_root_key()
[samba.git] / lib / util / charset / iconv.c
1 /*
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002
6
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16
17    You should have received a copy of the GNU General Public License
18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "replace.h"
22 #include "system/iconv.h"
23 #include "system/filesys.h"
24 #include "lib/util/byteorder.h"
25 #include "lib/util/dlinklist.h"
26 #include "lib/util/charset/charset.h"
27 #include "lib/util/charset/charset_proto.h"
28
29 #ifdef HAVE_ICU_I18N
30 #include <unicode/ustring.h>
31 #include <unicode/utrans.h>
32 #endif
33
34 #ifdef strcasecmp
35 #undef strcasecmp
36 #endif
37
/**
 * @file
 *
 * @brief Samba wrapper/stub for iconv character set conversion.
 *
 * iconv is the XPG2 interface for converting between character
 * encodings.  This file provides a Samba wrapper around it, and also
 * a simple reimplementation that is used if the system does not
 * implement iconv.
 *
 * Samba only works with encodings that are supersets of ASCII: ASCII
 * characters like whitespace can be tested for directly, multibyte
 * sequences start with a byte with the high bit set, and strings are
 * terminated by a nul byte.
 *
 * Note that the only function provided by iconv is conversion between
 * characters.  It doesn't directly support operations like
 * uppercasing or comparison.  We have to convert to UTF-16LE and
 * compare there.
 *
 * @sa Samba Developers Guide
 **/
60
61 static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
62 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
63 static size_t latin1_pull(void *,const char **, size_t *, char **, size_t *);
64 static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
65 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
66 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
67 static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
68 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
69 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
70 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
71 static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
72
73 static const struct charset_functions builtin_functions[] = {
74         /* windows is closest to UTF-16 */
75         {
76                 .name = "UCS-2LE",
77                 .pull = iconv_copy,
78                 .push = iconv_copy
79         },
80         {
81                 .name = "UTF-16LE",
82                 .pull = iconv_copy,
83                 .push = iconv_copy
84         },
85         {
86                 .name = "UCS-2BE",
87                 .pull = iconv_swab,
88                 .push = iconv_swab
89         },
90         {
91                 .name = "UTF-16BE",
92                 .pull = iconv_swab,
93                 .push = iconv_swab
94         },
95
96         /* we include the UTF-8 alias to cope with differing locale settings */
97         {
98                 .name = "UTF8",
99                 .pull = utf8_pull,
100                 .push = utf8_push
101         },
102         {
103                 .name = "UTF-8",
104                 .pull = utf8_pull,
105                 .push = utf8_push
106         },
107
108         /* this handles the munging needed for String2Key */
109         {
110                 .name = "UTF16_MUNGED",
111                 .pull = utf16_munged_pull,
112                 .push = iconv_copy,
113                 .samba_internal_charset = true
114         },
115
116         {
117                 .name = "ASCII",
118                 .pull = ascii_pull,
119                 .push = ascii_push
120         },
121         {
122                 .name = "646",
123                 .pull = ascii_pull,
124                 .push = ascii_push
125         },
126         {
127                 .name = "ISO-8859-1",
128                 .pull = latin1_pull,
129                 .push = latin1_push
130         },
131 #ifdef DEVELOPER
132         {
133                 .name = "WEIRD",
134                 .pull = weird_pull,
135                 .push = weird_push,
136                 .samba_internal_charset = true
137         },
138 #endif
139 #ifdef DARWINOS
140         {
141                 .name = "MACOSXFS",
142                 .pull = macosxfs_encoding_pull,
143                 .push = macosxfs_encoding_push,
144                 .samba_internal_charset = true
145         },
146 #endif
147         {
148                 .name = "UCS2-HEX",
149                 .pull = ucs2hex_pull,
150                 .push = ucs2hex_push,
151                 .samba_internal_charset = true
152         }
153 };
154
#ifdef HAVE_NATIVE_ICONV
/**
 * Adapter that exposes the system iconv() through the conversion
 * callback signature used in this file.
 *
 * If there was an error then the internal state of the descriptor is
 * reset; this ensures that we don't have a shift state remaining for
 * character sets like SJIS.
 *
 * @param cd            the iconv_t descriptor, passed as an opaque pointer
 * @param inbuf         in/out pointer to the input bytes
 * @param inbytesleft   in/out count of remaining input bytes
 * @param outbuf        in/out pointer to the output buffer
 * @param outbytesleft  in/out count of remaining output space
 *
 * @return whatever iconv() returned; on (size_t)-1 errno is the value
 *         set by the failing conversion
 **/
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);
	if (ret == (size_t)-1) {
		/*
		 * Callers inspect errno (e.g. to tell E2BIG from a real
		 * failure), so make sure the reset call cannot replace
		 * the error code of the conversion that failed.
		 */
		int saved_errno = errno;

		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
170
#ifdef HAVE_ICU_I18N
/**
 * Direct conversion callback that runs an ICU transliterator over a
 * UTF-8 string.
 *
 * The whole input is converted to UTF-16, transliterated in place and
 * converted back to UTF-8 into the caller's buffer.
 *
 * @param cd            the UTransliterator, passed as an opaque pointer
 * @param inbuf         in/out pointer to the UTF-8 input
 * @param inbytesleft   in/out count of remaining input bytes
 * @param outbuf        in/out pointer to the output buffer
 * @param outbytesleft  in/out count of remaining output space
 *
 * @return the number of UTF-8 bytes produced, or (size_t)-1 with errno
 *         set (EILSEQ for malformed text, E2BIG when the output buffer
 *         is too small, EOVERFLOW when an internal limit is exceeded,
 *         EINVAL for other transliteration failures).  On failure the
 *         in/out pointers and counters are left untouched.
 **/
static size_t sys_uconv(void *cd,
			const char **inbuf,
			size_t *inbytesleft,
			char **outbuf,
			size_t *outbytesleft)
{
	UTransliterator *t = (UTransliterator *)cd;
	const size_t src_len = *inbytesleft;
	size_t bufsize;
	UChar *up = NULL;
	char *p = NULL;
	int32_t ustrlen;
	int32_t limit;
	int32_t converted_len = 0;
	size_t outbuf_consumed;
	UErrorCode ue;

	if (src_len == 0) {
		/*
		 * Nothing to do. This also keeps us from declaring a
		 * zero length variable length array below.
		 */
		return 0;
	}

	if (src_len > INT32_MAX / 2) {
		/* The ICU API takes int32_t lengths. */
		errno = EOVERFLOW;
		return -1;
	}

	bufsize = src_len * 2;

	/*
	 * NOTE(review): the scratch buffer lives on the stack and scales
	 * with the input length, as it did before this change.
	 */
	UChar ustr[bufsize];

	/* Convert from UTF8 to UCS2 */
	ue = U_ZERO_ERROR;
	up = u_strFromUTF8(ustr,           /* dst */
			   bufsize,        /* dst buflen */
			   &converted_len, /* dst written */
			   *inbuf,         /* src */
			   src_len,        /* src length */
			   &ue);
	if (up == NULL || U_FAILURE(ue)) {
		errno = EILSEQ;
		return -1;
	}
	if ((size_t)converted_len > bufsize) {
		/*
		 * u_strFromUTF8() returns the required size in
		 * converted_len. In theory this should never overflow as the
		 * ustr[] array is allocated with a size twice as big as
		 * inbytesleft, but you never know...
		 */
		errno = EOVERFLOW;
		return -1;
	}

	/*
	 * The following transliteration function takes two parameters, the
	 * length of the text to be converted (converted_len) and a limit which
	 * may be smaller than converted_len. We just set limit to converted_len
	 * and also ignore the value returned in limit.
	 */
	limit = converted_len;

	/* Inplace transliteration */
	utrans_transUChars(t,
			   ustr,           /* text */
			   &converted_len, /* text length */
			   bufsize,        /* text buflen */
			   0,              /* start */
			   &limit,         /* limit */
			   &ue);
	if (U_FAILURE(ue)) {
		errno = (ue == U_BUFFER_OVERFLOW_ERROR) ? EOVERFLOW : EINVAL;
		return -1;
	}
	if ((size_t)converted_len > bufsize) {
		/*
		 * In theory this should never happen as the ustr[] array is
		 * allocated with a size twice as big as inbytesleft, but you
		 * never know...
		 */
		errno = EOVERFLOW;
		return -1;
	}
	ustrlen = converted_len;

	/* Convert from UCS2 back to UTF8 */
	ue = U_ZERO_ERROR;
	p = u_strToUTF8(*outbuf,        /* dst */
			*outbytesleft,  /* dst buflen */
			&converted_len, /* dst required length */
			ustr,           /* src */
			ustrlen,        /* src length */
			&ue);
	if (p == NULL || U_FAILURE(ue)) {
		errno = (ue == U_BUFFER_OVERFLOW_ERROR) ? E2BIG : EILSEQ;
		return -1;
	}

	outbuf_consumed = converted_len;
	if ((size_t)converted_len > *outbytesleft) {
		/*
		 * The caller's result buffer is too small...
		 */
		outbuf_consumed = *outbytesleft;
	}

	/*
	 * The complete input was handed to u_strFromUTF8(), so account
	 * for all of its bytes; the UTF-16 length is not a byte count of
	 * the UTF-8 source.
	 */
	*inbuf += src_len;
	*inbytesleft -= src_len;
	*outbuf += outbuf_consumed;
	*outbytesleft -= outbuf_consumed;

	return converted_len;
}
#endif
273
274 /**
275  * This is a simple portable iconv() implementation.
276  *
277  * It only knows about a very small number of character sets - just
278  * enough that Samba works on systems that don't have iconv.
279  **/
280 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
281                  const char **inbuf, size_t *inbytesleft,
282                  char **outbuf, size_t *outbytesleft)
283 {
284         /* in many cases we can go direct */
285         if (cd->direct) {
286                 return cd->direct(cd->cd_direct,
287                                   inbuf, inbytesleft, outbuf, outbytesleft);
288         }
289
290         /* otherwise we have to do it chunks at a time */
291         {
292 #ifndef SMB_ICONV_BUFSIZE
293 #define SMB_ICONV_BUFSIZE 2048
294 #endif
295                 size_t bufsize;
296                 char cvtbuf[SMB_ICONV_BUFSIZE];
297
298                 while (*inbytesleft > 0) {
299                         char *bufp1 = cvtbuf;
300                         const char *bufp2 = cvtbuf;
301                         int saved_errno = errno;
302                         bool pull_failed = false;
303                         bufsize = SMB_ICONV_BUFSIZE;
304
305                         if (cd->pull(cd->cd_pull,
306                                      inbuf, inbytesleft, &bufp1, &bufsize) == -1
307                             && errno != E2BIG) {
308                                 saved_errno = errno;
309                                 pull_failed = true;
310                         }
311
312                         bufsize = SMB_ICONV_BUFSIZE - bufsize;
313
314                         if (cd->push(cd->cd_push,
315                                      &bufp2, &bufsize,
316                                      outbuf, outbytesleft) == -1) {
317                                 return -1;
318                         } else if (pull_failed) {
319                                 /* We want the pull errno if possible */
320                                 errno = saved_errno;
321                                 return -1;
322                         }
323                 }
324         }
325
326         return 0;
327 }
328
/*
  return true if the name is one of the little endian UTF-16 names
  ("UCS-2LE" or "UTF-16LE"), compared case-insensitively
 */
static bool is_utf16(const char *name)
{
	static const char *const utf16le_names[] = {
		"UCS-2LE",
		"UTF-16LE",
	};
	size_t i;

	for (i = 0; i < sizeof(utf16le_names) / sizeof(utf16le_names[0]); i++) {
		if (strcasecmp(name, utf16le_names[i]) == 0) {
			return true;
		}
	}

	return false;
}
334
335 static int smb_iconv_t_destructor(smb_iconv_t hwd)
336 {
337 #ifdef HAVE_ICU_I18N
338         /*
339          * This has to come first, as the cd_direct member won't be an iconv
340          * handle and must not be passed to iconv_close().
341          */
342         if (hwd->direct == sys_uconv) {
343                 utrans_close(hwd->cd_direct);
344                 return 0;
345         }
346 #endif
347 #ifdef HAVE_NATIVE_ICONV
348         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
349                 iconv_close(hwd->cd_pull);
350         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
351                 iconv_close(hwd->cd_push);
352         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
353                 iconv_close(hwd->cd_direct);
354 #endif
355
356         return 0;
357 }
358
359 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, 
360                               const char *fromcode, bool use_builtin_handlers)
361 {
362         smb_iconv_t ret;
363         const struct charset_functions *from=NULL, *to=NULL;
364         int i;
365
366         ret = (smb_iconv_t)talloc_named(mem_ctx,
367                                         sizeof(*ret),
368                                         "iconv(%s,%s)", tocode, fromcode);
369         if (!ret) {
370                 errno = ENOMEM;
371                 return (smb_iconv_t)-1;
372         }
373         memset(ret, 0, sizeof(*ret));
374         talloc_set_destructor(ret, smb_iconv_t_destructor);
375
376         /* check for the simplest null conversion */
377         if (strcmp(fromcode, tocode) == 0) {
378                 ret->direct = iconv_copy;
379                 return ret;
380         }
381
382         /* check if we have a builtin function for this conversion */
383         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
384                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
385                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
386                                 from = &builtin_functions[i];
387                         }
388                 }
389                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
390                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
391                                 to = &builtin_functions[i];
392                         }
393                 }
394         }
395
396 #ifdef HAVE_NATIVE_ICONV
397         /* the from and to variables indicate a samba module or
398          * internal conversion, ret->pull and ret->push are
399          * initialised only in this block for iconv based
400          * conversions */
401
402         if (from == NULL) {
403                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
404                 if (ret->cd_pull == (iconv_t)-1)
405                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
406                 if (ret->cd_pull != (iconv_t)-1) {
407                         ret->pull = sys_iconv;
408                 }
409         }
410
411         if (to == NULL) {
412                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
413                 if (ret->cd_push == (iconv_t)-1)
414                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
415                 if (ret->cd_push != (iconv_t)-1) {
416                         ret->push = sys_iconv;
417                 }
418         }
419 #endif
420
421 #ifdef HAVE_ICU_I18N
422         if (strcasecmp(fromcode, "UTF8-NFD") == 0 &&
423             strcasecmp(tocode, "UTF8-NFC") == 0)
424         {
425                 U_STRING_DECL(t, "any-nfc", 7);
426                 UErrorCode ue = 0;
427
428                 U_STRING_INIT(t, "any-nfc", 7);
429
430                 ret->cd_direct = utrans_openU(t,
431                                               strlen("any-nfc"),
432                                               UTRANS_FORWARD,
433                                               NULL,
434                                               0,
435                                               NULL,
436                                               &ue);
437                 if (U_FAILURE(ue)) {
438                         return (smb_iconv_t)-1;
439                 }
440                 ret->direct = sys_uconv;
441                 return ret;
442         }
443
444         if (strcasecmp(fromcode, "UTF8-NFC") == 0 &&
445             strcasecmp(tocode, "UTF8-NFD") == 0)
446         {
447                 U_STRING_DECL(tname, "any-nfd", 7);
448                 UErrorCode ue = 0;
449
450                 U_STRING_INIT(tname, "any-nfd", 7);
451
452                 ret->cd_direct = utrans_openU(tname,
453                                               7,
454                                               UTRANS_FORWARD,
455                                               NULL,
456                                               0,
457                                               NULL,
458                                               &ue);
459                 if (U_FAILURE(ue)) {
460                         return (smb_iconv_t)-1;
461                 }
462                 ret->direct = sys_uconv;
463                 return ret;
464         }
465 #endif
466
467         if (ret->pull == NULL && from == NULL) {
468                 goto failed;
469         }
470
471         if (ret->push == NULL && to == NULL) {
472                 goto failed;
473         }
474
475         /* check for conversion to/from ucs2 */
476         if (is_utf16(fromcode) && to) {
477                 ret->direct = to->push;
478                 return ret;
479         }
480         if (is_utf16(tocode) && from) {
481                 ret->direct = from->pull;
482                 return ret;
483         }
484
485 #ifdef HAVE_NATIVE_ICONV
486         if (is_utf16(fromcode)) {
487                 ret->direct = sys_iconv;
488                 ret->cd_direct = ret->cd_push;
489                 ret->cd_push = NULL;
490                 return ret;
491         }
492         if (is_utf16(tocode)) {
493                 ret->direct = sys_iconv;
494                 ret->cd_direct = ret->cd_pull;
495                 ret->cd_pull = NULL;
496                 return ret;
497         }
498 #endif
499
500         /* the general case has to go via a buffer */
501         if (!ret->pull) ret->pull = from->pull;
502         if (!ret->push) ret->push = to->push;
503         return ret;
504
505 failed:
506         talloc_free(ret);
507         errno = EINVAL;
508         return (smb_iconv_t)-1;
509 }
510
511 /*
512   simple iconv_open() wrapper
513  */
514 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
515 {
516         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
517 }
518
519 /*
520   simple iconv_close() wrapper
521 */
522 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
523 {
524         talloc_free(cd);
525         return 0;
526 }
527
528
/**********************************************************************
 the following functions implement the builtin character sets in Samba
 and also the "test" character sets that are designed to test
 multi-byte character set support for English users
***********************************************************************/
534
/*
  convert 7-bit ASCII to UTF16LE

  Every ASCII byte is also the low byte of the matching UTF16LE code
  unit, the high byte is always zero.

  Fails with EILSEQ on a byte with the high bit set and with E2BIG
  when the output buffer fills up before the input is exhausted.
 */

static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	size_t result = 0;

	for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
		const char ch = src[0];

		if ((ch & 0x7F) != ch) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			result = (size_t)-1;
			break;
		}
		dst[0] = ch;
		dst[1] = 0;
		src += 1;
		dst += 2;
	}

	if (result == 0 && in_left > 0) {
		errno = E2BIG;
		result = (size_t)-1;
	}

	/* report how far we got, also in the error case */
	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	return result;
}
567
/*
  convert UTF16LE to 7-bit ASCII

  A code unit is only representable when its high byte is zero and its
  low byte has the top bit clear; in that case the low byte is the
  ASCII character.

  Fails with EILSEQ on anything else, with EINVAL on a trailing odd
  byte and with E2BIG when the output buffer is too small.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		const char *unit;

		if (*inbytesleft < 2 || *outbytesleft < 1) {
			break;
		}

		unit = *inbuf;
		if ((unit[0] & 0x7F) != unit[0] || unit[1] != 0) {
			/* If this is multi-byte, then it isn't legal ASCII */
			errno = EILSEQ;
			return -1;
		}

		**outbuf = unit[0];
		*inbuf += 2;
		*outbuf += 1;
		*inbytesleft -= 2;
		*outbytesleft -= 1;
	}

	switch (*inbytesleft) {
	case 0:
		return 0;
	case 1:
		/* half a UTF16 code unit is left over */
		errno = EINVAL;
		return -1;
	default:
		errno = E2BIG;
		return -1;
	}
}
606
/*
  convert latin1/ISO-8859-1 to UTF16LE

  Each input byte becomes the low byte of one UTF16LE code unit, the
  high byte is always zero.

  Fails with E2BIG when the output buffer fills up before the input is
  exhausted.
 */
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;

	while (in_left >= 1 && out_left >= 2) {
		dst[0] = src[0];
		dst[1] = 0;
		src++;
		dst += 2;
		in_left--;
		out_left -= 2;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
633
/*
  convert UTF16LE to latin1/ISO-8859-1

  A code unit is representable when its high byte is zero; the low
  byte is then the latin1 character.

  Fails with EILSEQ on a code unit with a non-zero high byte, with
  EINVAL on a trailing odd byte and with E2BIG when the output buffer
  is too small.
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		const char *unit;

		if (*inbytesleft < 2 || *outbytesleft < 1) {
			break;
		}

		unit = *inbuf;

		/* the low byte is stored before the high byte is checked */
		**outbuf = unit[0];
		if (unit[1] != 0) {
			/* If this is multi-byte, then it isn't legal latin1 */
			errno = EILSEQ;
			return -1;
		}

		*inbuf += 2;
		*outbuf += 1;
		*inbytesleft -= 2;
		*outbytesleft -= 1;
	}

	switch (*inbytesleft) {
	case 0:
		return 0;
	case 1:
		/* half a UTF16 code unit is left over */
		errno = EINVAL;
		return -1;
	default:
		errno = E2BIG;
		return -1;
	}
}
671
/*
  convert the UCS2-HEX test charset to UTF16LE

  Any byte other than '@' is copied into the low byte of a code unit.
  An '@' must be followed by two hex bytes (decoded with hex_byte()),
  the first one giving the high byte and the second one the low byte
  of the code unit.

  Fails with EINVAL on a truncated '@' sequence, with EILSEQ on bad
  hex digits and with E2BIG when the output buffer is too small.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		const char *src = *inbuf;
		char *dst = *outbuf;
		size_t used;

		if (src[0] == '@') {
			/* it's a hex character */
			uint8_t hi = 0, lo = 0;

			if (*inbytesleft < 5) {
				errno = EINVAL;
				return -1;
			}

			if (!hex_byte(&src[1], &hi) ||
			    !hex_byte(&src[3], &lo)) {
				errno = EILSEQ;
				return -1;
			}

			dst[0] = lo;
			dst[1] = hi;
			used = 5;
		} else {
			/* seven bit ascii case */
			dst[0] = src[0];
			dst[1] = 0;
			used = 1;
		}

		*inbuf += used;
		*outbuf += 2;
		*inbytesleft -= used;
		*outbytesleft -= 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
716
/*
  convert UTF16LE to the UCS2-HEX test charset

  A code unit with a zero high byte and a 7-bit low byte other than
  '@' is written as that single byte.  Everything else is written as
  '@' followed by four lower case hex digits.

  Fails with EINVAL on a trailing odd byte and with E2BIG when the
  output buffer is too small.
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *src = *inbuf;
		size_t produced;
		bool plain;

		plain = (src[1] == 0 &&
			 (src[0] & 0x80) == 0 &&
			 src[0] != '@');

		if (plain) {
			(*outbuf)[0] = src[0];
			produced = 1;
		} else {
			char hex[6];

			if (*outbytesleft < 5) {
				errno = E2BIG;
				return -1;
			}
			snprintf(hex, 6, "@%04x", SVAL(*inbuf, 0));
			/* the terminating nul is not part of the output */
			memcpy(*outbuf, hex, 5);
			produced = 5;
		}

		*inbuf += 2;
		*outbuf += produced;
		*inbytesleft -= 2;
		*outbytesleft -= produced;
	}

	switch (*inbytesleft) {
	case 0:
		return 0;
	case 1:
		/* half a UTF16 code unit is left over */
		errno = EINVAL;
		return -1;
	default:
		errno = E2BIG;
		return -1;
	}
}
757
/*
  convert between the little and big endian forms of UTF16/UCS2 by
  swapping the bytes of each code unit

  As many bytes as fit into the output are converted.  If that count
  is odd, the last output byte is set to zero and the odd input byte
  is consumed.

  Returns 0 when all input was consumed, otherwise (size_t)-1 with
  errno set to E2BIG.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	/*
	 * Keep the count in a size_t: an int would be truncated (or go
	 * negative) for buffers larger than INT_MAX and corrupt the
	 * pointer and counter updates below.
	 */
	size_t n = (*inbytesleft < *outbytesleft) ?
		*inbytesleft : *outbytesleft;

	swab(*inbuf, *outbuf, (ssize_t)(n & ~(size_t)1));
	if (n & 1) {
		(*outbuf)[n-1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
782
783
/*
  the null conversion: copy bytes from the input to the output

  As many bytes as fit into the output are copied; memmove() is used,
  so the buffers may overlap.

  Returns 0 when all input was consumed, otherwise (size_t)-1 with
  errno set to E2BIG.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	/*
	 * Keep the count in a size_t: an int would be truncated (or go
	 * negative) for buffers larger than INT_MAX, which would turn
	 * into a bogus length for memmove().
	 */
	size_t n = (*inbytesleft < *outbytesleft) ?
		*inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
805
806 /*
807   this takes a UTF8 sequence and produces a UTF16 sequence
808  */
809 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
810                          char **outbuf, size_t *outbytesleft)
811 {
812         size_t in_left=*inbytesleft, out_left=*outbytesleft;
813         const uint8_t *c = (const uint8_t *)*inbuf;
814         uint8_t *uc = (uint8_t *)*outbuf;
815
816         while (in_left >= 1 && out_left >= 2) {
817                 if ((c[0] & 0x80) == 0) {
818                         uc[0] = c[0];
819                         uc[1] = 0;
820                         c  += 1;
821                         in_left  -= 1;
822                         out_left -= 2;
823                         uc += 2;
824                         continue;
825                 }
826
827                 if ((c[0] & 0xe0) == 0xc0) {
828                         if (in_left < 2 ||
829                             (c[1] & 0xc0) != 0x80) {
830                                 errno = EILSEQ;
831                                 goto error;
832                         }
833                         uc[1] = (c[0]>>2) & 0x7;
834                         uc[0] = (c[0]<<6) | (c[1]&0x3f);
835                         if (uc[1] == 0 && uc[0] < 0x80) {
836                                 /* this should have been a single byte */
837                                 errno = EILSEQ;
838                                 goto error;
839                         }
840                         c  += 2;
841                         in_left  -= 2;
842                         out_left -= 2;
843                         uc += 2;
844                         continue;
845                 }
846
847                 if ((c[0] & 0xf0) == 0xe0) {
848                         unsigned int codepoint;
849                         if (in_left < 3 ||
850                             (c[1] & 0xc0) != 0x80 ||
851                             (c[2] & 0xc0) != 0x80) {
852                                 errno = EILSEQ;
853                                 goto error;
854                         }
855                         codepoint = ((c[2] & 0x3f)        |
856                                      ((c[1] & 0x3f) << 6) |
857                                      ((c[0] & 0x0f) << 12));
858
859                         if (codepoint < 0x800) {
860                                 /* this should be a 1 or 2 byte sequence */
861                                 errno = EILSEQ;
862                                 goto error;
863                         }
864                         if (codepoint >= 0xd800 && codepoint <= 0xdfff) {
865                                 /*
866                                  * This is an invalid codepoint, per
867                                  * RFC3629, as it encodes part of a
868                                  * UTF-16 surrogate pair for a
869                                  * character over U+10000, which ought
870                                  * to have been encoded as a four byte
871                                  * utf-8 sequence.
872                                  *
873                                  * Prior to Vista, Windows might
874                                  * sometimes produce invalid strings
875                                  * where a utf-16 sequence containing
876                                  * surrogate pairs was converted
877                                  * "verbatim" into utf-8, instead of
878                                  * encoding the actual codepoint. This
879                                  * format is sometimes called "WTF-8".
880                                  *
881                                  * If we were to support that, we'd
882                                  * have a branch here for the case
883                                  * where the codepoint is between
884                                  * 0xd800 and 0xdbff (a "high
885                                  * surrogate"), and read a *six*
886                                  * character sequence from there which
887                                  * would include a low surrogate. But
888                                  * that would undermine the
889                                  * hard-learnt principle that each
890                                  * character should only have one
891                                  * encoding.
892                                  */
893                                 errno = EILSEQ;
894                                 goto error;
895                         }
896
897                         uc[0] = codepoint & 0xff;
898                         uc[1] = codepoint >> 8;
899                         c  += 3;
900                         in_left  -= 3;
901                         out_left -= 2;
902                         uc += 2;
903                         continue;
904                 }
905
906                 if ((c[0] & 0xf8) == 0xf0) {
907                         unsigned int codepoint;
908                         if (in_left < 4 ||
909                             (c[1] & 0xc0) != 0x80 ||
910                             (c[2] & 0xc0) != 0x80 ||
911                             (c[3] & 0xc0) != 0x80) {
912                                 errno = EILSEQ;
913                                 goto error;
914                         }
915                         codepoint =
916                                 (c[3]&0x3f) |
917                                 ((c[2]&0x3f)<<6) |
918                                 ((c[1]&0x3f)<<12) |
919                                 ((c[0]&0x7)<<18);
920                         if (codepoint < 0x10000) {
921                                 /* reject UTF-8 characters that are not
922                                    minimally packed */
923                                 errno = EILSEQ;
924                                 goto error;
925                         }
926                         if (codepoint > 0x10ffff) {
927                                 /*
928                                  * Unicode stops at 0x10ffff, and if
929                                  * we ignore that, we'll end up
930                                  * encoding the wrong characters in
931                                  * the surrogate pair.
932                                  */
933                                 errno = EILSEQ;
934                                 goto error;
935                         }
936
937                         codepoint -= 0x10000;
938
939                         if (out_left < 4) {
940                                 errno = E2BIG;
941                                 goto error;
942                         }
943
944                         uc[0] = (codepoint>>10) & 0xFF;
945                         uc[1] = (codepoint>>18) | 0xd8;
946                         uc[2] = codepoint & 0xFF;
947                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
948                         c  += 4;
949                         in_left  -= 4;
950                         out_left -= 4;
951                         uc += 4;
952                         continue;
953                 }
954
955                 /* we don't handle 5 byte sequences */
956                 errno = EINVAL;
957                 goto error;
958         }
959
960         if (in_left > 0) {
961                 errno = E2BIG;
962                 goto error;
963         }
964
965         *inbytesleft = in_left;
966         *outbytesleft = out_left;
967         *inbuf = (const char *)c;
968         *outbuf = (char *)uc;
969         return 0;
970
971 error:
972         *inbytesleft = in_left;
973         *outbytesleft = out_left;
974         *inbuf = (const char *)c;
975         *outbuf = (char *)uc;
976         return -1;
977 }
978
979
/*
  utf8_push: convert a UTF16 sequence (read low byte first) into UTF8.

  On entry *inbuf / *inbytesleft describe the UTF16 input and
  *outbuf / *outbytesleft the space available for UTF8 output. On
  return all four are updated to reflect what was consumed and
  produced, whether or not the conversion succeeded.

  Returns 0 when all the input was converted, or (size_t)-1 with errno
  set to:

    E2BIG   the output buffer cannot hold the next character
    EINVAL  the input stops part way through a character (a trailing
            odd byte, or a high surrogate with nothing after it)
    EILSEQ  a surrogate is not part of a valid high/low pair

  The cd argument is not used.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft)
{
        const uint8_t *src = (const uint8_t *)*inbuf;
        uint8_t *dst = (uint8_t *)*outbuf;
        size_t src_left = *inbytesleft;
        size_t dst_left = *outbytesleft;
        size_t result = (size_t)-1;

        while (src_left >= 2 && dst_left >= 1) {
                /* the next 16 bit unit, stored low byte first */
                const unsigned int unit = src[0] | (src[1] << 8);
                unsigned int trail;
                unsigned int codepoint;

                if (unit < 0x80) {
                        /* ASCII: one byte in UTF8 */
                        dst[0] = unit;
                        src      += 2;
                        src_left -= 2;
                        dst      += 1;
                        dst_left -= 1;
                        continue;
                }

                if (unit < 0x800) {
                        /* up to 11 bits: two bytes in UTF8 */
                        if (dst_left < 2) {
                                errno = E2BIG;
                                goto done;
                        }
                        dst[0] = 0xc0 | (unit >> 6);
                        dst[1] = 0x80 | (unit & 0x3f);
                        src      += 2;
                        src_left -= 2;
                        dst      += 2;
                        dst_left -= 2;
                        continue;
                }

                if ((unit & 0xfc00) == 0xdc00) {
                        /*
                         * A low surrogate that was not introduced by
                         * a high surrogate.
                         */
#ifdef HAVE_ICONV_ERRNO_ILLEGAL_MULTIBYTE
                        errno = EILSEQ;
#else
                        errno = (src_left < 4) ? EINVAL : EILSEQ;
#endif
                        goto done;
                }

                if ((unit & 0xfc00) != 0xd800) {
                        /* any other non-surrogate: three bytes in UTF8 */
                        if (dst_left < 3) {
                                errno = E2BIG;
                                goto done;
                        }
                        dst[0] = 0xe0 | (unit >> 12);
                        dst[1] = 0x80 | ((unit >> 6) & 0x3f);
                        dst[2] = 0x80 | (unit & 0x3f);
                        src      += 2;
                        src_left -= 2;
                        dst      += 3;
                        dst_left -= 3;
                        continue;
                }

                /*
                 * What remains is a high surrogate, which needs a low
                 * surrogate straight after it to make a character.
                 */
                if (src_left < 4) {
                        errno = EINVAL;
                        goto done;
                }
                trail = src[2] | (src[3] << 8);
                if ((trail & 0xfc00) != 0xdc00) {
                        errno = EILSEQ;
                        goto done;
                }

                /* each surrogate contributes its low ten bits */
                codepoint = 0x10000 +
                        (((unit & 0x3ff) << 10) | (trail & 0x3ff));

                if (dst_left < 4) {
                        errno = E2BIG;
                        goto done;
                }
                dst[0] = 0xf0 | (codepoint >> 18);
                dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
                dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
                dst[3] = 0x80 | (codepoint & 0x3f);
                src      += 4;
                src_left -= 4;
                dst      += 4;
                dst_left -= 4;
        }

        if (src_left == 0) {
                result = 0;
        } else {
                /*
                 * A single left over byte is an incomplete unit;
                 * anything more means we ran out of output space.
                 */
                errno = (src_left == 1) ? EINVAL : E2BIG;
        }

done:
        /* report progress to the caller on success and failure alike */
        *inbytesleft = src_left;
        *outbytesleft = dst_left;
        *inbuf  = (const char *)src;
        *outbuf = (char *)dst;
        return result;
}
1096
1097
/*
  utf16_munged_pull: take a "munged" UTF16 sequence (read low byte
  first), clean it up according to the string2key rules, and produce a
  UTF16 sequence.

  The rules are:

    1) any 0x0000 character is mapped to 0x0001

    2) any 0xD800 - 0xDBFF (high surrogate) that is not immediately
       followed by a 0xDC00 - 0xDFFF (low surrogate) is converted to
       U+FFFD (REPLACEMENT CHARACTER)

    3) the same for any low surrogate that was not preceded by a high
       surrogate

  A well formed surrogate pair is copied through unchanged.

  *inbuf, *inbytesleft, *outbuf and *outbytesleft are updated to show
  how far the conversion got, on failure as well as on success.

  Returns 0 when all the input was converted, or (size_t)-1 with errno
  set to E2BIG (not enough output space) or EINVAL (a trailing odd
  byte of input). The cd argument is not used.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                                char **outbuf, size_t *outbytesleft)
{
        const uint8_t *src = (const uint8_t *)*inbuf;
        uint8_t *dst = (uint8_t *)*outbuf;
        size_t src_left = *inbytesleft;
        size_t dst_left = *outbytesleft;
        size_t result = (size_t)-1;

        while (src_left >= 2 && dst_left >= 2) {
                unsigned int unit = src[0] | (src[1] << 8);

                if ((unit & 0xfc00) == 0xd800 && src_left >= 4) {
                        /* a high surrogate with something after it */
                        const unsigned int next = src[2] | (src[3] << 8);

                        if ((next & 0xfc00) == 0xdc00) {
                                /* a proper pair: pass it on untouched */
                                if (dst_left < 4) {
                                        errno = E2BIG;
                                        goto done;
                                }
                                memcpy(dst, src, 4);
                                src      += 4;
                                src_left -= 4;
                                dst      += 4;
                                dst_left -= 4;
                                continue;
                        }
                }

                if (unit == 0) {
                        /* rule 1: no embedded zero characters */
                        unit = 1;
                } else if ((unit & 0xf800) == 0xd800) {
                        /*
                         * rules 2 and 3: any surrogate that gets this
                         * far is not half of a valid pair
                         */
                        unit = 0xfffd;
                }

                dst[0] = unit & 0xFF;
                dst[1] = (unit >> 8) & 0xFF;
                src      += 2;
                src_left -= 2;
                dst      += 2;
                dst_left -= 2;
        }

        if (src_left == 0) {
                result = 0;
        } else {
                /*
                 * A single left over byte is an incomplete unit;
                 * anything more means we ran out of output space.
                 */
                errno = (src_left == 1) ? EINVAL : E2BIG;
        }

done:
        /* report progress to the caller on success and failure alike */
        *inbytesleft = src_left;
        *outbytesleft = dst_left;
        *inbuf  = (const char *)src;
        *outbuf = (char *)dst;
        return result;
}
1194
1195
1196