charconv: Use talloc_tos() in the S3 build
[mdw/samba.git] / lib / util / charset / iconv.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "includes.h"
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
25
26 #ifdef strcasecmp
27 #undef strcasecmp
28 #endif
29
30 #ifdef static_decl_charset
31 static_decl_charset;
32 #endif
33
34 /**
35  * @file
36  *
37  * @brief Samba wrapper/stub for iconv character set conversion.
38  *
39  * iconv is the XPG2 interface for converting between character
40  * encodings.  This file provides a Samba wrapper around it, and also
41  * a simple reimplementation that is used if the system does not
42  * implement iconv.
43  *
44  * Samba only works with encodings that are supersets of ASCII: ascii
45  * characters like whitespace can be tested for directly, multibyte
46  * sequences start with a byte with the high bit set, and strings are
47  * terminated by a nul byte.
48  *
49  * Note that the only function provided by iconv is conversion between
50  * characters.  It doesn't directly support operations like
51  * uppercasing or comparison.  We have to convert to UTF-16LE and
52  * compare there.
53  *
54  * @sa Samba Developers Guide
55  **/
56
/*
 * Forward declarations of the built-in converters.  Every converter shares
 * the same iconv()-like signature: an opaque descriptor, an input cursor and
 * remaining input byte count, and an output cursor and remaining output
 * byte count.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                                char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
67
68 static const struct charset_functions builtin_functions[] = {
69         /* windows is closest to UTF-16 */
70         {"UCS-2LE",  iconv_copy, iconv_copy},
71         {"UTF-16LE",  iconv_copy, iconv_copy},
72         {"UCS-2BE",  iconv_swab, iconv_swab},
73         {"UTF-16BE",  iconv_swab, iconv_swab},
74
75         /* we include the UTF-8 alias to cope with differing locale settings */
76         {"UTF8",   utf8_pull,  utf8_push},
77         {"UTF-8",   utf8_pull,  utf8_push},
78
79         /* this handles the munging needed for String2Key */
80         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy},
81
82         {"ASCII", ascii_pull, ascii_push},
83         {"646", ascii_pull, ascii_push},
84         {"ISO-8859-1", ascii_pull, latin1_push},
85         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
86 };
87
88 static struct charset_functions *charsets = NULL;
89
90 static struct charset_functions *find_charset_functions(const char *name)
91 {
92         struct charset_functions *c;
93
94         /* Check whether we already have this charset... */
95         for (c = charsets; c != NULL; c = c->next) {
96                 if(strcasecmp(c->name, name) == 0) { 
97                         return c;
98                 }
99                 c = c->next;
100         }
101
102         return NULL;
103 }
104
105 bool smb_register_charset(const struct charset_functions *funcs_in)
106 {
107         struct charset_functions *funcs;
108
109         DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name));
110         /* Check whether we already have this charset... */
111         if (find_charset_functions(funcs_in->name)) {
112                 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name));
113                 return false;
114         }
115
116         funcs = talloc(NULL, struct charset_functions);
117         if (!funcs) {
118                 DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name));
119                 return false;
120         }
121         *funcs = *funcs_in;
122
123         funcs->next = funcs->prev = NULL;
124         DEBUG(5, ("Registered charset %s\n", funcs->name));
125         DLIST_ADD(charsets, funcs);
126         return true;
127 }
128
/**
 * Run the static_init_charset hook the first time this function is called.
 *
 * When static_init_charset is not defined this function does nothing.
 */
static void lazy_initialize_iconv(void)
{
#ifdef static_init_charset
        static bool done = false;

        if (done) {
                return;
        }

        static_init_charset;
        done = true;
#endif
}
140
#ifdef HAVE_NATIVE_ICONV
/**
 * Adapter that gives the system iconv() the converter signature used by
 * the rest of this file.
 *
 * If there was an error then the internal state of the descriptor is reset;
 * this ensures that we don't have a shift state remaining for character
 * sets like SJIS.  The errno value set by the failed conversion is saved
 * and restored around the reset call so that callers still see the real
 * reason for the failure (E2BIG, EILSEQ, EINVAL, ...).
 *
 * @return whatever iconv() returned for the conversion call.
 */
static size_t sys_iconv(void *cd,
                        const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft)
{
        size_t ret = iconv((iconv_t)cd,
                           discard_const_p(char *, inbuf), inbytesleft,
                           outbuf, outbytesleft);
        if (ret == (size_t)-1) {
                int saved_errno = errno;

                iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
                errno = saved_errno;
        }
        return ret;
}
#endif
156
157 /**
158  * This is a simple portable iconv() implementaion.
159  *
160  * It only knows about a very small number of character sets - just
161  * enough that Samba works on systems that don't have iconv.
162  **/
163 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd, 
164                  const char **inbuf, size_t *inbytesleft,
165                  char **outbuf, size_t *outbytesleft)
166 {
167         /* in many cases we can go direct */
168         if (cd->direct) {
169                 return cd->direct(cd->cd_direct, 
170                                   inbuf, inbytesleft, outbuf, outbytesleft);
171         }
172
173         /* otherwise we have to do it chunks at a time */
174         {
175 #ifndef SMB_ICONV_BUFSIZE
176 #define SMB_ICONV_BUFSIZE 2048
177 #endif
178                 TALLOC_CTX *mem_ctx;
179                 size_t bufsize;
180                 char *cvtbuf;
181
182 #if _SAMBA_BUILD_ == 3
183                 mem_ctx = talloc_tos();
184 #else
185                 mem_ctx = cd;
186 #endif
187                 cvtbuf = talloc_array(mem_ctx, char, SMB_ICONV_BUFSIZE);
188
189                 if (!cvtbuf) {
190                         return (size_t)-1;
191                 }
192
193                 while (*inbytesleft > 0) {
194                         char *bufp1 = cvtbuf;
195                         const char *bufp2 = cvtbuf;
196
197                         bufsize = SMB_ICONV_BUFSIZE;
198
199                         if (cd->pull(cd->cd_pull,
200                                      inbuf, inbytesleft, &bufp1, &bufsize) == -1
201                             && errno != E2BIG) {
202                                 talloc_free(cvtbuf);
203                                 return -1;
204                         }
205
206                         bufsize = SMB_ICONV_BUFSIZE - bufsize;
207
208                         if (cd->push(cd->cd_push,
209                                      &bufp2, &bufsize,
210                                      outbuf, outbytesleft) == -1) {
211                                 talloc_free(cvtbuf);
212                                 return -1;
213                         }
214                 }
215                 talloc_free(cvtbuf);
216         }
217
218         return 0;
219 }
220
/**
 * Report whether a charset name denotes little-endian UTF-16 as handled
 * natively by this file ("UCS-2LE" or "UTF-16LE", compared
 * case-insensitively).
 */
static bool is_utf16(const char *name)
{
        if (strcasecmp(name, "UCS-2LE") == 0) {
                return true;
        }
        return strcasecmp(name, "UTF-16LE") == 0;
}
226
227 static int smb_iconv_t_destructor(smb_iconv_t hwd)
228 {
229 #ifdef HAVE_NATIVE_ICONV
230         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
231                 iconv_close(hwd->cd_pull);
232         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
233                 iconv_close(hwd->cd_push);
234         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
235                 iconv_close(hwd->cd_direct);
236 #endif
237
238         return 0;
239 }
240
241 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, 
242                               const char *fromcode, bool native_iconv)
243 {
244         smb_iconv_t ret;
245         const struct charset_functions *from=NULL, *to=NULL;
246         int i;
247
248         lazy_initialize_iconv();
249
250         ret = (smb_iconv_t)talloc_named(mem_ctx,
251                                         sizeof(*ret), 
252                                         "iconv(%s,%s)", tocode, fromcode);
253         if (!ret) {
254                 errno = ENOMEM;
255                 return (smb_iconv_t)-1;
256         }
257         memset(ret, 0, sizeof(*ret));
258         talloc_set_destructor(ret, smb_iconv_t_destructor);
259
260         /* check for the simplest null conversion */
261         if (strcmp(fromcode, tocode) == 0) {
262                 ret->direct = iconv_copy;
263                 return ret;
264         }
265
266         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
267                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
268                         from = &builtin_functions[i];
269                 }
270                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
271                         to = &builtin_functions[i];
272                 }
273         }
274
275         if (from == NULL) {
276                 for (from=charsets; from; from=from->next) {
277                         if (strcasecmp(from->name, fromcode) == 0) break;
278                 }
279         }
280
281         if (to == NULL) {
282                 for (to=charsets; to; to=to->next) {
283                         if (strcasecmp(to->name, tocode) == 0) break;
284                 }
285         }
286
287 #ifdef HAVE_NATIVE_ICONV
288         if ((!from || !to) && !native_iconv) {
289                 goto failed;
290         }
291         if (!from) {
292                 ret->pull = sys_iconv;
293                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
294                 if (ret->cd_pull == (iconv_t)-1)
295                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
296                 if (ret->cd_pull == (iconv_t)-1) goto failed;
297         }
298
299         if (!to) {
300                 ret->push = sys_iconv;
301                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
302                 if (ret->cd_push == (iconv_t)-1)
303                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
304                 if (ret->cd_push == (iconv_t)-1) goto failed;
305         }
306 #else
307         if (!from || !to) {
308                 goto failed;
309         }
310 #endif
311
312         /* check for conversion to/from ucs2 */
313         if (is_utf16(fromcode) && to) {
314                 ret->direct = to->push;
315                 return ret;
316         }
317         if (is_utf16(tocode) && from) {
318                 ret->direct = from->pull;
319                 return ret;
320         }
321
322 #ifdef HAVE_NATIVE_ICONV
323         if (is_utf16(fromcode)) {
324                 ret->direct = sys_iconv;
325                 ret->cd_direct = ret->cd_push;
326                 ret->cd_push = NULL;
327                 return ret;
328         }
329         if (is_utf16(tocode)) {
330                 ret->direct = sys_iconv;
331                 ret->cd_direct = ret->cd_pull;
332                 ret->cd_pull = NULL;
333                 return ret;
334         }
335 #endif
336
337         /* the general case has to go via a buffer */
338         if (!ret->pull) ret->pull = from->pull;
339         if (!ret->push) ret->push = to->push;
340         return ret;
341
342 failed:
343         talloc_free(ret);
344         errno = EINVAL;
345         return (smb_iconv_t)-1;
346 }
347
348 /*
349   simple iconv_open() wrapper
350  */
351 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
352 {
353         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
354 }
355
356 /*
357   simple iconv_close() wrapper
358 */
359 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
360 {
361         talloc_free(cd);
362         return 0;
363 }
364
365
366 /**********************************************************************
367  the following functions implement the builtin character sets in Samba
368  and also the "test" character sets that are designed to test
369  multi-byte character set support for english users
370 ***********************************************************************/
/*
  Widen single-byte input to two-byte units: every input byte is copied to
  the low output byte and followed by a zero high byte.

  Returns 0 once all input is consumed; otherwise returns -1 with errno
  set to E2BIG (the output space ran out first).  The cursors and counters
  are advanced past everything that was converted.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t src_left = *inbytesleft;
        size_t dst_left = *outbytesleft;

        for (; src_left >= 1 && dst_left >= 2; src_left -= 1, dst_left -= 2) {
                dst[0] = src[0];
                dst[1] = 0;
                src += 1;
                dst += 2;
        }

        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = src_left;
        *outbytesleft = dst_left;

        if (src_left > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
390
/*
  Narrow two-byte units to 7-bit bytes: the low byte of each unit is
  masked with 0x7F and written out; units whose high byte is non-zero are
  counted.

  Returns that count when all input is consumed.  Returns -1 with errno
  set to EINVAL when a single trailing input byte is left over, or E2BIG
  when the output space ran out with at least one whole unit remaining.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t src_left = *inbytesleft;
        size_t dst_left = *outbytesleft;
        int lossy = 0;

        while (src_left >= 2 && dst_left >= 1) {
                *dst = src[0] & 0x7F;
                if (src[1]) {
                        lossy++;
                }
                src += 2;
                dst += 1;
                src_left -= 2;
                dst_left -= 1;
        }

        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = src_left;
        *outbytesleft = dst_left;

        if (src_left == 1) {
                errno = EINVAL;
                return -1;
        }

        if (src_left > 1) {
                errno = E2BIG;
                return -1;
        }

        return lossy;
}
417
/*
  Narrow two-byte units to single bytes: the low byte of each unit is
  written out unchanged; units whose high byte is non-zero are counted.

  Returns that count when all input is consumed.  Returns -1 with errno
  set to EINVAL when a single trailing input byte is left over, or E2BIG
  when the output space ran out with at least one whole unit remaining.
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t src_left = *inbytesleft;
        size_t dst_left = *outbytesleft;
        int lossy = 0;

        for (; src_left >= 2 && dst_left >= 1; src_left -= 2, dst_left -= 1) {
                *dst++ = src[0];
                if (src[1] != 0) {
                        lossy += 1;
                }
                src += 2;
        }

        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = src_left;
        *outbytesleft = dst_left;

        if (src_left == 1) {
                errno = EINVAL;
                return -1;
        }

        if (src_left > 1) {
                errno = E2BIG;
                return -1;
        }

        return lossy;
}
444
/* Decode one ASCII hexadecimal digit; returns -1 if ch is not one. */
static int ucs2hex_digit(char ch)
{
        if (ch >= '0' && ch <= '9') {
                return ch - '0';
        }
        if (ch >= 'a' && ch <= 'f') {
                return ch - 'a' + 10;
        }
        if (ch >= 'A' && ch <= 'F') {
                return ch - 'A' + 10;
        }
        return -1;
}

/*
  Convert the "UCS2-HEX" escape format to two-byte little-endian units.

  Any byte other than '@' is copied to the low output byte with a zero
  high byte.  An '@' must be followed by exactly four hexadecimal digits,
  which give the 16-bit value of the unit.

  The digits are parsed by hand rather than with sscanf(): the input is
  not required to be NUL-terminated, and sscanf("%04x") would skip
  whitespace and accept signs, "0x" prefixes and short digit runs, so it
  could look beyond the five bytes validated here and accept malformed
  escapes.

  Returns 0 when all input is consumed.  Returns -1 with errno set to
  EINVAL (fewer than five bytes left for an escape), EILSEQ (a non-hex
  character inside an escape) or E2BIG (output space ran out).
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                unsigned int v = 0;
                int i;

                if ((*inbuf)[0] != '@') {
                        /* seven bit ascii case */
                        (*outbuf)[0] = (*inbuf)[0];
                        (*outbuf)[1] = 0;
                        (*inbytesleft)  -= 1;
                        (*outbytesleft) -= 2;
                        (*inbuf)  += 1;
                        (*outbuf) += 2;
                        continue;
                }
                /* it's a hex character */
                if (*inbytesleft < 5) {
                        errno = EINVAL;
                        return -1;
                }

                /* exactly four hex digits follow the '@' */
                for (i = 1; i <= 4; i++) {
                        int digit = ucs2hex_digit((*inbuf)[i]);

                        if (digit < 0) {
                                errno = EILSEQ;
                                return -1;
                        }
                        v = (v << 4) | (unsigned int)digit;
                }

                (*outbuf)[0] = v&0xff;
                (*outbuf)[1] = v>>8;
                (*inbytesleft)  -= 5;
                (*outbytesleft) -= 2;
                (*inbuf)  += 5;
                (*outbuf) += 2;
        }

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
487
/*
  Convert two-byte units to the "UCS2-HEX" escape format.

  A unit whose high byte is zero and whose low byte is below 0x80 and not
  '@' is written as that single byte.  Every other unit is written as '@'
  followed by four lower-case hexadecimal digits of SVAL(*inbuf, 0).

  Returns 0 when all input is consumed.  Returns -1 with errno set to
  E2BIG (not enough output space) or EINVAL (a single trailing input byte
  is left over).
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                const char low = (*inbuf)[0];
                const char high = (*inbuf)[1];
                char escape[6];
                size_t produced;

                if (high == 0 && (low & 0x80) == 0 && low != '@') {
                        /* plain character: passes through as one byte */
                        (*outbuf)[0] = low;
                        produced = 1;
                } else {
                        /* needs the five byte "@xxxx" form */
                        if (*outbytesleft < 5) {
                                errno = E2BIG;
                                return -1;
                        }
                        snprintf(escape, sizeof(escape), "@%04x", SVAL(*inbuf, 0));
                        memcpy(*outbuf, escape, 5);
                        produced = 5;
                }

                (*inbuf)  += 2;
                (*inbytesleft)  -= 2;
                (*outbuf) += produced;
                (*outbytesleft) -= produced;
        }

        if (*inbytesleft == 1) {
                errno = EINVAL;
                return -1;
        }

        if (*inbytesleft > 1) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
528
/*
  Byte-swap two-byte units from the input into the output.

  As many bytes as fit in both buffers are processed.  Whole units are
  swapped with swab(); if the processed length is odd, the final output
  byte is set to zero.  The length is kept in a size_t so that large
  buffer sizes are not truncated.

  Returns 0 when all input is consumed, otherwise -1 with errno set to
  E2BIG.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t n = *inbytesleft;

        if (*outbytesleft < n) {
                n = *outbytesleft;
        }

        swab(*inbuf, *outbuf, n & ~(size_t)1);
        if (n & 1) {
                (*outbuf)[n-1] = 0;
        }

        (*inbytesleft) -= n;
        (*outbytesleft) -= n;
        (*inbuf) += n;
        (*outbuf) += n;

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
553
554
/*
  Copy bytes from the input to the output without conversion.

  As many bytes as fit in both buffers are copied (memmove, so the
  buffers may overlap).  The length is kept in a size_t so that large
  buffer sizes are not truncated before being handed to memmove().

  Returns 0 when all input is consumed, otherwise -1 with errno set to
  E2BIG.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t n = *inbytesleft;

        if (*outbytesleft < n) {
                n = *outbytesleft;
        }

        memmove(*outbuf, *inbuf, n);

        (*inbytesleft) -= n;
        (*outbytesleft) -= n;
        (*inbuf) += n;
        (*outbuf) += n;

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
576
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  One to four byte UTF-8 sequences are decoded into little-endian two-byte
  units; values of 0x10000 and above are written as a surrogate pair.
  Four-byte sequences that encode a value below 0x10000 are accepted and
  written as a single unit.  Four-byte sequences that encode a value above
  0x10FFFF are rejected: such values cannot be represented as a surrogate
  pair.

  Returns 0 when all input is consumed.  Returns -1 with errno set to
  EILSEQ (bad continuation byte, truncated sequence, or value above
  0x10FFFF), EINVAL (a lead byte this function does not handle) or E2BIG
  (output space ran out).  In every case the cursors and counters are
  left just after the last fully converted character.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t in_left=*inbytesleft, out_left=*outbytesleft;
        const uint8_t *c = (const uint8_t *)*inbuf;
        uint8_t *uc = (uint8_t *)*outbuf;
        size_t ret = 0;

        while (in_left >= 1 && out_left >= 2) {
                if ((c[0] & 0x80) == 0) {
                        /* plain 7-bit character */
                        uc[0] = c[0];
                        uc[1] = 0;
                        c  += 1;
                        in_left  -= 1;
                        out_left -= 2;
                        uc += 2;
                        continue;
                }

                if ((c[0] & 0xe0) == 0xc0) {
                        /* two byte sequence: 11 bits of payload */
                        if (in_left < 2 ||
                            (c[1] & 0xc0) != 0x80) {
                                errno = EILSEQ;
                                goto error;
                        }
                        uc[1] = (c[0]>>2) & 0x7;
                        uc[0] = (c[0]<<6) | (c[1]&0x3f);
                        c  += 2;
                        in_left  -= 2;
                        out_left -= 2;
                        uc += 2;
                        continue;
                }

                if ((c[0] & 0xf0) == 0xe0) {
                        /* three byte sequence: 16 bits of payload */
                        if (in_left < 3 ||
                            (c[1] & 0xc0) != 0x80 || 
                            (c[2] & 0xc0) != 0x80) {
                                errno = EILSEQ;
                                goto error;
                        }
                        uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
                        uc[0] = (c[1]<<6) | (c[2]&0x3f);
                        c  += 3;
                        in_left  -= 3;
                        out_left -= 2;
                        uc += 2;
                        continue;
                }

                if ((c[0] & 0xf8) == 0xf0) {
                        /* four byte sequence: 21 bits of payload */
                        unsigned int codepoint;
                        if (in_left < 4 ||
                            (c[1] & 0xc0) != 0x80 || 
                            (c[2] & 0xc0) != 0x80 ||
                            (c[3] & 0xc0) != 0x80) {
                                errno = EILSEQ;
                                goto error;
                        }
                        codepoint = 
                                (c[3]&0x3f) | 
                                ((c[2]&0x3f)<<6) | 
                                ((c[1]&0x3f)<<12) |
                                ((c[0]&0x7)<<18);
                        if (codepoint < 0x10000) {
                                /* accept UTF-8 characters that are not
                                   minimally packed, but pack the result */
                                uc[0] = (codepoint & 0xFF);
                                uc[1] = (codepoint >> 8);
                                c += 4;
                                in_left -= 4;
                                out_left -= 2;
                                uc += 2;
                                continue;
                        }

                        if (codepoint > 0x10ffff) {
                                /* beyond the range a surrogate pair can
                                   express; the high unit would leave the
                                   0xD800-0xDBFF range */
                                errno = EILSEQ;
                                goto error;
                        }

                        codepoint -= 0x10000;

                        if (out_left < 4) {
                                errno = E2BIG;
                                goto error;
                        }

                        /* high surrogate first, then low surrogate,
                           each stored little-endian */
                        uc[0] = (codepoint>>10) & 0xFF;
                        uc[1] = (codepoint>>18) | 0xd8;
                        uc[2] = codepoint & 0xFF;
                        uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
                        c  += 4;
                        in_left  -= 4;
                        out_left -= 4;
                        uc += 4;
                        continue;
                }

                /* we don't handle 5 byte sequences */
                errno = EINVAL;
                goto error;
        }

        if (in_left > 0) {
                errno = E2BIG;
                goto error;
        }

        goto done;

error:
        ret = (size_t)-1;

done:
        /* report progress to the caller on success and failure alike */
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        *inbuf = (const char *)c;
        *outbuf = (char *)uc;
        return ret;
}
696
697
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Little-endian two-byte units are encoded as one, two or three byte UTF-8
  sequences; a high surrogate followed by a low surrogate is encoded as a
  four byte sequence.

  Returns 0 when all input is consumed.  Returns -1 with errno set to
  E2BIG (output space ran out), EINVAL (input ends in the middle of a
  unit or of a surrogate pair) or EILSEQ (badly paired surrogate).  In
  every case the cursors and counters are left just after the last fully
  converted character.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft)
{
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;
        const uint8_t *src = (const uint8_t *)*inbuf;
        uint8_t *dst = (uint8_t *)*outbuf;
        size_t status = 0;

        while (in_left >= 2 && out_left >= 1) {
                const unsigned int unit = src[0] | (src[1] << 8);
                unsigned int codepoint;

                if (unit < 0x80) {
                        /* 7-bit character: one byte */
                        dst[0] = unit;
                        src += 2;
                        dst += 1;
                        in_left  -= 2;
                        out_left -= 1;
                        continue;
                }

                if (unit < 0x800) {
                        /* up to 11 bits: two bytes */
                        if (out_left < 2) {
                                errno = E2BIG;
                                goto error;
                        }
                        dst[0] = 0xc0 | (unit >> 6);
                        dst[1] = 0x80 | (unit & 0x3f);
                        src += 2;
                        dst += 2;
                        in_left  -= 2;
                        out_left -= 2;
                        continue;
                }

                if ((unit & 0xfc00) == 0xdc00) {
                        /* a low surrogate cannot start a character */
                        errno = (in_left < 4) ? EINVAL : EILSEQ;
                        goto error;
                }

                if ((unit & 0xfc00) != 0xd800) {
                        /* any other non-surrogate unit: three bytes */
                        if (out_left < 3) {
                                errno = E2BIG;
                                goto error;
                        }
                        dst[0] = 0xe0 | (unit >> 12);
                        dst[1] = 0x80 | ((unit >> 6) & 0x3f);
                        dst[2] = 0x80 | (unit & 0x3f);
                        src += 2;
                        dst += 3;
                        in_left  -= 2;
                        out_left -= 3;
                        continue;
                }

                /* high surrogate: the low surrogate must follow */
                if (in_left < 4) {
                        errno = EINVAL;
                        goto error;
                }
                if ((src[3] & 0xfc) != 0xdc) {
                        errno = EILSEQ;
                        goto error;
                }

                /* ten bits from each half, offset by 0x10000 */
                codepoint = 0x10000 + (((unit & 0x3ff) << 10) |
                                       (src[2] | ((src[3] & 0x3) << 8)));

                if (out_left < 4) {
                        errno = E2BIG;
                        goto error;
                }
                dst[0] = 0xf0 | (codepoint >> 18);
                dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
                dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
                dst[3] = 0x80 | (codepoint & 0x3f);
                src += 4;
                dst += 4;
                in_left  -= 4;
                out_left -= 4;
        }

        if (in_left == 1) {
                errno = EINVAL;
                goto error;
        }

        if (in_left > 1) {
                errno = E2BIG;
                goto error;
        }

        goto done;

error:
        status = (size_t)-1;

done:
        /* report progress to the caller on success and failure alike */
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        *inbuf  = (const char *)src;
        *outbuf = (char *)dst;
        return status;
}
814
815
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.

  Returns 0 when all input is consumed.  Returns -1 with errno set to
  E2BIG (output space ran out) or EINVAL (a single trailing input byte is
  left over).  In every case the cursors and counters are left just after
  the last fully converted character.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                               char **outbuf, size_t *outbytesleft)
{
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;
        const uint8_t *src = (const uint8_t *)*inbuf;
        uint8_t *dst = (uint8_t *)*outbuf;
        size_t status = 0;

        while (in_left >= 2 && out_left >= 2) {
                unsigned int unit = src[0] | (src[1] << 8);

                if ((unit & 0xfc00) == 0xd800 &&
                    in_left >= 4 &&
                    ((src[2] | (src[3] << 8)) & 0xfc00) == 0xdc00) {
                        /* a well formed surrogate pair is passed
                           through untouched */
                        if (out_left < 4) {
                                errno = E2BIG;
                                goto error;
                        }
                        memcpy(dst, src, 4);
                        src += 4;
                        dst += 4;
                        in_left  -= 4;
                        out_left -= 4;
                        continue;
                }

                if (unit == 0) {
                        /* rule 1 */
                        unit = 1;
                } else if ((unit & 0xf800) == 0xd800) {
                        /* rules 2 and 3: a surrogate of either kind
                           that is not part of a valid pair */
                        unit = 0xfffd;
                }

                dst[0] = unit & 0xFF;
                dst[1] = (unit >> 8) & 0xFF;
                src += 2;
                dst += 2;
                in_left  -= 2;
                out_left -= 2;
        }

        if (in_left == 1) {
                errno = EINVAL;
                goto error;
        }

        if (in_left > 1) {
                errno = E2BIG;
                goto error;
        }

        goto done;

error:
        status = (size_t)-1;

done:
        /* report progress to the caller on success and failure alike */
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        *inbuf  = (const char *)src;
        *outbuf = (char *)dst;
        return status;
}
912
913
914