python:tests: Store keys as bytes rather than as lists of ints
[samba.git] / lib / util / charset / iconv.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "includes.h"
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
25
26 #ifdef strcasecmp
27 #undef strcasecmp
28 #endif
29
30 #ifdef static_decl_charset
31 static_decl_charset;
32 #endif
33
34 /**
35  * @file
36  *
37  * @brief Samba wrapper/stub for iconv character set conversion.
38  *
39  * iconv is the XPG2 interface for converting between character
40  * encodings.  This file provides a Samba wrapper around it, and also
41  * a simple reimplementation that is used if the system does not
42  * implement iconv.
43  *
44  * Samba only works with encodings that are supersets of ASCII: ascii
45  * characters like whitespace can be tested for directly, multibyte
46  * sequences start with a byte with the high bit set, and strings are
47  * terminated by a nul byte.
48  *
49  * Note that the only function provided by iconv is conversion between
50  * characters.  It doesn't directly support operations like
51  * uppercasing or comparison.  We have to convert to UTF-16LE and
52  * compare there.
53  *
54  * @sa Samba Developers Guide
55  **/
56
/*
 * Forward declarations for the builtin converters.  They all share the
 * iconv()-shaped signature used by the function pointers stored in
 * struct charset_functions.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                                char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
68
69 static const struct charset_functions builtin_functions[] = {
70         /* windows is closest to UTF-16 */
71         {"UCS-2LE",  iconv_copy, iconv_copy},
72         {"UTF-16LE",  iconv_copy, iconv_copy},
73         {"UCS-2BE",  iconv_swab, iconv_swab},
74         {"UTF-16BE",  iconv_swab, iconv_swab},
75
76         /* we include the UTF-8 alias to cope with differing locale settings */
77         {"UTF8",   utf8_pull,  utf8_push},
78         {"UTF-8",   utf8_pull,  utf8_push},
79
80         /* this handles the munging needed for String2Key */
81         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy},
82
83         {"ASCII", ascii_pull, ascii_push},
84         {"646", ascii_pull, ascii_push},
85         {"ISO-8859-1", latin1_pull, latin1_push},
86         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
87 };
88
89 static struct charset_functions *charsets = NULL;
90
91 static struct charset_functions *find_charset_functions(const char *name)
92 {
93         struct charset_functions *c;
94
95         /* Check whether we already have this charset... */
96         for (c = charsets; c != NULL; c = c->next) {
97                 if(strcasecmp(c->name, name) == 0) { 
98                         return c;
99                 }
100                 c = c->next;
101         }
102
103         return NULL;
104 }
105
106 bool smb_register_charset(const struct charset_functions *funcs_in)
107 {
108         struct charset_functions *funcs;
109
110         DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name));
111         /* Check whether we already have this charset... */
112         if (find_charset_functions(funcs_in->name)) {
113                 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name));
114                 return false;
115         }
116
117         funcs = talloc(NULL, struct charset_functions);
118         if (!funcs) {
119                 DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name));
120                 return false;
121         }
122         *funcs = *funcs_in;
123
124         funcs->next = funcs->prev = NULL;
125         DEBUG(5, ("Registered charset %s\n", funcs->name));
126         DLIST_ADD(charsets, funcs);
127         return true;
128 }
129
/*
 * Run the static_init_charset hook once, on first use.  When the build
 * does not define static_init_charset this function does nothing.
 */
static void lazy_initialize_iconv(void)
{
#ifdef static_init_charset
        static bool initialized = false;

        if (initialized) {
                return;
        }

        static_init_charset;
        initialized = true;
#endif
}
141
#ifdef HAVE_NATIVE_ICONV
/*
 * Thin wrapper around the system iconv().
 *
 * When the conversion fails the descriptor's internal state is reset, so
 * that no shift state is left behind for character sets like SJIS.  The
 * original iconv() result is returned either way.
 */
static size_t sys_iconv(void *cd,
                        const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft)
{
        size_t converted;

        converted = iconv((iconv_t)cd,
                          discard_const_p(char *, inbuf), inbytesleft,
                          outbuf, outbytesleft);
        if (converted != (size_t)-1) {
                return converted;
        }

        /* failure: reset the conversion state */
        iconv(cd, NULL, NULL, NULL, NULL);
        return converted;
}
#endif
157
158 /**
159  * This is a simple portable iconv() implementaion.
160  *
161  * It only knows about a very small number of character sets - just
162  * enough that Samba works on systems that don't have iconv.
163  **/
164 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd, 
165                  const char **inbuf, size_t *inbytesleft,
166                  char **outbuf, size_t *outbytesleft)
167 {
168         /* in many cases we can go direct */
169         if (cd->direct) {
170                 return cd->direct(cd->cd_direct, 
171                                   inbuf, inbytesleft, outbuf, outbytesleft);
172         }
173
174         /* otherwise we have to do it chunks at a time */
175         {
176 #ifndef SMB_ICONV_BUFSIZE
177 #define SMB_ICONV_BUFSIZE 2048
178 #endif
179                 TALLOC_CTX *mem_ctx;
180                 size_t bufsize;
181                 char *cvtbuf;
182
183 #if _SAMBA_BUILD_ == 3
184                 mem_ctx = talloc_tos();
185 #else
186                 mem_ctx = cd;
187 #endif
188                 cvtbuf = talloc_array(mem_ctx, char, SMB_ICONV_BUFSIZE);
189
190                 if (!cvtbuf) {
191                         return (size_t)-1;
192                 }
193
194                 while (*inbytesleft > 0) {
195                         char *bufp1 = cvtbuf;
196                         const char *bufp2 = cvtbuf;
197                         int saved_errno = errno;
198                         bool pull_failed = false;
199                         bufsize = SMB_ICONV_BUFSIZE;
200
201                         if (cd->pull(cd->cd_pull,
202                                      inbuf, inbytesleft, &bufp1, &bufsize) == -1
203                             && errno != E2BIG) {
204                                 saved_errno = errno;
205                                 pull_failed = true;
206                         }
207
208                         bufsize = SMB_ICONV_BUFSIZE - bufsize;
209
210                         if (cd->push(cd->cd_push,
211                                      &bufp2, &bufsize,
212                                      outbuf, outbytesleft) == -1) {
213                                 talloc_free(cvtbuf);
214                                 return -1;
215                         } else if (pull_failed) {
216                                 /* We want the pull errno if possible */
217                                 errno = saved_errno;
218                                 return -1;
219                         }
220                 }
221                 talloc_free(cvtbuf);
222         }
223
224         return 0;
225 }
226
/*
 * True when name is one of the two little-endian 16-bit charset names
 * ("UCS-2LE" or "UTF-16LE"), compared without regard to case.
 */
static bool is_utf16(const char *name)
{
        if (strcasecmp(name, "UCS-2LE") == 0) {
                return true;
        }
        return strcasecmp(name, "UTF-16LE") == 0;
}
232
233 static int smb_iconv_t_destructor(smb_iconv_t hwd)
234 {
235 #ifdef HAVE_NATIVE_ICONV
236         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
237                 iconv_close(hwd->cd_pull);
238         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
239                 iconv_close(hwd->cd_push);
240         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
241                 iconv_close(hwd->cd_direct);
242 #endif
243
244         return 0;
245 }
246
247 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, 
248                               const char *fromcode, bool native_iconv)
249 {
250         smb_iconv_t ret;
251         const struct charset_functions *from=NULL, *to=NULL;
252         int i;
253
254         lazy_initialize_iconv();
255
256         ret = (smb_iconv_t)talloc_named(mem_ctx,
257                                         sizeof(*ret), 
258                                         "iconv(%s,%s)", tocode, fromcode);
259         if (!ret) {
260                 errno = ENOMEM;
261                 return (smb_iconv_t)-1;
262         }
263         memset(ret, 0, sizeof(*ret));
264         talloc_set_destructor(ret, smb_iconv_t_destructor);
265
266         /* check for the simplest null conversion */
267         if (strcmp(fromcode, tocode) == 0) {
268                 ret->direct = iconv_copy;
269                 return ret;
270         }
271
272         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
273                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
274                         from = &builtin_functions[i];
275                 }
276                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
277                         to = &builtin_functions[i];
278                 }
279         }
280
281         if (from == NULL) {
282                 for (from=charsets; from; from=from->next) {
283                         if (strcasecmp(from->name, fromcode) == 0) break;
284                 }
285         }
286
287         if (to == NULL) {
288                 for (to=charsets; to; to=to->next) {
289                         if (strcasecmp(to->name, tocode) == 0) break;
290                 }
291         }
292
293 #ifdef HAVE_NATIVE_ICONV
294         if ((!from || !to) && !native_iconv) {
295                 goto failed;
296         }
297         if (!from) {
298                 ret->pull = sys_iconv;
299                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
300                 if (ret->cd_pull == (iconv_t)-1)
301                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
302                 if (ret->cd_pull == (iconv_t)-1) goto failed;
303         }
304
305         if (!to) {
306                 ret->push = sys_iconv;
307                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
308                 if (ret->cd_push == (iconv_t)-1)
309                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
310                 if (ret->cd_push == (iconv_t)-1) goto failed;
311         }
312 #else
313         if (!from || !to) {
314                 goto failed;
315         }
316 #endif
317
318         /* check for conversion to/from ucs2 */
319         if (is_utf16(fromcode) && to) {
320                 ret->direct = to->push;
321                 return ret;
322         }
323         if (is_utf16(tocode) && from) {
324                 ret->direct = from->pull;
325                 return ret;
326         }
327
328 #ifdef HAVE_NATIVE_ICONV
329         if (is_utf16(fromcode)) {
330                 ret->direct = sys_iconv;
331                 ret->cd_direct = ret->cd_push;
332                 ret->cd_push = NULL;
333                 return ret;
334         }
335         if (is_utf16(tocode)) {
336                 ret->direct = sys_iconv;
337                 ret->cd_direct = ret->cd_pull;
338                 ret->cd_pull = NULL;
339                 return ret;
340         }
341 #endif
342
343         /* the general case has to go via a buffer */
344         if (!ret->pull) ret->pull = from->pull;
345         if (!ret->push) ret->push = to->push;
346         return ret;
347
348 failed:
349         talloc_free(ret);
350         errno = EINVAL;
351         return (smb_iconv_t)-1;
352 }
353
354 /*
355   simple iconv_open() wrapper
356  */
357 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
358 {
359         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
360 }
361
362 /*
363   simple iconv_close() wrapper
364 */
365 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
366 {
367         talloc_free(cd);
368         return 0;
369 }
370
371
372 /**********************************************************************
373  the following functions implement the builtin character sets in Samba
374  and also the "test" character sets that are designed to test
375  multi-byte character set support for english users
376 ***********************************************************************/
377
/*
  Convert 7-bit ASCII to UTF-16LE.

  Each input byte becomes the low byte of a 16-bit unit whose high byte
  is zero.  A byte with the top bit set is not ASCII and stops the
  conversion with EILSEQ; running out of output space with input left
  gives E2BIG.  The in/out pointers and counters always reflect what was
  converted before returning.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;
        size_t status = (size_t)-1;

        while (in_left >= 1 && out_left >= 2) {
                char ch = src[0];

                if ((ch & 0x7F) != ch) {
                        /* high bit set: not legal ASCII */
                        errno = EILSEQ;
                        goto flush;
                }
                dst[0] = ch;
                dst[1] = 0;
                src += 1;
                dst += 2;
                in_left -= 1;
                out_left -= 2;
        }

        if (in_left > 0) {
                errno = E2BIG;
                goto flush;
        }

        status = 0;

flush:
        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        return status;
}
410
/*
  Convert UTF-16LE to 7-bit ASCII.

  A 16-bit unit is accepted only when its high byte is zero and its low
  byte has the top bit clear; the low byte is then copied out.  Anything
  else stops the conversion with EILSEQ.  A trailing single input byte
  gives EINVAL, and unconverted whole units (output full) give E2BIG.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;
        size_t status = (size_t)-1;

        while (in_left >= 2 && out_left >= 1) {
                char lo = src[0];

                if ((lo & 0x7F) != lo || src[1] != 0) {
                        /* not representable in ASCII */
                        errno = EILSEQ;
                        goto flush;
                }
                dst[0] = lo;
                src += 2;
                dst += 1;
                in_left -= 2;
                out_left -= 1;
        }

        if (in_left == 1) {
                errno = EINVAL;
                goto flush;
        }
        if (in_left > 1) {
                errno = E2BIG;
                goto flush;
        }

        status = 0;

flush:
        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        return status;
}
449
/*
  Convert latin1/ISO-8859-1 to UTF-16LE.

  The 256 latin1 codepoints coincide with the first 256 Unicode
  codepoints, so every input byte becomes the low byte of a 16-bit unit
  with a zero high byte.  Input left over when the output is full gives
  E2BIG.
 */
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                          char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;

        for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
                dst[0] = src[0];
                dst[1] = 0;
                src += 1;
                dst += 2;
        }

        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = in_left;
        *outbytesleft = out_left;

        if (in_left > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
476
/*
  Convert UTF-16LE to latin1/ISO-8859-1.

  The low byte of each 16-bit unit is copied to the output; a unit whose
  high byte is non-zero lies outside latin1 and stops the conversion
  with EILSEQ (the low byte has already been stored at the current
  output position by then, but the output pointer is not advanced).
  A trailing single input byte gives EINVAL; whole units left over when
  the output is full give E2BIG.
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;
        size_t status = (size_t)-1;

        while (in_left >= 2 && out_left >= 1) {
                dst[0] = src[0];
                if (src[1] != 0) {
                        /* codepoint above 0xFF: not legal latin1 */
                        errno = EILSEQ;
                        goto flush;
                }
                src += 2;
                dst += 1;
                in_left -= 2;
                out_left -= 1;
        }

        if (in_left == 1) {
                errno = EINVAL;
                goto flush;
        }
        if (in_left > 1) {
                errno = E2BIG;
                goto flush;
        }

        status = 0;

flush:
        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        return status;
}
514
/*
  Convert the "UCS2-HEX" test encoding to UTF-16LE.

  Any byte other than '@' is copied as the low byte of a 16-bit unit.
  An '@' introduces a hexadecimal value parsed with "%04x" that is
  stored as one little-endian 16-bit unit; five input bytes are consumed
  for it.  Fewer than five bytes left at an '@' gives EINVAL, an
  unparsable value gives EILSEQ, and input left over when the output is
  full gives E2BIG.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                const char *src = *inbuf;
                char *dst = *outbuf;
                size_t consumed;
                unsigned int value;

                if (src[0] == '@') {
                        /* it's a hex character */
                        if (*inbytesleft < 5) {
                                errno = EINVAL;
                                return -1;
                        }
                        if (sscanf(src + 1, "%04x", &value) != 1) {
                                errno = EILSEQ;
                                return -1;
                        }
                        dst[0] = value & 0xff;
                        dst[1] = value >> 8;
                        consumed = 5;
                } else {
                        /* plain byte: widen to 16 bits */
                        dst[0] = src[0];
                        dst[1] = 0;
                        consumed = 1;
                }

                *inbytesleft -= consumed;
                *outbytesleft -= 2;
                *inbuf = src + consumed;
                *outbuf = dst + 2;
        }

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
557
/*
  Convert UTF-16LE to the "UCS2-HEX" test encoding.

  A unit with a zero high byte whose low byte has the top bit clear and
  is not '@' is written as that single byte.  Every other unit is
  written as '@' followed by four hex digits, which needs five bytes of
  output (E2BIG if they are not available).  A trailing single input
  byte gives EINVAL; whole units left over when the output is full give
  E2BIG.
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                const char *src = *inbuf;
                char *dst = *outbuf;
                char hex[6];
                size_t produced;

                if (src[1] != 0 || (src[0] & 0x80) != 0 || src[0] == '@') {
                        /* needs the escaped "@xxxx" form */
                        if (*outbytesleft < 5) {
                                errno = E2BIG;
                                return -1;
                        }
                        snprintf(hex, 6, "@%04x", SVAL(*inbuf, 0));
                        memcpy(dst, hex, 5);
                        produced = 5;
                } else {
                        dst[0] = src[0];
                        produced = 1;
                }

                *inbytesleft -= 2;
                *outbytesleft -= produced;
                *inbuf = src + 2;
                *outbuf = dst + produced;
        }

        if (*inbytesleft == 1) {
                errno = EINVAL;
                return -1;
        }

        if (*inbytesleft > 1) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
598
/*
  Convert between little- and big-endian 16-bit encodings by swapping
  the two bytes of each unit.

  The number of bytes handled is the smaller of the input and output
  sizes.  Whole byte pairs are swapped with swab(); if that count is
  odd, the final output byte is set to zero and the odd input byte is
  still counted as consumed.  Input left over afterwards gives E2BIG.

  The count is kept in a size_t so that lengths above INT_MAX are not
  truncated.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t n = MIN(*inbytesleft, *outbytesleft);

        /* swab() only handles complete pairs */
        swab(*inbuf, *outbuf, (n & ~(size_t)1));
        if (n & 1) {
                (*outbuf)[n-1] = 0;
        }

        (*inbytesleft) -= n;
        (*outbytesleft) -= n;
        (*inbuf) += n;
        (*outbuf) += n;

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
623
624
/*
  Null conversion: copy bytes from input to output unchanged.

  The number of bytes copied is the smaller of the input and output
  sizes; memmove() is used, so the two regions may overlap.  Input left
  over afterwards gives E2BIG.

  The count is kept in a size_t so that lengths above INT_MAX are not
  truncated before being handed to memmove().
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t n = MIN(*inbytesleft, *outbytesleft);

        memmove(*outbuf, *inbuf, n);

        (*inbytesleft) -= n;
        (*outbytesleft) -= n;
        (*inbuf) += n;
        (*outbuf) += n;

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
646
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  One- to four-byte UTF-8 sequences are decoded into UTF-16LE.  Four-byte
  sequences become a surrogate pair, except that an over-long four-byte
  encoding of a value below 0x10000 is accepted and emitted as a single
  unit.

  Errors (return (size_t)-1, errno set, in/out state updated to the
  point reached):
    EILSEQ  truncated sequence, bad continuation byte, or a four-byte
            sequence encoding a value above U+10FFFF (which cannot be
            expressed as a surrogate pair)
    EINVAL  lead byte that starts none of the handled sequence forms
    E2BIG   output space exhausted with input remaining
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t in_left=*inbytesleft, out_left=*outbytesleft;
        const uint8_t *c = (const uint8_t *)*inbuf;
        uint8_t *uc = (uint8_t *)*outbuf;

        while (in_left >= 1 && out_left >= 2) {
                /* one byte: 0xxxxxxx */
                if ((c[0] & 0x80) == 0) {
                        uc[0] = c[0];
                        uc[1] = 0;
                        c  += 1;
                        in_left  -= 1;
                        out_left -= 2;
                        uc += 2;
                        continue;
                }

                /* two bytes: 110xxxxx 10xxxxxx */
                if ((c[0] & 0xe0) == 0xc0) {
                        if (in_left < 2 ||
                            (c[1] & 0xc0) != 0x80) {
                                errno = EILSEQ;
                                goto error;
                        }
                        uc[1] = (c[0]>>2) & 0x7;
                        uc[0] = (c[0]<<6) | (c[1]&0x3f);
                        c  += 2;
                        in_left  -= 2;
                        out_left -= 2;
                        uc += 2;
                        continue;
                }

                /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
                if ((c[0] & 0xf0) == 0xe0) {
                        if (in_left < 3 ||
                            (c[1] & 0xc0) != 0x80 ||
                            (c[2] & 0xc0) != 0x80) {
                                errno = EILSEQ;
                                goto error;
                        }
                        uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
                        uc[0] = (c[1]<<6) | (c[2]&0x3f);
                        c  += 3;
                        in_left  -= 3;
                        out_left -= 2;
                        uc += 2;
                        continue;
                }

                /* four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
                if ((c[0] & 0xf8) == 0xf0) {
                        unsigned int codepoint;
                        if (in_left < 4 ||
                            (c[1] & 0xc0) != 0x80 ||
                            (c[2] & 0xc0) != 0x80 ||
                            (c[3] & 0xc0) != 0x80) {
                                errno = EILSEQ;
                                goto error;
                        }
                        codepoint =
                                (c[3]&0x3f) |
                                ((c[2]&0x3f)<<6) |
                                ((c[1]&0x3f)<<12) |
                                ((c[0]&0x7)<<18);
                        if (codepoint < 0x10000) {
                                /* accept UTF-8 characters that are not
                                   minimally packed, but pack the result */
                                uc[0] = (codepoint & 0xFF);
                                uc[1] = (codepoint >> 8);
                                c += 4;
                                in_left -= 4;
                                out_left -= 2;
                                uc += 2;
                                continue;
                        }

                        if (codepoint > 0x10FFFF) {
                                /*
                                 * Beyond the range a surrogate pair can
                                 * hold: the bits above bit 19 would spill
                                 * into the 0xd8 marker and produce a
                                 * malformed pair.
                                 */
                                errno = EILSEQ;
                                goto error;
                        }

                        codepoint -= 0x10000;

                        if (out_left < 4) {
                                errno = E2BIG;
                                goto error;
                        }

                        /* high surrogate, then low surrogate, both LE */
                        uc[0] = (codepoint>>10) & 0xFF;
                        uc[1] = (codepoint>>18) | 0xd8;
                        uc[2] = codepoint & 0xFF;
                        uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
                        c  += 4;
                        in_left  -= 4;
                        out_left -= 4;
                        uc += 4;
                        continue;
                }

                /* we don't handle 5 byte sequences */
                errno = EINVAL;
                goto error;
        }

        if (in_left > 0) {
                errno = E2BIG;
                goto error;
        }

        *inbytesleft = in_left;
        *outbytesleft = out_left;
        *inbuf = (const char *)c;
        *outbuf = (char *)uc;
        return 0;

error:
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        *inbuf = (const char *)c;
        *outbuf = (char *)uc;
        return -1;
}
766
767
/*
  Convert UTF-16LE to UTF-8.

  Each 16-bit unit is encoded as one, two or three UTF-8 bytes depending
  on its value; a high surrogate followed by a low surrogate is combined
  and encoded as four bytes.

  Errors (return (size_t)-1, errno set, in/out state updated to the
  point reached):
    EILSEQ  low surrogate in leading position (with at least four input
            bytes left), or high surrogate not followed by a low one
    EINVAL  trailing single byte, a high surrogate with fewer than four
            input bytes left, or a leading low surrogate with fewer than
            four input bytes left
    E2BIG   not enough output space for the next character
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft)
{
        const uint8_t *src = (const uint8_t *)*inbuf;
        uint8_t *dst = (uint8_t *)*outbuf;
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;
        size_t status = (size_t)-1;

        while (in_left >= 2 && out_left >= 1) {
                unsigned int unit = src[0] | (src[1] << 8);
                unsigned int codepoint;

                if (unit < 0x80) {
                        /* plain ASCII: one byte */
                        dst[0] = unit;
                        src += 2;
                        dst += 1;
                        in_left -= 2;
                        out_left -= 1;
                        continue;
                }

                if (unit < 0x800) {
                        /* two bytes */
                        if (out_left < 2) {
                                errno = E2BIG;
                                goto flush;
                        }
                        dst[0] = 0xc0 | (unit >> 6);
                        dst[1] = 0x80 | (unit & 0x3f);
                        src += 2;
                        dst += 2;
                        in_left -= 2;
                        out_left -= 2;
                        continue;
                }

                if ((unit & 0xfc00) == 0xdc00) {
                        /* a low surrogate may not come first */
                        errno = (in_left < 4) ? EINVAL : EILSEQ;
                        goto flush;
                }

                if ((unit & 0xfc00) != 0xd800) {
                        /* ordinary BMP character: three bytes */
                        if (out_left < 3) {
                                errno = E2BIG;
                                goto flush;
                        }
                        dst[0] = 0xe0 | (unit >> 12);
                        dst[1] = 0x80 | ((unit >> 6) & 0x3f);
                        dst[2] = 0x80 | (unit & 0x3f);
                        src += 2;
                        dst += 3;
                        in_left -= 2;
                        out_left -= 3;
                        continue;
                }

                /* high surrogate: needs its low partner */
                if (in_left < 4) {
                        errno = EINVAL;
                        goto flush;
                }
                if ((src[3] & 0xfc) != 0xdc) {
                        errno = EILSEQ;
                        goto flush;
                }
                codepoint = 0x10000 + (src[2] | ((src[3] & 0x3)<<8) |
                                       (src[0]<<10) | ((src[1] & 0x3)<<18));

                if (out_left < 4) {
                        errno = E2BIG;
                        goto flush;
                }
                dst[0] = 0xf0 | (codepoint >> 18);
                dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
                dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
                dst[3] = 0x80 | (codepoint & 0x3f);
                src += 4;
                dst += 4;
                in_left -= 4;
                out_left -= 4;
        }

        if (in_left == 1) {
                errno = EINVAL;
                goto flush;
        }
        if (in_left > 1) {
                errno = E2BIG;
                goto flush;
        }

        status = 0;

flush:
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        *inbuf = (const char *)src;
        *outbuf = (char *)dst;
        return status;
}
884
885
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.

  A well-formed surrogate pair is copied through unchanged.  A trailing
  single input byte gives EINVAL; running out of output space with input
  remaining gives E2BIG.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                               char **outbuf, size_t *outbytesleft)
{
        const uint8_t *src = (const uint8_t *)*inbuf;
        uint8_t *dst = (uint8_t *)*outbuf;
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;
        size_t status = (size_t)-1;

        while (in_left >= 2 && out_left >= 2) {
                unsigned int unit = src[0] | (src[1]<<8);

                /* a high surrogate with room for a partner after it */
                if ((unit & 0xfc00) == 0xd800 && in_left >= 4) {
                        unsigned int next = src[2] | (src[3]<<8);

                        if ((next & 0xfc00) == 0xdc00) {
                                /* valid pair: pass both units through */
                                if (out_left < 4) {
                                        errno = E2BIG;
                                        goto flush;
                                }
                                memcpy(dst, src, 4);
                                src += 4;
                                dst += 4;
                                in_left -= 4;
                                out_left -= 4;
                                continue;
                        }
                }

                if (unit == 0) {
                        unit = 1;
                } else if ((unit & 0xf800) == 0xd800) {
                        /* any surrogate reaching this point is unpaired */
                        unit = 0xfffd;
                }

                dst[0] = unit & 0xFF;
                dst[1] = (unit>>8) & 0xFF;
                src += 2;
                dst += 2;
                in_left -= 2;
                out_left -= 2;
        }

        if (in_left == 1) {
                errno = EINVAL;
                goto flush;
        }
        if (in_left > 1) {
                errno = E2BIG;
                goto flush;
        }

        status = 0;

flush:
        *inbytesleft = in_left;
        *outbytesleft = out_left;
        *inbuf = (const char *)src;
        *outbuf = (char *)dst;
        return status;
}
982
983
984