lib/util/charset/iconv.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include "replace.h"
  22 #include "system/iconv.h"
  23 #include "system/filesys.h"
  24 #include "lib/util/byteorder.h"
  25 #include "lib/util/dlinklist.h"
  26 #include "lib/util/charset/charset.h"
  27 #include "lib/util/charset/charset_proto.h"
  28
  29 #ifdef HAVE_ICU_I18N
  30 #include <unicode/ustring.h>
  31 #include <unicode/utrans.h>
  32 #endif
  33
  34 #ifdef strcasecmp
  35 #undef strcasecmp
  36 #endif
  37
  38 /**
  39  * @file
  40  *
  41  * @brief Samba wrapper/stub for iconv character set conversion.
  42  *
  43  * iconv is the XPG2 interface for converting between character
  44  * encodings.  This file provides a Samba wrapper around it, and also
  45  * a simple reimplementation that is used if the system does not
  46  * implement iconv.
  47  *
  48  * Samba only works with encodings that are supersets of ASCII: ascii
  49  * characters like whitespace can be tested for directly, multibyte
  50  * sequences start with a byte with the high bit set, and strings are
  51  * terminated by a nul byte.
  52  *
  53  * Note that the only function provided by iconv is conversion between
  54  * characters.  It doesn't directly support operations like
  55  * uppercasing or comparison.  We have to convert to UTF-16LE and
  56  * compare there.
  57  *
  58  * @sa Samba Developers Guide
  59  **/
  60
/*
 * Forward declarations of the converters defined in this file and
 * referenced by builtin_functions[] below.
 *
 * They all share one signature: an opaque handle followed by
 * iconv()-style arguments (input cursor, input bytes left, output
 * cursor, output bytes left).
 */
static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
static size_t latin1_pull(void *,const char **, size_t *, char **, size_t *);
static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
  72
/*
 * Table of the character sets handled by code built into Samba.
 *
 * smb_iconv_open_ex() matches charset names against .name with
 * strcasecmp(). For each entry .pull converts from that charset into
 * UTF-16LE and .push converts from UTF-16LE into that charset.
 *
 * Entries with .samba_internal_charset set are used even when the
 * caller passes use_builtin_handlers = false; the other entries are
 * only used when the caller asks for the builtin handlers.
 *
 * The WEIRD and MACOSXFS handlers are not among the forward
 * declarations above -- presumably they are declared in
 * charset_proto.h (TODO confirm).
 */
static const struct charset_functions builtin_functions[] = {
        /* windows is closest to UTF-16 */
        {
                .name = "UCS-2LE",
                .pull = iconv_copy,
                .push = iconv_copy
        },
        {
                .name = "UTF-16LE",
                .pull = iconv_copy,
                .push = iconv_copy
        },
        /* big-endian variants only need their bytes swapped pairwise */
        {
                .name = "UCS-2BE",
                .pull = iconv_swab,
                .push = iconv_swab
        },
        {
                .name = "UTF-16BE",
                .pull = iconv_swab,
                .push = iconv_swab
        },

        /* we include the UTF-8 alias to cope with differing locale settings */
        {
                .name = "UTF8",
                .pull = utf8_pull,
                .push = utf8_push
        },
        {
                .name = "UTF-8",
                .pull = utf8_pull,
                .push = utf8_push
        },

        /* this handles the munging needed for String2Key */
        {
                .name = "UTF16_MUNGED",
                .pull = utf16_munged_pull,
                .push = iconv_copy,
                .samba_internal_charset = true
        },

        {
                .name = "ASCII",
                .pull = ascii_pull,
                .push = ascii_push
        },
        /* "646" is registered as a second name for the ASCII handlers */
        {
                .name = "646",
                .pull = ascii_pull,
                .push = ascii_push
        },
        {
                .name = "ISO-8859-1",
                .pull = latin1_pull,
                .push = latin1_push
        },
#ifdef DEVELOPER
        {
                .name = "WEIRD",
                .pull = weird_pull,
                .push = weird_push,
                .samba_internal_charset = true
        },
#endif
#ifdef DARWINOS
        {
                .name = "MACOSXFS",
                .pull = macosxfs_encoding_pull,
                .push = macosxfs_encoding_push,
                .samba_internal_charset = true
        },
#endif
        /* '@' followed by four hex digits per non-ASCII UTF-16 unit */
        {
                .name = "UCS2-HEX",
                .pull = ucs2hex_pull,
                .push = ucs2hex_push,
                .samba_internal_charset = true
        }
};
 154
 155 #ifdef HAVE_NATIVE_ICONV
 156 /* if there was an error then reset the internal state,
 157    this ensures that we don't have a shift state remaining for
 158    character sets like SJIS */
 159 static size_t sys_iconv(void *cd,
 160                         const char **inbuf, size_t *inbytesleft,
 161                         char **outbuf, size_t *outbytesleft)
 162 {
 163         size_t ret = iconv((iconv_t)cd,
 164                            discard_const_p(char *, inbuf), inbytesleft,
 165                            outbuf, outbytesleft);
 166         if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
 167         return ret;
 168 }
 169 #endif
 170
 171 #ifdef HAVE_ICU_I18N
 172 static size_t sys_uconv(void *cd,
 173                         const char **inbuf,
 174                         size_t *inbytesleft,
 175                         char **outbuf,
 176                         size_t *outbytesleft)
 177 {
 178         UTransliterator *t = (UTransliterator *)cd;
 179         size_t bufsize = *inbytesleft * 2;
 180         UChar ustr[bufsize];
 181         UChar *up = NULL;
 182         char *p = NULL;
 183         int32_t ustrlen;
 184         int32_t limit;
 185         int32_t converted_len;
 186         size_t inbuf_consumed;
 187         size_t outbut_consumed;
 188         UErrorCode ue;
 189
 190         /* Convert from UTF8 to UCS2 */
 191         ue = 0;
 192         up = u_strFromUTF8(ustr,           /* dst */
 193                            bufsize,        /* dst buflen */
 194                            &converted_len, /* dst written */
 195                            *inbuf,         /* src */
 196                            *inbytesleft,   /* src length */
 197                            &ue);
 198         if (up == NULL || U_FAILURE(ue)) {
 199                 return -1;
 200         }
 201         if (converted_len > bufsize) {
 202                 /*
 203                  * u_strFromUTF8() returns the required size in
 204                  * converted_len. In theory this should never overflow as the
 205                  * ustr[] array is allocated with a size twice as big as
 206                  * inbytesleft and converted_len should be equal to inbytesleft,
 207                  * but you never know...
 208                  */
 209                 errno = EOVERFLOW;
 210                 return -1;
 211         }
 212         inbuf_consumed = converted_len;
 213
 214         /*
 215          * The following transliteration function takes two parameters, the
 216          * length of the text to be converted (converted_len) and a limit which
 217          * may be smaller then converted_len. We just set limit to converted_len
 218          * and also ignore the value returned in limit.
 219          */
 220         limit = converted_len;
 221
 222         /* Inplace transliteration */
 223         utrans_transUChars(t,
 224                            ustr,           /* text */
 225                            &converted_len, /* text length */
 226                            bufsize,        /* text buflen */
 227                            0,              /* start */
 228                            &limit,         /* limit */
 229                            &ue);
 230         if (U_FAILURE(ue)) {
 231                 return -1;
 232         }
 233         if (converted_len > bufsize) {
 234                 /*
 235                  * In theory this should never happen as the ustr[] array is
 236                  * allocated with a size twice as big as inbytesleft and
 237                  * converted_len should be equal to inbytesleft, but you never
 238                  * know...
 239                  */
 240                 errno = EOVERFLOW;
 241                 return -1;
 242         }
 243         ustrlen = converted_len;
 244
 245         /* Convert from UCS2 back to UTF8 */
 246         ue = 0;
 247         p = u_strToUTF8(*outbuf,        /* dst */
 248                         *outbytesleft,  /* dst buflen */
 249                         &converted_len, /* dst required length */
 250                         ustr,           /* src */
 251                         ustrlen,        /* src length */
 252                         &ue);
 253         if (p == NULL || U_FAILURE(ue)) {
 254                 return -1;
 255         }
 256
 257         outbut_consumed = converted_len;
 258         if (converted_len > *outbytesleft) {
 259                 /*
 260                  * The caller's result buffer is too small...
 261                 */
 262                 outbut_consumed = *outbytesleft;
 263         }
 264
 265         *inbuf += inbuf_consumed;
 266         *inbytesleft -= inbuf_consumed;
 267         *outbuf += outbut_consumed;
 268         *outbytesleft -= outbut_consumed;
 269
 270         return converted_len;
 271 }
 272 #endif
 273
 274 /**
 275  * This is a simple portable iconv() implementation.
 276  *
 277  * It only knows about a very small number of character sets - just
 278  * enough that Samba works on systems that don't have iconv.
 279  **/
 280 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
 281                  const char **inbuf, size_t *inbytesleft,
 282                  char **outbuf, size_t *outbytesleft)
 283 {
 284         /* in many cases we can go direct */
 285         if (cd->direct) {
 286                 return cd->direct(cd->cd_direct,
 287                                   inbuf, inbytesleft, outbuf, outbytesleft);
 288         }
 289
 290         /* otherwise we have to do it chunks at a time */
 291         {
 292 #ifndef SMB_ICONV_BUFSIZE
 293 #define SMB_ICONV_BUFSIZE 2048
 294 #endif
 295                 size_t bufsize;
 296                 char cvtbuf[SMB_ICONV_BUFSIZE];
 297
 298                 while (*inbytesleft > 0) {
 299                         char *bufp1 = cvtbuf;
 300                         const char *bufp2 = cvtbuf;
 301                         int saved_errno = errno;
 302                         bool pull_failed = false;
 303                         bufsize = SMB_ICONV_BUFSIZE;
 304
 305                         if (cd->pull(cd->cd_pull,
 306                                      inbuf, inbytesleft, &bufp1, &bufsize) == -1
 307                             && errno != E2BIG) {
 308                                 saved_errno = errno;
 309                                 pull_failed = true;
 310                         }
 311
 312                         bufsize = SMB_ICONV_BUFSIZE - bufsize;
 313
 314                         if (cd->push(cd->cd_push,
 315                                      &bufp2, &bufsize,
 316                                      outbuf, outbytesleft) == -1) {
 317                                 return -1;
 318                         } else if (pull_failed) {
 319                                 /* We want the pull errno if possible */
 320                                 errno = saved_errno;
 321                                 return -1;
 322                         }
 323                 }
 324         }
 325
 326         return 0;
 327 }
 328
 329 static bool is_utf16(const char *name)
 330 {
 331         return strcasecmp(name, "UCS-2LE") == 0 ||
 332                 strcasecmp(name, "UTF-16LE") == 0;
 333 }
 334
 335 static int smb_iconv_t_destructor(smb_iconv_t hwd)
 336 {
 337 #ifdef HAVE_ICU_I18N
 338         /*
 339          * This has to come first, as the cd_direct member won't be an iconv
 340          * handle and must not be passed to iconv_close().
 341          */
 342         if (hwd->direct == sys_uconv) {
 343                 utrans_close(hwd->cd_direct);
 344                 return 0;
 345         }
 346 #endif
 347 #ifdef HAVE_NATIVE_ICONV
 348         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
 349                 iconv_close(hwd->cd_pull);
 350         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
 351                 iconv_close(hwd->cd_push);
 352         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
 353                 iconv_close(hwd->cd_direct);
 354 #endif
 355
 356         return 0;
 357 }
 358
 359 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
 360                               const char *fromcode, bool use_builtin_handlers)
 361 {
 362         smb_iconv_t ret;
 363         const struct charset_functions *from=NULL, *to=NULL;
 364         int i;
 365
 366         ret = (smb_iconv_t)talloc_named(mem_ctx,
 367                                         sizeof(*ret),
 368                                         "iconv(%s,%s)", tocode, fromcode);
 369         if (!ret) {
 370                 errno = ENOMEM;
 371                 return (smb_iconv_t)-1;
 372         }
 373         memset(ret, 0, sizeof(*ret));
 374         talloc_set_destructor(ret, smb_iconv_t_destructor);
 375
 376         /* check for the simplest null conversion */
 377         if (strcmp(fromcode, tocode) == 0) {
 378                 ret->direct = iconv_copy;
 379                 return ret;
 380         }
 381
 382         /* check if we have a builtin function for this conversion */
 383         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
 384                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
 385                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
 386                                 from = &builtin_functions[i];
 387                         }
 388                 }
 389                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
 390                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
 391                                 to = &builtin_functions[i];
 392                         }
 393                 }
 394         }
 395
 396 #ifdef HAVE_NATIVE_ICONV
 397         /* the from and to variables indicate a samba module or
 398          * internal conversion, ret->pull and ret->push are
 399          * initialised only in this block for iconv based
 400          * conversions */
 401
 402         if (from == NULL) {
 403                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
 404                 if (ret->cd_pull == (iconv_t)-1)
 405                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
 406                 if (ret->cd_pull != (iconv_t)-1) {
 407                         ret->pull = sys_iconv;
 408                 }
 409         }
 410
 411         if (to == NULL) {
 412                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
 413                 if (ret->cd_push == (iconv_t)-1)
 414                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
 415                 if (ret->cd_push != (iconv_t)-1) {
 416                         ret->push = sys_iconv;
 417                 }
 418         }
 419 #endif
 420
 421 #ifdef HAVE_ICU_I18N
 422         if (strcasecmp(fromcode, "UTF8-NFD") == 0 &&
 423             strcasecmp(tocode, "UTF8-NFC") == 0)
 424         {
 425                 U_STRING_DECL(t, "any-nfc", 7);
 426                 UErrorCode ue = 0;
 427
 428                 U_STRING_INIT(t, "any-nfc", 7);
 429
 430                 ret->cd_direct = utrans_openU(t,
 431                                               strlen("any-nfc"),
 432                                               UTRANS_FORWARD,
 433                                               NULL,
 434                                               0,
 435                                               NULL,
 436                                               &ue);
 437                 if (U_FAILURE(ue)) {
 438                         return (smb_iconv_t)-1;
 439                 }
 440                 ret->direct = sys_uconv;
 441                 return ret;
 442         }
 443
 444         if (strcasecmp(fromcode, "UTF8-NFC") == 0 &&
 445             strcasecmp(tocode, "UTF8-NFD") == 0)
 446         {
 447                 U_STRING_DECL(tname, "any-nfd", 7);
 448                 UErrorCode ue = 0;
 449
 450                 U_STRING_INIT(tname, "any-nfd", 7);
 451
 452                 ret->cd_direct = utrans_openU(tname,
 453                                               7,
 454                                               UTRANS_FORWARD,
 455                                               NULL,
 456                                               0,
 457                                               NULL,
 458                                               &ue);
 459                 if (U_FAILURE(ue)) {
 460                         return (smb_iconv_t)-1;
 461                 }
 462                 ret->direct = sys_uconv;
 463                 return ret;
 464         }
 465 #endif
 466
 467         if (ret->pull == NULL && from == NULL) {
 468                 goto failed;
 469         }
 470
 471         if (ret->push == NULL && to == NULL) {
 472                 goto failed;
 473         }
 474
 475         /* check for conversion to/from ucs2 */
 476         if (is_utf16(fromcode) && to) {
 477                 ret->direct = to->push;
 478                 return ret;
 479         }
 480         if (is_utf16(tocode) && from) {
 481                 ret->direct = from->pull;
 482                 return ret;
 483         }
 484
 485 #ifdef HAVE_NATIVE_ICONV
 486         if (is_utf16(fromcode)) {
 487                 ret->direct = sys_iconv;
 488                 ret->cd_direct = ret->cd_push;
 489                 ret->cd_push = NULL;
 490                 return ret;
 491         }
 492         if (is_utf16(tocode)) {
 493                 ret->direct = sys_iconv;
 494                 ret->cd_direct = ret->cd_pull;
 495                 ret->cd_pull = NULL;
 496                 return ret;
 497         }
 498 #endif
 499
 500         /* the general case has to go via a buffer */
 501         if (!ret->pull) ret->pull = from->pull;
 502         if (!ret->push) ret->push = to->push;
 503         return ret;
 504
 505 failed:
 506         talloc_free(ret);
 507         errno = EINVAL;
 508         return (smb_iconv_t)-1;
 509 }
 510
/*
  simple iconv_open() wrapper

  Calls smb_iconv_open_ex() with a NULL talloc context and with the
  builtin handlers enabled, and returns its result unchanged.
 */
_PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
{
        return smb_iconv_open_ex(NULL, tocode, fromcode, true);
}
 518
/*
  simple iconv_close() wrapper

  Frees the descriptor with talloc_free(). The result of talloc_free()
  is ignored and 0 is always returned.
*/
_PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
{
        talloc_free(cd);
        return 0;
}
 527
 528
 529 /**********************************************************************
 530  the following functions implement the builtin character sets in Samba
 531  and also the "test" character sets that are designed to test
 532  multi-byte character set support for english users
 533 ***********************************************************************/
 534
 535 /*
 536   this takes an ASCII sequence and produces a UTF16 sequence
 537
 538   The first 127 codepoints of latin1 matches the first 127 codepoints
 539   of unicode, and so can be put into the first byte of UTF16LE
 540
 541  */
 542
 543 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 544                          char **outbuf, size_t *outbytesleft)
 545 {
 546         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 547                 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0]) {
 548                         /* If this is multi-byte, then it isn't legal ASCII */
 549                         errno = EILSEQ;
 550                         return -1;
 551                 }
 552                 (*outbuf)[0] = (*inbuf)[0];
 553                 (*outbuf)[1] = 0;
 554                 (*inbytesleft)  -= 1;
 555                 (*outbytesleft) -= 2;
 556                 (*inbuf)  += 1;
 557                 (*outbuf) += 2;
 558         }
 559
 560         if (*inbytesleft > 0) {
 561                 errno = E2BIG;
 562                 return -1;
 563         }
 564
 565         return 0;
 566 }
 567
 568 /*
 569   this takes a UTF16 sequence and produces an ASCII sequence
 570
 571   The first 127 codepoints of ASCII matches the first 127 codepoints
 572   of unicode, and so can be read directly from the first byte of UTF16LE
 573
 574  */
 575 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
 576                          char **outbuf, size_t *outbytesleft)
 577 {
 578         int ir_count=0;
 579
 580         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 581                 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0] ||
 582                         (*inbuf)[1] != 0) {
 583                         /* If this is multi-byte, then it isn't legal ASCII */
 584                         errno = EILSEQ;
 585                         return -1;
 586                 }
 587                 (*outbuf)[0] = (*inbuf)[0];
 588                 (*inbytesleft)  -= 2;
 589                 (*outbytesleft) -= 1;
 590                 (*inbuf)  += 2;
 591                 (*outbuf) += 1;
 592         }
 593
 594         if (*inbytesleft == 1) {
 595                 errno = EINVAL;
 596                 return -1;
 597         }
 598
 599         if (*inbytesleft > 1) {
 600                 errno = E2BIG;
 601                 return -1;
 602         }
 603
 604         return ir_count;
 605 }
 606
 607 /*
 608   this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence
 609
 610   The first 256 codepoints of latin1 matches the first 256 codepoints
 611   of unicode, and so can be put into the first byte of UTF16LE
 612
 613  */
 614 static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 615                           char **outbuf, size_t *outbytesleft)
 616 {
 617         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 618                 (*outbuf)[0] = (*inbuf)[0];
 619                 (*outbuf)[1] = 0;
 620                 (*inbytesleft)  -= 1;
 621                 (*outbytesleft) -= 2;
 622                 (*inbuf)  += 1;
 623                 (*outbuf) += 2;
 624         }
 625
 626         if (*inbytesleft > 0) {
 627                 errno = E2BIG;
 628                 return -1;
 629         }
 630
 631         return 0;
 632 }
 633
 634 /*
 635   this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence
 636
 637   The first 256 codepoints of latin1 matches the first 256 codepoints
 638   of unicode, and so can be read directly from the first byte of UTF16LE
 639
 640  */
 641 static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
 642                          char **outbuf, size_t *outbytesleft)
 643 {
 644         int ir_count=0;
 645
 646         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 647                 (*outbuf)[0] = (*inbuf)[0];
 648                 if ((*inbuf)[1] != 0) {
 649                         /* If this is multi-byte, then it isn't legal latin1 */
 650                         errno = EILSEQ;
 651                         return -1;
 652                 }
 653                 (*inbytesleft)  -= 2;
 654                 (*outbytesleft) -= 1;
 655                 (*inbuf)  += 2;
 656                 (*outbuf) += 1;
 657         }
 658
 659         if (*inbytesleft == 1) {
 660                 errno = EINVAL;
 661                 return -1;
 662         }
 663
 664         if (*inbytesleft > 1) {
 665                 errno = E2BIG;
 666                 return -1;
 667         }
 668
 669         return ir_count;
 670 }
 671
 672 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 673                          char **outbuf, size_t *outbytesleft)
 674 {
 675         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 676                 uint8_t hi = 0, lo = 0;
 677                 bool ok;
 678
 679                 if ((*inbuf)[0] != '@') {
 680                         /* seven bit ascii case */
 681                         (*outbuf)[0] = (*inbuf)[0];
 682                         (*outbuf)[1] = 0;
 683                         (*inbytesleft)  -= 1;
 684                         (*outbytesleft) -= 2;
 685                         (*inbuf)  += 1;
 686                         (*outbuf) += 2;
 687                         continue;
 688                 }
 689                 /* it's a hex character */
 690                 if (*inbytesleft < 5) {
 691                         errno = EINVAL;
 692                         return -1;
 693                 }
 694
 695                 ok = hex_byte(&(*inbuf)[1], &hi) && hex_byte(&(*inbuf)[3], &lo);
 696                 if (!ok) {
 697                         errno = EILSEQ;
 698                         return -1;
 699                 }
 700
 701                 (*outbuf)[0] = lo;
 702                 (*outbuf)[1] = hi;
 703                 (*inbytesleft)  -= 5;
 704                 (*outbytesleft) -= 2;
 705                 (*inbuf)  += 5;
 706                 (*outbuf) += 2;
 707         }
 708
 709         if (*inbytesleft > 0) {
 710                 errno = E2BIG;
 711                 return -1;
 712         }
 713
 714         return 0;
 715 }
 716
 717 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
 718                            char **outbuf, size_t *outbytesleft)
 719 {
 720         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 721                 char buf[6];
 722
 723                 if ((*inbuf)[1] == 0 &&
 724                     ((*inbuf)[0] & 0x80) == 0 &&
 725                     (*inbuf)[0] != '@') {
 726                         (*outbuf)[0] = (*inbuf)[0];
 727                         (*inbytesleft)  -= 2;
 728                         (*outbytesleft) -= 1;
 729                         (*inbuf)  += 2;
 730                         (*outbuf) += 1;
 731                         continue;
 732                 }
 733                 if (*outbytesleft < 5) {
 734                         errno = E2BIG;
 735                         return -1;
 736                 }
 737                 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
 738                 memcpy(*outbuf, buf, 5);
 739                 (*inbytesleft)  -= 2;
 740                 (*outbytesleft) -= 5;
 741                 (*inbuf)  += 2;
 742                 (*outbuf) += 5;
 743         }
 744
 745         if (*inbytesleft == 1) {
 746                 errno = EINVAL;
 747                 return -1;
 748         }
 749
 750         if (*inbytesleft > 1) {
 751                 errno = E2BIG;
 752                 return -1;
 753         }
 754
 755         return 0;
 756 }
 757
 758 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
 759                          char **outbuf, size_t *outbytesleft)
 760 {
 761         int n;
 762
 763         n = MIN(*inbytesleft, *outbytesleft);
 764
 765         swab(*inbuf, *outbuf, (n&~1));
 766         if (n&1) {
 767                 (*outbuf)[n-1] = 0;
 768         }
 769
 770         (*inbytesleft) -= n;
 771         (*outbytesleft) -= n;
 772         (*inbuf) += n;
 773         (*outbuf) += n;
 774
 775         if (*inbytesleft > 0) {
 776                 errno = E2BIG;
 777                 return -1;
 778         }
 779
 780         return 0;
 781 }
 782
 783
 784 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
 785                          char **outbuf, size_t *outbytesleft)
 786 {
 787         int n;
 788
 789         n = MIN(*inbytesleft, *outbytesleft);
 790
 791         memmove(*outbuf, *inbuf, n);
 792
 793         (*inbytesleft) -= n;
 794         (*outbytesleft) -= n;
 795         (*inbuf) += n;
 796         (*outbuf) += n;
 797
 798         if (*inbytesleft > 0) {
 799                 errno = E2BIG;
 800                 return -1;
 801         }
 802
 803         return 0;
 804 }
 805
 806 /*
 807   this takes a UTF8 sequence and produces a UTF16 sequence
 808  */
 809 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 810                          char **outbuf, size_t *outbytesleft)
 811 {
 812         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 813         const uint8_t *c = (const uint8_t *)*inbuf;
 814         uint8_t *uc = (uint8_t *)*outbuf;
 815
 816         while (in_left >= 1 && out_left >= 2) {
 817                 if ((c[0] & 0x80) == 0) {
 818                         uc[0] = c[0];
 819                         uc[1] = 0;
 820                         c  += 1;
 821                         in_left  -= 1;
 822                         out_left -= 2;
 823                         uc += 2;
 824                         continue;
 825                 }
 826
 827                 if ((c[0] & 0xe0) == 0xc0) {
 828                         if (in_left < 2 ||
 829                             (c[1] & 0xc0) != 0x80) {
 830                                 errno = EILSEQ;
 831                                 goto error;
 832                         }
 833                         uc[1] = (c[0]>>2) & 0x7;
 834                         uc[0] = (c[0]<<6) | (c[1]&0x3f);
 835                         if (uc[1] == 0 && uc[0] < 0x80) {
 836                                 /* this should have been a single byte */
 837                                 errno = EILSEQ;
 838                                 goto error;
 839                         }
 840                         c  += 2;
 841                         in_left  -= 2;
 842                         out_left -= 2;
 843                         uc += 2;
 844                         continue;
 845                 }
 846
 847                 if ((c[0] & 0xf0) == 0xe0) {
 848                         unsigned int codepoint;
 849                         if (in_left < 3 ||
 850                             (c[1] & 0xc0) != 0x80 ||
 851                             (c[2] & 0xc0) != 0x80) {
 852                                 errno = EILSEQ;
 853                                 goto error;
 854                         }
 855                         codepoint = ((c[2] & 0x3f)        |
 856                                      ((c[1] & 0x3f) << 6) |
 857                                      ((c[0] & 0x0f) << 12));
 858
 859                         if (codepoint < 0x800) {
 860                                 /* this should be a 1 or 2 byte sequence */
 861                                 errno = EILSEQ;
 862                                 goto error;
 863                         }
 864                         if (codepoint >= 0xd800 && codepoint <= 0xdfff) {
 865                                 /*
 866                                  * This is an invalid codepoint, per
 867                                  * RFC3629, as it encodes part of a
 868                                  * UTF-16 surrogate pair for a
 869                                  * character over U+10000, which ought
 870                                  * to have been encoded as a four byte
 871                                  * utf-8 sequence.
 872                                  *
 873                                  * Prior to Vista, Windows might
 874                                  * sometimes produce invalid strings
 875                                  * where a utf-16 sequence containing
 876                                  * surrogate pairs was converted
 877                                  * "verbatim" into utf-8, instead of
 878                                  * encoding the actual codepoint. This
 879                                  * format is sometimes called "WTF-8".
 880                                  *
 881                                  * If we were to support that, we'd
 882                                  * have a branch here for the case
 883                                  * where the codepoint is between
 884                                  * 0xd800 and 0xdbff (a "high
 885                                  * surrogate"), and read a *six*
 886                                  * character sequence from there which
 887                                  * would include a low surrogate. But
 888                                  * that would undermine the
 889                                  * hard-learnt principle that each
 890                                  * character should only have one
 891                                  * encoding.
 892                                  */
 893                                 errno = EILSEQ;
 894                                 goto error;
 895                         }
 896
 897                         uc[0] = codepoint & 0xff;
 898                         uc[1] = codepoint >> 8;
 899                         c  += 3;
 900                         in_left  -= 3;
 901                         out_left -= 2;
 902                         uc += 2;
 903                         continue;
 904                 }
 905
 906                 if ((c[0] & 0xf8) == 0xf0) {
 907                         unsigned int codepoint;
 908                         if (in_left < 4 ||
 909                             (c[1] & 0xc0) != 0x80 ||
 910                             (c[2] & 0xc0) != 0x80 ||
 911                             (c[3] & 0xc0) != 0x80) {
 912                                 errno = EILSEQ;
 913                                 goto error;
 914                         }
 915                         codepoint =
 916                                 (c[3]&0x3f) |
 917                                 ((c[2]&0x3f)<<6) |
 918                                 ((c[1]&0x3f)<<12) |
 919                                 ((c[0]&0x7)<<18);
 920                         if (codepoint < 0x10000) {
 921                                 /* reject UTF-8 characters that are not
 922                                    minimally packed */
 923                                 errno = EILSEQ;
 924                                 goto error;
 925                         }
 926                         if (codepoint > 0x10ffff) {
 927                                 /*
 928                                  * Unicode stops at 0x10ffff, and if
 929                                  * we ignore that, we'll end up
 930                                  * encoding the wrong characters in
 931                                  * the surrogate pair.
 932                                  */
 933                                 errno = EILSEQ;
 934                                 goto error;
 935                         }
 936
 937                         codepoint -= 0x10000;
 938
 939                         if (out_left < 4) {
 940                                 errno = E2BIG;
 941                                 goto error;
 942                         }
 943
 944                         uc[0] = (codepoint>>10) & 0xFF;
 945                         uc[1] = (codepoint>>18) | 0xd8;
 946                         uc[2] = codepoint & 0xFF;
 947                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
 948                         c  += 4;
 949                         in_left  -= 4;
 950                         out_left -= 4;
 951                         uc += 4;
 952                         continue;
 953                 }
 954
 955                 /* we don't handle 5 byte sequences */
 956                 errno = EINVAL;
 957                 goto error;
 958         }
 959
 960         if (in_left > 0) {
 961                 errno = E2BIG;
 962                 goto error;
 963         }
 964
 965         *inbytesleft = in_left;
 966         *outbytesleft = out_left;
 967         *inbuf = (const char *)c;
 968         *outbuf = (char *)uc;
 969         return 0;
 970
 971 error:
 972         *inbytesleft = in_left;
 973         *outbytesleft = out_left;
 974         *inbuf = (const char *)c;
 975         *outbuf = (char *)uc;
 976         return -1;
 977 }
 978
 979
 980 /*
 981   this takes a UTF16 sequence and produces a UTF8 sequence
 982  */
 983 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
 984                         char **outbuf, size_t *outbytesleft)
 985 {
 986         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 987         uint8_t *c = (uint8_t *)*outbuf;
 988         const uint8_t *uc = (const uint8_t *)*inbuf;
 989
 990         while (in_left >= 2 && out_left >= 1) {
 991                 unsigned int codepoint;
 992
 993                 if (uc[1] == 0 && !(uc[0] & 0x80)) {
 994                         /* simplest case */
 995                         c[0] = uc[0];
 996                         in_left  -= 2;
 997                         out_left -= 1;
 998                         uc += 2;
 999                         c  += 1;
1000                         continue;
1001                 }
1002
1003                 if ((uc[1]&0xf8) == 0) {
1004                         /* next simplest case */
1005                         if (out_left < 2) {
1006                                 errno = E2BIG;
1007                                 goto error;
1008                         }
1009                         c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
1010                         c[1] = 0x80 | (uc[0] & 0x3f);
1011                         in_left  -= 2;
1012                         out_left -= 2;
1013                         uc += 2;
1014                         c  += 2;
1015                         continue;
1016                 }
1017
1018                 if ((uc[1] & 0xfc) == 0xdc) {
1019                         errno = EILSEQ;
1020 #ifndef HAVE_ICONV_ERRNO_ILLEGAL_MULTIBYTE
1021                         if (in_left < 4) {
1022                                 errno = EINVAL;
1023                         }
1024 #endif
1025                         goto error;
1026                 }
1027
1028                 if ((uc[1] & 0xfc) != 0xd8) {
1029                         codepoint = uc[0] | (uc[1]<<8);
1030                         if (out_left < 3) {
1031                                 errno = E2BIG;
1032                                 goto error;
1033                         }
1034                         c[0] = 0xe0 | (codepoint >> 12);
1035                         c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
1036                         c[2] = 0x80 | (codepoint & 0x3f);
1037
1038                         in_left  -= 2;
1039                         out_left -= 3;
1040                         uc  += 2;
1041                         c   += 3;
1042                         continue;
1043                 }
1044
1045                 /* its the first part of a 4 byte sequence */
1046                 if (in_left < 4) {
1047                         errno = EINVAL;
1048                         goto error;
1049                 }
1050                 if ((uc[3] & 0xfc) != 0xdc) {
1051                         errno = EILSEQ;
1052                         goto error;
1053                 }
1054                 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
1055                                        (uc[0]<<10) | ((uc[1] & 0x3)<<18));
1056
1057                 if (out_left < 4) {
1058                         errno = E2BIG;
1059                         goto error;
1060                 }
1061                 c[0] = 0xf0 | (codepoint >> 18);
1062                 c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
1063                 c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
1064                 c[3] = 0x80 | (codepoint & 0x3f);
1065
1066                 in_left  -= 4;
1067                 out_left -= 4;
1068                 uc       += 4;
1069                 c        += 4;
1070         }
1071
1072         if (in_left == 1) {
1073                 errno = EINVAL;
1074                 goto error;
1075         }
1076
1077         if (in_left > 1) {
1078                 errno = E2BIG;
1079                 goto error;
1080         }
1081
1082         *inbytesleft = in_left;
1083         *outbytesleft = out_left;
1084         *inbuf  = (const char *)uc;
1085         *outbuf = (char *)c;
1086
1087         return 0;
1088
1089 error:
1090         *inbytesleft = in_left;
1091         *outbytesleft = out_left;
1092         *inbuf  = (const char *)uc;
1093         *outbuf = (char *)c;
1094         return -1;
1095 }
1096
1097
/*
  Copy a "munged" little-endian UTF-16 byte stream to the output,
  rewriting it according to the string2key rules so that the result is
  a UTF-16 sequence.

  The rules are:

    1) any 0x0000 code unit is mapped to 0x0001

    2) any high surrogate (0xD800 - 0xDBFF) that is not immediately
       followed by a low surrogate (0xDC00 - 0xDFFF) is replaced by
       U+FFFD (REPLACEMENT CHARACTER)

    3) any low surrogate that was not preceded by a high surrogate is
       likewise replaced by U+FFFD

  A well formed surrogate pair is copied through unchanged.

  The buffer pointers and remaining-byte counters are always written
  back. Returns 0 when all input was consumed, otherwise (size_t)-1
  with errno set to E2BIG (output buffer too small) or EINVAL (a
  single trailing input byte).
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                               char **outbuf, size_t *outbytesleft)
{
        const uint8_t *src = (const uint8_t *)*inbuf;
        uint8_t *dst = (uint8_t *)*outbuf;
        size_t src_len = *inbytesleft;
        size_t dst_len = *outbytesleft;
        size_t status = (size_t)-1;

        while (src_len >= 2 && dst_len >= 2) {
                unsigned int unit = src[0] | (src[1] << 8);

                if (unit == 0) {
                        /* rule 1 */
                        unit = 1;
                } else if ((unit & 0xfc00) == 0xdc00) {
                        /* rule 3: stray low surrogate */
                        unit = 0xfffd;
                } else if ((unit & 0xfc00) == 0xd800) {
                        /* high surrogate: look for its partner */
                        int have_low = 0;

                        if (src_len >= 4) {
                                unsigned int next = src[2] | (src[3] << 8);
                                have_low = ((next & 0xfc00) == 0xdc00);
                        }

                        if (have_low) {
                                /* valid pair, passed through verbatim */
                                if (dst_len < 4) {
                                        errno = E2BIG;
                                        goto finish;
                                }
                                memcpy(dst, src, 4);
                                src     += 4;
                                src_len -= 4;
                                dst     += 4;
                                dst_len -= 4;
                                continue;
                        }

                        /*
                         * rule 2: only the high surrogate is replaced;
                         * whatever follows is examined on its own in
                         * the next iteration.
                         */
                        unit = 0xfffd;
                }

                /* emit one (possibly rewritten) 16 bit code unit */
                dst[0] = unit & 0xFF;
                dst[1] = (unit >> 8) & 0xFF;

                src     += 2;
                src_len -= 2;
                dst     += 2;
                dst_len -= 2;
        }

        if (src_len != 0) {
                /*
                 * A single leftover byte is half a code unit; anything
                 * more means the output buffer filled up first.
                 */
                errno = (src_len == 1) ? EINVAL : E2BIG;
                goto finish;
        }

        status = 0;

finish:
        *inbuf  = (const char *)src;
        *inbytesleft = src_len;
        *outbuf = (char *)dst;
        *outbytesleft = dst_len;
        return status;
}
1194
1195
1196