lib/util/charset/iconv.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include "includes.h"
  22 #include "../lib/util/dlinklist.h"
  23 #include "system/iconv.h"
  24 #include "system/filesys.h"
  25
  26 #ifdef strcasecmp
  27 #undef strcasecmp
  28 #endif
  29
  30 #ifdef static_decl_charset
  31 static_decl_charset;
  32 #endif
  33
  34 /**
  35  * @file
  36  *
  37  * @brief Samba wrapper/stub for iconv character set conversion.
  38  *
  39  * iconv is the XPG2 interface for converting between character
  40  * encodings.  This file provides a Samba wrapper around it, and also
  41  * a simple reimplementation that is used if the system does not
  42  * implement iconv.
  43  *
  44  * Samba only works with encodings that are supersets of ASCII: ascii
  45  * characters like whitespace can be tested for directly, multibyte
  46  * sequences start with a byte with the high bit set, and strings are
  47  * terminated by a nul byte.
  48  *
  49  * Note that the only function provided by iconv is conversion between
  50  * characters.  It doesn't directly support operations like
  51  * uppercasing or comparison.  We have to convert to UTF-16LE and
  52  * compare there.
  53  *
  54  * @sa Samba Developers Guide
  55  **/
  56
  57 static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
  58 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
  59 static size_t latin1_pull(void *,const char **, size_t *, char **, size_t *);
  60 static size_t latin1_push(void *,const char **, size_t *, char **, size_t *);
  61 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
  62 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
  63 static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
  64 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
  65 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
  66 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
  67 static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
  68
  69 static const struct charset_functions builtin_functions[] = {
  70         /* windows is closest to UTF-16 */
  71         {"UCS-2LE",  iconv_copy, iconv_copy},
  72         {"UTF-16LE",  iconv_copy, iconv_copy},
  73         {"UCS-2BE",  iconv_swab, iconv_swab},
  74         {"UTF-16BE",  iconv_swab, iconv_swab},
  75
  76         /* we include the UTF-8 alias to cope with differing locale settings */
  77         {"UTF8",   utf8_pull,  utf8_push},
  78         {"UTF-8",   utf8_pull,  utf8_push},
  79
  80         /* this handles the munging needed for String2Key */
  81         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy},
  82
  83         {"ASCII", ascii_pull, ascii_push},
  84         {"646", ascii_pull, ascii_push},
  85         {"ISO-8859-1", latin1_pull, latin1_push},
  86         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
  87 };
  88
  89 static struct charset_functions *charsets = NULL;
  90
  91 static struct charset_functions *find_charset_functions(const char *name)
  92 {
  93         struct charset_functions *c;
  94
  95         /* Check whether we already have this charset... */
  96         for (c = charsets; c != NULL; c = c->next) {
  97                 if(strcasecmp(c->name, name) == 0) {
  98                         return c;
  99                 }
 100                 c = c->next;
 101         }
 102
 103         return NULL;
 104 }
 105
 106 bool smb_register_charset(const struct charset_functions *funcs_in)
 107 {
 108         struct charset_functions *funcs;
 109
 110         DEBUG(5, ("Attempting to register new charset %s\n", funcs_in->name));
 111         /* Check whether we already have this charset... */
 112         if (find_charset_functions(funcs_in->name)) {
 113                 DEBUG(0, ("Duplicate charset %s, not registering\n", funcs_in->name));
 114                 return false;
 115         }
 116
 117         funcs = talloc(NULL, struct charset_functions);
 118         if (!funcs) {
 119                 DEBUG(0, ("Out of memory duplicating charset %s\n", funcs_in->name));
 120                 return false;
 121         }
 122         *funcs = *funcs_in;
 123
 124         funcs->next = funcs->prev = NULL;
 125         DEBUG(5, ("Registered charset %s\n", funcs->name));
 126         DLIST_ADD(charsets, funcs);
 127         return true;
 128 }
 129
 130 static void lazy_initialize_iconv(void)
 131 {
 132 #ifdef static_init_charset
 133         static bool initialized = false;
 134
 135         if (!initialized) {
 136                 static_init_charset;
 137                 initialized = true;
 138         }
 139 #endif
 140 }
 141
 142 #ifdef HAVE_NATIVE_ICONV
 143 /* if there was an error then reset the internal state,
 144    this ensures that we don't have a shift state remaining for
 145    character sets like SJIS */
 146 static size_t sys_iconv(void *cd,
 147                         const char **inbuf, size_t *inbytesleft,
 148                         char **outbuf, size_t *outbytesleft)
 149 {
 150         size_t ret = iconv((iconv_t)cd,
 151                            discard_const_p(char *, inbuf), inbytesleft,
 152                            outbuf, outbytesleft);
 153         if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
 154         return ret;
 155 }
 156 #endif
 157
 158 /**
 159  * This is a simple portable iconv() implementaion.
 160  *
 161  * It only knows about a very small number of character sets - just
 162  * enough that Samba works on systems that don't have iconv.
 163  **/
 164 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd,
 165                  const char **inbuf, size_t *inbytesleft,
 166                  char **outbuf, size_t *outbytesleft)
 167 {
 168         /* in many cases we can go direct */
 169         if (cd->direct) {
 170                 return cd->direct(cd->cd_direct,
 171                                   inbuf, inbytesleft, outbuf, outbytesleft);
 172         }
 173
 174         /* otherwise we have to do it chunks at a time */
 175         {
 176 #ifndef SMB_ICONV_BUFSIZE
 177 #define SMB_ICONV_BUFSIZE 2048
 178 #endif
 179                 TALLOC_CTX *mem_ctx;
 180                 size_t bufsize;
 181                 char *cvtbuf;
 182
 183 #if _SAMBA_BUILD_ == 3
 184                 mem_ctx = talloc_tos();
 185 #else
 186                 mem_ctx = cd;
 187 #endif
 188                 cvtbuf = talloc_array(mem_ctx, char, SMB_ICONV_BUFSIZE);
 189
 190                 if (!cvtbuf) {
 191                         return (size_t)-1;
 192                 }
 193
 194                 while (*inbytesleft > 0) {
 195                         char *bufp1 = cvtbuf;
 196                         const char *bufp2 = cvtbuf;
 197                         int saved_errno = errno;
 198                         bool pull_failed = false;
 199                         bufsize = SMB_ICONV_BUFSIZE;
 200
 201                         if (cd->pull(cd->cd_pull,
 202                                      inbuf, inbytesleft, &bufp1, &bufsize) == -1
 203                             && errno != E2BIG) {
 204                                 saved_errno = errno;
 205                                 pull_failed = true;
 206                         }
 207
 208                         bufsize = SMB_ICONV_BUFSIZE - bufsize;
 209
 210                         if (cd->push(cd->cd_push,
 211                                      &bufp2, &bufsize,
 212                                      outbuf, outbytesleft) == -1) {
 213                                 talloc_free(cvtbuf);
 214                                 return -1;
 215                         } else if (pull_failed) {
 216                                 /* We want the pull errno if possible */
 217                                 errno = saved_errno;
 218                                 return -1;
 219                         }
 220                 }
 221                 talloc_free(cvtbuf);
 222         }
 223
 224         return 0;
 225 }
 226
 227 static bool is_utf16(const char *name)
 228 {
 229         return strcasecmp(name, "UCS-2LE") == 0 ||
 230                 strcasecmp(name, "UTF-16LE") == 0;
 231 }
 232
 233 static int smb_iconv_t_destructor(smb_iconv_t hwd)
 234 {
 235 #ifdef HAVE_NATIVE_ICONV
 236         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
 237                 iconv_close(hwd->cd_pull);
 238         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
 239                 iconv_close(hwd->cd_push);
 240         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
 241                 iconv_close(hwd->cd_direct);
 242 #endif
 243
 244         return 0;
 245 }
 246
 247 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode,
 248                               const char *fromcode, bool native_iconv)
 249 {
 250         smb_iconv_t ret;
 251         const struct charset_functions *from=NULL, *to=NULL;
 252         int i;
 253
 254         lazy_initialize_iconv();
 255
 256         ret = (smb_iconv_t)talloc_named(mem_ctx,
 257                                         sizeof(*ret),
 258                                         "iconv(%s,%s)", tocode, fromcode);
 259         if (!ret) {
 260                 errno = ENOMEM;
 261                 return (smb_iconv_t)-1;
 262         }
 263         memset(ret, 0, sizeof(*ret));
 264         talloc_set_destructor(ret, smb_iconv_t_destructor);
 265
 266         /* check for the simplest null conversion */
 267         if (strcmp(fromcode, tocode) == 0) {
 268                 ret->direct = iconv_copy;
 269                 return ret;
 270         }
 271
 272         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
 273                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
 274                         from = &builtin_functions[i];
 275                 }
 276                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
 277                         to = &builtin_functions[i];
 278                 }
 279         }
 280
 281         if (from == NULL) {
 282                 for (from=charsets; from; from=from->next) {
 283                         if (strcasecmp(from->name, fromcode) == 0) break;
 284                 }
 285         }
 286
 287         if (to == NULL) {
 288                 for (to=charsets; to; to=to->next) {
 289                         if (strcasecmp(to->name, tocode) == 0) break;
 290                 }
 291         }
 292
 293 #ifdef HAVE_NATIVE_ICONV
 294         if ((!from || !to) && !native_iconv) {
 295                 goto failed;
 296         }
 297         if (!from) {
 298                 ret->pull = sys_iconv;
 299                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
 300                 if (ret->cd_pull == (iconv_t)-1)
 301                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
 302                 if (ret->cd_pull == (iconv_t)-1) goto failed;
 303         }
 304
 305         if (!to) {
 306                 ret->push = sys_iconv;
 307                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
 308                 if (ret->cd_push == (iconv_t)-1)
 309                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
 310                 if (ret->cd_push == (iconv_t)-1) goto failed;
 311         }
 312 #else
 313         if (!from || !to) {
 314                 goto failed;
 315         }
 316 #endif
 317
 318         /* check for conversion to/from ucs2 */
 319         if (is_utf16(fromcode) && to) {
 320                 ret->direct = to->push;
 321                 return ret;
 322         }
 323         if (is_utf16(tocode) && from) {
 324                 ret->direct = from->pull;
 325                 return ret;
 326         }
 327
 328 #ifdef HAVE_NATIVE_ICONV
 329         if (is_utf16(fromcode)) {
 330                 ret->direct = sys_iconv;
 331                 ret->cd_direct = ret->cd_push;
 332                 ret->cd_push = NULL;
 333                 return ret;
 334         }
 335         if (is_utf16(tocode)) {
 336                 ret->direct = sys_iconv;
 337                 ret->cd_direct = ret->cd_pull;
 338                 ret->cd_pull = NULL;
 339                 return ret;
 340         }
 341 #endif
 342
 343         /* the general case has to go via a buffer */
 344         if (!ret->pull) ret->pull = from->pull;
 345         if (!ret->push) ret->push = to->push;
 346         return ret;
 347
 348 failed:
 349         talloc_free(ret);
 350         errno = EINVAL;
 351         return (smb_iconv_t)-1;
 352 }
 353
 354 /*
 355   simple iconv_open() wrapper
 356  */
 357 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
 358 {
 359         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
 360 }
 361
 362 /*
 363   simple iconv_close() wrapper
 364 */
 365 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
 366 {
 367         talloc_free(cd);
 368         return 0;
 369 }
 370
 371
 372 /**********************************************************************
 373  the following functions implement the builtin character sets in Samba
 374  and also the "test" character sets that are designed to test
 375  multi-byte character set support for english users
 376 ***********************************************************************/
 377
 378 /*
 379   this takes an ASCII sequence and produces a UTF16 sequence
 380
 381   The first 127 codepoints of latin1 matches the first 127 codepoints
 382   of unicode, and so can be put into the first byte of UTF16LE
 383
 384  */
 385
 386 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 387                          char **outbuf, size_t *outbytesleft)
 388 {
 389         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 390                 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0]) {
 391                         /* If this is multi-byte, then it isn't legal ASCII */
 392                         errno = EILSEQ;
 393                         return -1;
 394                 }
 395                 (*outbuf)[0] = (*inbuf)[0];
 396                 (*outbuf)[1] = 0;
 397                 (*inbytesleft)  -= 1;
 398                 (*outbytesleft) -= 2;
 399                 (*inbuf)  += 1;
 400                 (*outbuf) += 2;
 401         }
 402
 403         if (*inbytesleft > 0) {
 404                 errno = E2BIG;
 405                 return -1;
 406         }
 407
 408         return 0;
 409 }
 410
 411 /*
 412   this takes a UTF16 sequence and produces an ASCII sequence
 413
 414   The first 127 codepoints of ASCII matches the first 127 codepoints
 415   of unicode, and so can be read directly from the first byte of UTF16LE
 416
 417  */
 418 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
 419                          char **outbuf, size_t *outbytesleft)
 420 {
 421         int ir_count=0;
 422
 423         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 424                 if (((*inbuf)[0] & 0x7F) != (*inbuf)[0] ||
 425                         (*inbuf)[1] != 0) {
 426                         /* If this is multi-byte, then it isn't legal ASCII */
 427                         errno = EILSEQ;
 428                         return -1;
 429                 }
 430                 (*outbuf)[0] = (*inbuf)[0];
 431                 (*inbytesleft)  -= 2;
 432                 (*outbytesleft) -= 1;
 433                 (*inbuf)  += 2;
 434                 (*outbuf) += 1;
 435         }
 436
 437         if (*inbytesleft == 1) {
 438                 errno = EINVAL;
 439                 return -1;
 440         }
 441
 442         if (*inbytesleft > 1) {
 443                 errno = E2BIG;
 444                 return -1;
 445         }
 446
 447         return ir_count;
 448 }
 449
 450 /*
 451   this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence
 452
 453   The first 256 codepoints of latin1 matches the first 256 codepoints
 454   of unicode, and so can be put into the first byte of UTF16LE
 455
 456  */
 457 static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 458                           char **outbuf, size_t *outbytesleft)
 459 {
 460         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 461                 (*outbuf)[0] = (*inbuf)[0];
 462                 (*outbuf)[1] = 0;
 463                 (*inbytesleft)  -= 1;
 464                 (*outbytesleft) -= 2;
 465                 (*inbuf)  += 1;
 466                 (*outbuf) += 2;
 467         }
 468
 469         if (*inbytesleft > 0) {
 470                 errno = E2BIG;
 471                 return -1;
 472         }
 473
 474         return 0;
 475 }
 476
 477 /*
 478   this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence
 479
 480   The first 256 codepoints of latin1 matches the first 256 codepoints
 481   of unicode, and so can be read directly from the first byte of UTF16LE
 482
 483  */
 484 static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
 485                          char **outbuf, size_t *outbytesleft)
 486 {
 487         int ir_count=0;
 488
 489         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 490                 (*outbuf)[0] = (*inbuf)[0];
 491                 if ((*inbuf)[1] != 0) {
 492                         /* If this is multi-byte, then it isn't legal latin1 */
 493                         errno = EILSEQ;
 494                         return -1;
 495                 }
 496                 (*inbytesleft)  -= 2;
 497                 (*outbytesleft) -= 1;
 498                 (*inbuf)  += 2;
 499                 (*outbuf) += 1;
 500         }
 501
 502         if (*inbytesleft == 1) {
 503                 errno = EINVAL;
 504                 return -1;
 505         }
 506
 507         if (*inbytesleft > 1) {
 508                 errno = E2BIG;
 509                 return -1;
 510         }
 511
 512         return ir_count;
 513 }
 514
 515 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 516                          char **outbuf, size_t *outbytesleft)
 517 {
 518         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 519                 unsigned int v;
 520
 521                 if ((*inbuf)[0] != '@') {
 522                         /* seven bit ascii case */
 523                         (*outbuf)[0] = (*inbuf)[0];
 524                         (*outbuf)[1] = 0;
 525                         (*inbytesleft)  -= 1;
 526                         (*outbytesleft) -= 2;
 527                         (*inbuf)  += 1;
 528                         (*outbuf) += 2;
 529                         continue;
 530                 }
 531                 /* it's a hex character */
 532                 if (*inbytesleft < 5) {
 533                         errno = EINVAL;
 534                         return -1;
 535                 }
 536
 537                 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) {
 538                         errno = EILSEQ;
 539                         return -1;
 540                 }
 541
 542                 (*outbuf)[0] = v&0xff;
 543                 (*outbuf)[1] = v>>8;
 544                 (*inbytesleft)  -= 5;
 545                 (*outbytesleft) -= 2;
 546                 (*inbuf)  += 5;
 547                 (*outbuf) += 2;
 548         }
 549
 550         if (*inbytesleft > 0) {
 551                 errno = E2BIG;
 552                 return -1;
 553         }
 554
 555         return 0;
 556 }
 557
 558 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
 559                            char **outbuf, size_t *outbytesleft)
 560 {
 561         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 562                 char buf[6];
 563
 564                 if ((*inbuf)[1] == 0 &&
 565                     ((*inbuf)[0] & 0x80) == 0 &&
 566                     (*inbuf)[0] != '@') {
 567                         (*outbuf)[0] = (*inbuf)[0];
 568                         (*inbytesleft)  -= 2;
 569                         (*outbytesleft) -= 1;
 570                         (*inbuf)  += 2;
 571                         (*outbuf) += 1;
 572                         continue;
 573                 }
 574                 if (*outbytesleft < 5) {
 575                         errno = E2BIG;
 576                         return -1;
 577                 }
 578                 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
 579                 memcpy(*outbuf, buf, 5);
 580                 (*inbytesleft)  -= 2;
 581                 (*outbytesleft) -= 5;
 582                 (*inbuf)  += 2;
 583                 (*outbuf) += 5;
 584         }
 585
 586         if (*inbytesleft == 1) {
 587                 errno = EINVAL;
 588                 return -1;
 589         }
 590
 591         if (*inbytesleft > 1) {
 592                 errno = E2BIG;
 593                 return -1;
 594         }
 595
 596         return 0;
 597 }
 598
 599 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
 600                          char **outbuf, size_t *outbytesleft)
 601 {
 602         int n;
 603
 604         n = MIN(*inbytesleft, *outbytesleft);
 605
 606         swab(*inbuf, *outbuf, (n&~1));
 607         if (n&1) {
 608                 (*outbuf)[n-1] = 0;
 609         }
 610
 611         (*inbytesleft) -= n;
 612         (*outbytesleft) -= n;
 613         (*inbuf) += n;
 614         (*outbuf) += n;
 615
 616         if (*inbytesleft > 0) {
 617                 errno = E2BIG;
 618                 return -1;
 619         }
 620
 621         return 0;
 622 }
 623
 624
 625 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
 626                          char **outbuf, size_t *outbytesleft)
 627 {
 628         int n;
 629
 630         n = MIN(*inbytesleft, *outbytesleft);
 631
 632         memmove(*outbuf, *inbuf, n);
 633
 634         (*inbytesleft) -= n;
 635         (*outbytesleft) -= n;
 636         (*inbuf) += n;
 637         (*outbuf) += n;
 638
 639         if (*inbytesleft > 0) {
 640                 errno = E2BIG;
 641                 return -1;
 642         }
 643
 644         return 0;
 645 }
 646
 647 /*
 648   this takes a UTF8 sequence and produces a UTF16 sequence
 649  */
 650 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 651                          char **outbuf, size_t *outbytesleft)
 652 {
 653         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 654         const uint8_t *c = (const uint8_t *)*inbuf;
 655         uint8_t *uc = (uint8_t *)*outbuf;
 656
 657         while (in_left >= 1 && out_left >= 2) {
 658                 if ((c[0] & 0x80) == 0) {
 659                         uc[0] = c[0];
 660                         uc[1] = 0;
 661                         c  += 1;
 662                         in_left  -= 1;
 663                         out_left -= 2;
 664                         uc += 2;
 665                         continue;
 666                 }
 667
 668                 if ((c[0] & 0xe0) == 0xc0) {
 669                         if (in_left < 2 ||
 670                             (c[1] & 0xc0) != 0x80) {
 671                                 errno = EILSEQ;
 672                                 goto error;
 673                         }
 674                         uc[1] = (c[0]>>2) & 0x7;
 675                         uc[0] = (c[0]<<6) | (c[1]&0x3f);
 676                         c  += 2;
 677                         in_left  -= 2;
 678                         out_left -= 2;
 679                         uc += 2;
 680                         continue;
 681                 }
 682
 683                 if ((c[0] & 0xf0) == 0xe0) {
 684                         if (in_left < 3 ||
 685                             (c[1] & 0xc0) != 0x80 ||
 686                             (c[2] & 0xc0) != 0x80) {
 687                                 errno = EILSEQ;
 688                                 goto error;
 689                         }
 690                         uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
 691                         uc[0] = (c[1]<<6) | (c[2]&0x3f);
 692                         c  += 3;
 693                         in_left  -= 3;
 694                         out_left -= 2;
 695                         uc += 2;
 696                         continue;
 697                 }
 698
 699                 if ((c[0] & 0xf8) == 0xf0) {
 700                         unsigned int codepoint;
 701                         if (in_left < 4 ||
 702                             (c[1] & 0xc0) != 0x80 ||
 703                             (c[2] & 0xc0) != 0x80 ||
 704                             (c[3] & 0xc0) != 0x80) {
 705                                 errno = EILSEQ;
 706                                 goto error;
 707                         }
 708                         codepoint =
 709                                 (c[3]&0x3f) |
 710                                 ((c[2]&0x3f)<<6) |
 711                                 ((c[1]&0x3f)<<12) |
 712                                 ((c[0]&0x7)<<18);
 713                         if (codepoint < 0x10000) {
 714                                 /* accept UTF-8 characters that are not
 715                                    minimally packed, but pack the result */
 716                                 uc[0] = (codepoint & 0xFF);
 717                                 uc[1] = (codepoint >> 8);
 718                                 c += 4;
 719                                 in_left -= 4;
 720                                 out_left -= 2;
 721                                 uc += 2;
 722                                 continue;
 723                         }
 724
 725                         codepoint -= 0x10000;
 726
 727                         if (out_left < 4) {
 728                                 errno = E2BIG;
 729                                 goto error;
 730                         }
 731
 732                         uc[0] = (codepoint>>10) & 0xFF;
 733                         uc[1] = (codepoint>>18) | 0xd8;
 734                         uc[2] = codepoint & 0xFF;
 735                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
 736                         c  += 4;
 737                         in_left  -= 4;
 738                         out_left -= 4;
 739                         uc += 4;
 740                         continue;
 741                 }
 742
 743                 /* we don't handle 5 byte sequences */
 744                 errno = EINVAL;
 745                 goto error;
 746         }
 747
 748         if (in_left > 0) {
 749                 errno = E2BIG;
 750                 goto error;
 751         }
 752
 753         *inbytesleft = in_left;
 754         *outbytesleft = out_left;
 755         *inbuf = (const char *)c;
 756         *outbuf = (char *)uc;
 757         return 0;
 758
 759 error:
 760         *inbytesleft = in_left;
 761         *outbytesleft = out_left;
 762         *inbuf = (const char *)c;
 763         *outbuf = (char *)uc;
 764         return -1;
 765 }
 766
 767
 768 /*
 769   this takes a UTF16 sequence and produces a UTF8 sequence
 770  */
 771 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
 772                         char **outbuf, size_t *outbytesleft)
 773 {
 774         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 775         uint8_t *c = (uint8_t *)*outbuf;
 776         const uint8_t *uc = (const uint8_t *)*inbuf;
 777
 778         while (in_left >= 2 && out_left >= 1) {
 779                 unsigned int codepoint;
 780
 781                 if (uc[1] == 0 && !(uc[0] & 0x80)) {
 782                         /* simplest case */
 783                         c[0] = uc[0];
 784                         in_left  -= 2;
 785                         out_left -= 1;
 786                         uc += 2;
 787                         c  += 1;
 788                         continue;
 789                 }
 790
 791                 if ((uc[1]&0xf8) == 0) {
 792                         /* next simplest case */
 793                         if (out_left < 2) {
 794                                 errno = E2BIG;
 795                                 goto error;
 796                         }
 797                         c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
 798                         c[1] = 0x80 | (uc[0] & 0x3f);
 799                         in_left  -= 2;
 800                         out_left -= 2;
 801                         uc += 2;
 802                         c  += 2;
 803                         continue;
 804                 }
 805
 806                 if ((uc[1] & 0xfc) == 0xdc) {
 807                         /* its the second part of a 4 byte sequence. Illegal */
 808                         if (in_left < 4) {
 809                                 errno = EINVAL;
 810                         } else {
 811                                 errno = EILSEQ;
 812                         }
 813                         goto error;
 814                 }
 815
 816                 if ((uc[1] & 0xfc) != 0xd8) {
 817                         codepoint = uc[0] | (uc[1]<<8);
 818                         if (out_left < 3) {
 819                                 errno = E2BIG;
 820                                 goto error;
 821                         }
 822                         c[0] = 0xe0 | (codepoint >> 12);
 823                         c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
 824                         c[2] = 0x80 | (codepoint & 0x3f);
 825
 826                         in_left  -= 2;
 827                         out_left -= 3;
 828                         uc  += 2;
 829                         c   += 3;
 830                         continue;
 831                 }
 832
 833                 /* its the first part of a 4 byte sequence */
 834                 if (in_left < 4) {
 835                         errno = EINVAL;
 836                         goto error;
 837                 }
 838                 if ((uc[3] & 0xfc) != 0xdc) {
 839                         errno = EILSEQ;
 840                         goto error;
 841                 }
 842                 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
 843                                        (uc[0]<<10) | ((uc[1] & 0x3)<<18));
 844
 845                 if (out_left < 4) {
 846                         errno = E2BIG;
 847                         goto error;
 848                 }
 849                 c[0] = 0xf0 | (codepoint >> 18);
 850                 c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
 851                 c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
 852                 c[3] = 0x80 | (codepoint & 0x3f);
 853
 854                 in_left  -= 4;
 855                 out_left -= 4;
 856                 uc       += 4;
 857                 c        += 4;
 858         }
 859
 860         if (in_left == 1) {
 861                 errno = EINVAL;
 862                 goto error;
 863         }
 864
 865         if (in_left > 1) {
 866                 errno = E2BIG;
 867                 goto error;
 868         }
 869
 870         *inbytesleft = in_left;
 871         *outbytesleft = out_left;
 872         *inbuf  = (const char *)uc;
 873         *outbuf = (char *)c;
 874
 875         return 0;
 876
 877 error:
 878         *inbytesleft = in_left;
 879         *outbytesleft = out_left;
 880         *inbuf  = (const char *)uc;
 881         *outbuf = (char *)c;
 882         return -1;
 883 }
 884
 885
 886 /*
 887   this takes a UTF16 munged sequence, modifies it according to the
 888   string2key rules, and produces a UTF16 sequence
 889
 890 The rules are:
 891
 892     1) any 0x0000 characters are mapped to 0x0001
 893
 894     2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
 895        without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
 896        U+FFFD (REPLACEMENT CHARACTER).
 897
 898     3) the same for any low surrogate that was not preceded by a high surrogate.
 899
 900  */
 901 static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 902                                char **outbuf, size_t *outbytesleft)
 903 {
 904         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 905         uint8_t *c = (uint8_t *)*outbuf;
 906         const uint8_t *uc = (const uint8_t *)*inbuf;
 907
 908         while (in_left >= 2 && out_left >= 2) {
 909                 unsigned int codepoint = uc[0] | (uc[1]<<8);
 910
 911                 if (codepoint == 0) {
 912                         codepoint = 1;
 913                 }
 914
 915                 if ((codepoint & 0xfc00) == 0xd800) {
 916                         /* a high surrogate */
 917                         unsigned int codepoint2;
 918                         if (in_left < 4) {
 919                                 codepoint = 0xfffd;
 920                                 goto codepoint16;
 921                         }
 922                         codepoint2 = uc[2] | (uc[3]<<8);
 923                         if ((codepoint2 & 0xfc00) != 0xdc00) {
 924                                 /* high surrogate not followed by low
 925                                    surrogate: convert to 0xfffd */
 926                                 codepoint = 0xfffd;
 927                                 goto codepoint16;
 928                         }
 929                         if (out_left < 4) {
 930                                 errno = E2BIG;
 931                                 goto error;
 932                         }
 933                         memcpy(c, uc, 4);
 934                         in_left  -= 4;
 935                         out_left -= 4;
 936                         uc       += 4;
 937                         c        += 4;
 938                         continue;
 939                 }
 940
 941                 if ((codepoint & 0xfc00) == 0xdc00) {
 942                         /* low surrogate not preceded by high
 943                            surrogate: convert to 0xfffd */
 944                         codepoint = 0xfffd;
 945                 }
 946
 947         codepoint16:
 948                 c[0] = codepoint & 0xFF;
 949                 c[1] = (codepoint>>8) & 0xFF;
 950
 951                 in_left  -= 2;
 952                 out_left -= 2;
 953                 uc  += 2;
 954                 c   += 2;
 955                 continue;
 956         }
 957
 958         if (in_left == 1) {
 959                 errno = EINVAL;
 960                 goto error;
 961         }
 962
 963         if (in_left > 1) {
 964                 errno = E2BIG;
 965                 goto error;
 966         }
 967
 968         *inbytesleft = in_left;
 969         *outbytesleft = out_left;
 970         *inbuf  = (const char *)uc;
 971         *outbuf = (char *)c;
 972
 973         return 0;
 974
 975 error:
 976         *inbytesleft = in_left;
 977         *outbytesleft = out_left;
 978         *inbuf  = (const char *)uc;
 979         *outbuf = (char *)c;
 980         return -1;
 981 }
 982
 983
 984