source4/lib/charset/util_unistr.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program; if not, write to the Free Software
  19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20 */
  21
  22 #include "includes.h"
  23 #include "system/locale.h"
  24 #include "dynconfig.h"
  25
  26 /**
  27  * @file
  28  * @brief Unicode string manipulation
  29  */
  30
  31 /* these 2 tables define the unicode case handling.  They are loaded
  32    at startup either via mmap() or read() from the lib directory */
  33 static void *upcase_table;
  34 static void *lowcase_table;
  35
  36
  37 /*******************************************************************
  38 load the case handling tables
  39 ********************************************************************/
  40 static void load_case_tables(void)
  41 {
  42         TALLOC_CTX *mem_ctx;
  43
  44         mem_ctx = talloc_init("load_case_tables");
  45         if (!mem_ctx) {
  46                 smb_panic("No memory for case_tables");
  47         }
  48         upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000);
  49         lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000);
  50         talloc_free(mem_ctx);
  51         if (upcase_table == NULL) {
  52                 /* try also under codepages for testing purposes */
  53                 upcase_table = map_file("codepages/upcase.dat", 0x20000);
  54                 if (upcase_table == NULL) {
  55                         upcase_table = (void *)-1;
  56                 }
  57         }
  58         if (lowcase_table == NULL) {
  59                 /* try also under codepages for testing purposes */
  60                 lowcase_table = map_file("codepages/lowcase.dat", 0x20000);
  61                 if (lowcase_table == NULL) {
  62                         lowcase_table = (void *)-1;
  63                 }
  64         }
  65 }
  66
  67 /**
  68  Convert a codepoint_t to upper case.
  69 **/
  70 codepoint_t toupper_w(codepoint_t val)
  71 {
  72         if (val < 128) {
  73                 return toupper(val);
  74         }
  75         if (upcase_table == NULL) {
  76                 load_case_tables();
  77         }
  78         if (upcase_table == (void *)-1) {
  79                 return val;
  80         }
  81         if (val & 0xFFFF0000) {
  82                 return val;
  83         }
  84         return SVAL(upcase_table, val*2);
  85 }
  86
  87 /**
  88  Convert a codepoint_t to lower case.
  89 **/
  90 codepoint_t tolower_w(codepoint_t val)
  91 {
  92         if (val < 128) {
  93                 return tolower(val);
  94         }
  95         if (lowcase_table == NULL) {
  96                 load_case_tables();
  97         }
  98         if (lowcase_table == (void *)-1) {
  99                 return val;
 100         }
 101         if (val & 0xFFFF0000) {
 102                 return val;
 103         }
 104         return SVAL(lowcase_table, val*2);
 105 }
 106
 107 /**
 108   compare two codepoints case insensitively
 109 */
 110 int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
 111 {
 112         if (c1 == c2 ||
 113             toupper_w(c1) == toupper_w(c2)) {
 114                 return 0;
 115         }
 116         return c1 - c2;
 117 }
 118
 119 /**
 120  Case insensitive string compararison
 121 **/
 122 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
 123 {
 124         codepoint_t c1=0, c2=0;
 125         size_t size1, size2;
 126
 127         /* handle null ptr comparisons to simplify the use in qsort */
 128         if (s1 == s2) return 0;
 129         if (s1 == NULL) return -1;
 130         if (s2 == NULL) return 1;
 131
 132         while (*s1 && *s2) {
 133                 c1 = next_codepoint(s1, &size1);
 134                 c2 = next_codepoint(s2, &size2);
 135
 136                 s1 += size1;
 137                 s2 += size2;
 138
 139                 if (c1 == c2) {
 140                         continue;
 141                 }
 142
 143                 if (c1 == INVALID_CODEPOINT ||
 144                     c2 == INVALID_CODEPOINT) {
 145                         /* what else can we do?? */
 146                         return strcasecmp(s1, s2);
 147                 }
 148
 149                 if (toupper_w(c1) != toupper_w(c2)) {
 150                         return c1 - c2;
 151                 }
 152         }
 153
 154         return *s1 - *s2;
 155 }
 156
 157 /**
 158  * Get the next token from a string, return False if none found.
 159  * Handles double-quotes.
 160  *
 161  * Based on a routine by GJC@VILLAGE.COM.
 162  * Extensively modified by Andrew.Tridgell@anu.edu.au
 163  **/
 164 _PUBLIC_ BOOL next_token(const char **ptr,char *buff, const char *sep, size_t bufsize)
 165 {
 166         const char *s;
 167         BOOL quoted;
 168         size_t len=1;
 169
 170         if (!ptr)
 171                 return(False);
 172
 173         s = *ptr;
 174
 175         /* default to simple separators */
 176         if (!sep)
 177                 sep = " \t\n\r";
 178
 179         /* find the first non sep char */
 180         while (*s && strchr_m(sep,*s))
 181                 s++;
 182
 183         /* nothing left? */
 184         if (! *s)
 185                 return(False);
 186
 187         /* copy over the token */
 188         for (quoted = False; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) {
 189                 if (*s == '\"') {
 190                         quoted = !quoted;
 191                 } else {
 192                         len++;
 193                         *buff++ = *s;
 194                 }
 195         }
 196
 197         *ptr = (*s) ? s+1 : s;
 198         *buff = 0;
 199
 200         return(True);
 201 }
 202
 203 /**
 204  Case insensitive string compararison, length limited
 205 **/
 206 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 207 {
 208         codepoint_t c1=0, c2=0;
 209         size_t size1, size2;
 210
 211         /* handle null ptr comparisons to simplify the use in qsort */
 212         if (s1 == s2) return 0;
 213         if (s1 == NULL) return -1;
 214         if (s2 == NULL) return 1;
 215
 216         while (*s1 && *s2 && n) {
 217                 n--;
 218
 219                 c1 = next_codepoint(s1, &size1);
 220                 c2 = next_codepoint(s2, &size2);
 221
 222                 s1 += size1;
 223                 s2 += size2;
 224
 225                 if (c1 == c2) {
 226                         continue;
 227                 }
 228
 229                 if (c1 == INVALID_CODEPOINT ||
 230                     c2 == INVALID_CODEPOINT) {
 231                         /* what else can we do?? */
 232                         return strcasecmp(s1, s2);
 233                 }
 234
 235                 if (toupper_w(c1) != toupper_w(c2)) {
 236                         return c1 - c2;
 237                 }
 238         }
 239
 240         if (n == 0) {
 241                 return 0;
 242         }
 243
 244         return *s1 - *s2;
 245 }
 246
 247 /**
 248  * Compare 2 strings.
 249  *
 250  * @note The comparison is case-insensitive.
 251  **/
 252 _PUBLIC_ BOOL strequal_w(const char *s1, const char *s2)
 253 {
 254         return strcasecmp_m(s1,s2) == 0;
 255 }
 256
 257 /**
 258  Compare 2 strings (case sensitive).
 259 **/
 260 _PUBLIC_ BOOL strcsequal_w(const char *s1,const char *s2)
 261 {
 262         if (s1 == s2)
 263                 return(True);
 264         if (!s1 || !s2)
 265                 return(False);
 266
 267         return strcmp(s1,s2) == 0;
 268 }
 269
 270
 271 /**
 272  String replace.
 273  NOTE: oldc and newc must be 7 bit characters
 274 **/
 275 _PUBLIC_ void string_replace_w(char *s, char oldc, char newc)
 276 {
 277         while (s && *s) {
 278                 size_t size;
 279                 codepoint_t c = next_codepoint(s, &size);
 280                 if (c == oldc) {
 281                         *s = newc;
 282                 }
 283                 s += size;
 284         }
 285 }
 286
 287 /**
 288  Paranoid strcpy into a buffer of given length (includes terminating
 289  zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars
 290  and replaces with '_'. Deliberately does *NOT* check for multibyte
 291  characters. Don't change it !
 292 **/
 293
 294 _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength)
 295 {
 296         size_t len, i;
 297
 298         if (maxlength == 0) {
 299                 /* can't fit any bytes at all! */
 300                 return NULL;
 301         }
 302
 303         if (!dest) {
 304                 DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n"));
 305                 return NULL;
 306         }
 307
 308         if (!src) {
 309                 *dest = 0;
 310                 return dest;
 311         }
 312
 313         len = strlen(src);
 314         if (len >= maxlength)
 315                 len = maxlength - 1;
 316
 317         if (!other_safe_chars)
 318                 other_safe_chars = "";
 319
 320         for(i = 0; i < len; i++) {
 321                 int val = (src[i] & 0xff);
 322                 if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val))
 323                         dest[i] = src[i];
 324                 else
 325                         dest[i] = '_';
 326         }
 327
 328         dest[i] = '\0';
 329
 330         return dest;
 331 }
 332
 333 /**
 334  Count the number of UCS2 characters in a string. Normally this will
 335  be the same as the number of bytes in a string for single byte strings,
 336  but will be different for multibyte.
 337 **/
 338 _PUBLIC_ size_t strlen_m(const char *s)
 339 {
 340         size_t count = 0;
 341
 342         if (!s) {
 343                 return 0;
 344         }
 345
 346         while (*s && !(((uint8_t)*s) & 0x80)) {
 347                 s++;
 348                 count++;
 349         }
 350
 351         if (!*s) {
 352                 return count;
 353         }
 354
 355         while (*s) {
 356                 size_t c_size;
 357                 codepoint_t c = next_codepoint(s, &c_size);
 358                 if (c < 0x10000) {
 359                         count += 1;
 360                 } else {
 361                         count += 2;
 362                 }
 363                 s += c_size;
 364         }
 365
 366         return count;
 367 }
 368
 369 /**
 370    Work out the number of multibyte chars in a string, including the NULL
 371    terminator.
 372 **/
 373 _PUBLIC_ size_t strlen_m_term(const char *s)
 374 {
 375         if (!s) {
 376                 return 0;
 377         }
 378
 379         return strlen_m(s) + 1;
 380 }
 381
 382 /**
 383  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 384 **/
 385 _PUBLIC_ char *strchr_m(const char *s, char c)
 386 {
 387         /* characters below 0x3F are guaranteed to not appear in
 388            non-initial position in multi-byte charsets */
 389         if ((c & 0xC0) == 0) {
 390                 return strchr(s, c);
 391         }
 392
 393         while (*s) {
 394                 size_t size;
 395                 codepoint_t c2 = next_codepoint(s, &size);
 396                 if (c2 == c) {
 397                         return discard_const(s);
 398                 }
 399                 s += size;
 400         }
 401
 402         return NULL;
 403 }
 404
 405 /**
 406  * Multibyte-character version of strrchr
 407  */
 408 _PUBLIC_ char *strrchr_m(const char *s, char c)
 409 {
 410         char *ret = NULL;
 411
 412         /* characters below 0x3F are guaranteed to not appear in
 413            non-initial position in multi-byte charsets */
 414         if ((c & 0xC0) == 0) {
 415                 return strrchr(s, c);
 416         }
 417
 418         while (*s) {
 419                 size_t size;
 420                 codepoint_t c2 = next_codepoint(s, &size);
 421                 if (c2 == c) {
 422                         ret = discard_const(s);
 423                 }
 424                 s += size;
 425         }
 426
 427         return ret;
 428 }
 429
 430 /**
 431   return True if any (multi-byte) character is lower case
 432 */
 433 _PUBLIC_ BOOL strhaslower(const char *string)
 434 {
 435         while (*string) {
 436                 size_t c_size;
 437                 codepoint_t s;
 438                 codepoint_t t;
 439
 440                 s = next_codepoint(string, &c_size);
 441                 string += c_size;
 442
 443                 t = toupper_w(s);
 444
 445                 if (s != t) {
 446                         return True; /* that means it has lower case chars */
 447                 }
 448         }
 449
 450         return False;
 451 }
 452
 453 /**
 454   return True if any (multi-byte) character is upper case
 455 */
 456 _PUBLIC_ BOOL strhasupper(const char *string)
 457 {
 458         while (*string) {
 459                 size_t c_size;
 460                 codepoint_t s;
 461                 codepoint_t t;
 462
 463                 s = next_codepoint(string, &c_size);
 464                 string += c_size;
 465
 466                 t = tolower_w(s);
 467
 468                 if (s != t) {
 469                         return True; /* that means it has upper case chars */
 470                 }
 471         }
 472
 473         return False;
 474 }
 475
 476 /**
 477  Convert a string to lower case, allocated with talloc
 478 **/
 479 _PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src)
 480 {
 481         size_t size=0;
 482         char *dest;
 483
 484         /* this takes advantage of the fact that upper/lower can't
 485            change the length of a character by more than 1 byte */
 486         dest = talloc_size(ctx, 2*(strlen(src))+1);
 487         if (dest == NULL) {
 488                 return NULL;
 489         }
 490
 491         while (*src) {
 492                 size_t c_size;
 493                 codepoint_t c = next_codepoint(src, &c_size);
 494                 src += c_size;
 495
 496                 c = tolower_w(c);
 497
 498                 c_size = push_codepoint(dest+size, c);
 499                 if (c_size == -1) {
 500                         talloc_free(dest);
 501                         return NULL;
 502                 }
 503                 size += c_size;
 504         }
 505
 506         dest[size] = 0;
 507
 508         /* trim it so talloc_append_string() works */
 509         dest = talloc_realloc_size(ctx, dest, size+1);
 510
 511         talloc_set_name_const(dest, dest);
 512
 513         return dest;
 514 }
 515
 516 /**
 517  Convert a string to UPPER case, allocated with talloc
 518 **/
 519 _PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src)
 520 {
 521         size_t size=0;
 522         char *dest;
 523
 524         if (!src) {
 525                 return NULL;
 526         }
 527
 528         /* this takes advantage of the fact that upper/lower can't
 529            change the length of a character by more than 1 byte */
 530         dest = talloc_size(ctx, 2*(strlen(src))+1);
 531         if (dest == NULL) {
 532                 return NULL;
 533         }
 534
 535         while (*src) {
 536                 size_t c_size;
 537                 codepoint_t c = next_codepoint(src, &c_size);
 538                 src += c_size;
 539
 540                 c = toupper_w(c);
 541
 542                 c_size = push_codepoint(dest+size, c);
 543                 if (c_size == -1) {
 544                         talloc_free(dest);
 545                         return NULL;
 546                 }
 547                 size += c_size;
 548         }
 549
 550         dest[size] = 0;
 551
 552         /* trim it so talloc_append_string() works */
 553         dest = talloc_realloc_size(ctx, dest, size+1);
 554
 555         talloc_set_name_const(dest, dest);
 556
 557         return dest;
 558 }
 559
 560 /**
 561  Convert a string to lower case.
 562 **/
 563 _PUBLIC_ void strlower_m(char *s)
 564 {
 565         char *d;
 566
 567         /* this is quite a common operation, so we want it to be
 568            fast. We optimise for the ascii case, knowing that all our
 569            supported multi-byte character sets are ascii-compatible
 570            (ie. they match for the first 128 chars) */
 571         while (*s && !(((uint8_t)*s) & 0x80)) {
 572                 *s = tolower((uint8_t)*s);
 573                 s++;
 574         }
 575
 576         if (!*s)
 577                 return;
 578
 579         d = s;
 580
 581         while (*s) {
 582                 size_t c_size, c_size2;
 583                 codepoint_t c = next_codepoint(s, &c_size);
 584                 c_size2 = push_codepoint(d, tolower_w(c));
 585                 if (c_size2 > c_size) {
 586                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n",
 587                                  c, tolower_w(c), (int)c_size, (int)c_size2));
 588                         smb_panic("codepoint expansion in strlower_m\n");
 589                 }
 590                 s += c_size;
 591                 d += c_size2;
 592         }
 593         *d = 0;
 594 }
 595
 596 /**
 597  Convert a string to UPPER case.
 598 **/
 599 _PUBLIC_ void strupper_m(char *s)
 600 {
 601         char *d;
 602
 603         /* this is quite a common operation, so we want it to be
 604            fast. We optimise for the ascii case, knowing that all our
 605            supported multi-byte character sets are ascii-compatible
 606            (ie. they match for the first 128 chars) */
 607         while (*s && !(((uint8_t)*s) & 0x80)) {
 608                 *s = toupper((uint8_t)*s);
 609                 s++;
 610         }
 611
 612         if (!*s)
 613                 return;
 614
 615         d = s;
 616
 617         while (*s) {
 618                 size_t c_size, c_size2;
 619                 codepoint_t c = next_codepoint(s, &c_size);
 620                 c_size2 = push_codepoint(d, toupper_w(c));
 621                 if (c_size2 > c_size) {
 622                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n",
 623                                  c, toupper_w(c), (int)c_size, (int)c_size2));
 624                         smb_panic("codepoint expansion in strupper_m\n");
 625                 }
 626                 s += c_size;
 627                 d += c_size2;
 628         }
 629         *d = 0;
 630 }
 631
 632
 633 /**
 634  Find the number of 'c' chars in a string
 635 **/
 636 _PUBLIC_ size_t count_chars_w(const char *s, char c)
 637 {
 638         size_t count = 0;
 639
 640         while (*s) {
 641                 size_t size;
 642                 codepoint_t c2 = next_codepoint(s, &size);
 643                 if (c2 == c) count++;
 644                 s += size;
 645         }
 646
 647         return count;
 648 }
 649
 650