lib/util/charset/codepoints.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Character set conversion Extensions
   4    Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
   5    Copyright (C) Andrew Tridgell 2001
   6    Copyright (C) Simo Sorce 2001
   7    Copyright (C) Jelmer Vernooij 2007
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 */
  23 #include "includes.h"
  24 #include "lib/util/charset/charset.h"
  25 #include "system/locale.h"
  26 #include "dynconfig/dynconfig.h"
  27
  28 #ifdef strcasecmp
  29 #undef strcasecmp
  30 #endif
  31
  32 /**
  33  * @file
  34  * @brief Unicode string manipulation
  35  */
  36
   37 /* These 2 tables define the unicode case handling.  They are loaded
   38    by load_case_tables_library(), either explicitly or on first use from
         toupper_m()/tolower_m().  NULL means "not loaded yet"; (void *)-1 means
         the load failed and only the ASCII case rules are applied. */
   39 static void *upcase_table;
   40 static void *lowcase_table;
  41
  42
  43 /*******************************************************************
  44 load the case handling tables
  45
  46 This is the function that should be called from library code.
  47 ********************************************************************/
  48 void load_case_tables_library(void)
  49 {
  50         TALLOC_CTX *mem_ctx;
  51
  52         mem_ctx = talloc_init("load_case_tables");
  53         if (!mem_ctx) {
  54                 smb_panic("No memory for case_tables");
  55         }
  56         upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
  57         lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
  58         talloc_free(mem_ctx);
  59         if (upcase_table == NULL) {
  60                 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
  61                 upcase_table = (void *)-1;
  62         }
  63         if (lowcase_table == NULL) {
  64                 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
  65                 lowcase_table = (void *)-1;
  66         }
  67 }
  68
  69 /*******************************************************************
  70 load the case handling tables
  71
  72 This MUST only be called from main() in application code, never from a
  73 library.  We don't know if the calling program has already done
  74 setlocale() to another value, and can't tell if they have.
  75 ********************************************************************/
  76 void load_case_tables(void)
  77 {
  78         /* This is a useful global hook where we can ensure that the
  79          * locale is set from the environment.  This is needed so that
  80          * we can use LOCALE as a codepage */
  81 #ifdef HAVE_SETLOCALE
  82         setlocale(LC_ALL, "");
  83 #endif
  84         load_case_tables_library();
  85 }
  86
  87 /**
  88  Convert a codepoint_t to upper case.
  89 **/
  90 _PUBLIC_ codepoint_t toupper_m(codepoint_t val)
  91 {
  92         if (val < 128) {
  93                 return toupper(val);
  94         }
  95         if (upcase_table == NULL) {
  96                 load_case_tables_library();
  97         }
  98         if (upcase_table == (void *)-1) {
  99                 return val;
 100         }
 101         if (val & 0xFFFF0000) {
 102                 return val;
 103         }
 104         return SVAL(upcase_table, val*2);
 105 }
 106
 107 /**
 108  Convert a codepoint_t to lower case.
 109 **/
 110 _PUBLIC_ codepoint_t tolower_m(codepoint_t val)
 111 {
 112         if (val < 128) {
 113                 return tolower(val);
 114         }
 115         if (lowcase_table == NULL) {
 116                 load_case_tables_library();
 117         }
 118         if (lowcase_table == (void *)-1) {
 119                 return val;
 120         }
 121         if (val & 0xFFFF0000) {
 122                 return val;
 123         }
 124         return SVAL(lowcase_table, val*2);
 125 }
 126
 127 /**
 128  If we upper cased this character, would we get the same character?
 129 **/
 130 _PUBLIC_ bool islower_m(codepoint_t val)
 131 {
 132         return (toupper_m(val) != val);
 133 }
 134
 135 /**
 136  If we lower cased this character, would we get the same character?
 137 **/
 138 _PUBLIC_ bool isupper_m(codepoint_t val)
 139 {
 140         return (tolower_m(val) != val);
 141 }
 142
 143 /**
 144   compare two codepoints case insensitively
 145 */
 146 _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
 147 {
 148         if (c1 == c2 ||
 149             toupper_m(c1) == toupper_m(c2)) {
 150                 return 0;
 151         }
 152         return c1 - c2;
 153 }
 154
 155
  156 struct smb_iconv_handle {	/* charset names plus a cache of the conversion descriptors opened for them */
  157         TALLOC_CTX *child_ctx;	/* talloc child of the handle; parent of the charset name copies, freed on reinit */
  158         const char *unix_charset;	/* name returned by charset_name() for CH_UNIX */
  159         const char *dos_charset;	/* name returned by charset_name() for CH_DOS; get_conv_handle() may replace it with "ASCII" */
  160         const char *display_charset;	/* not referenced by the code visible here */
  161         bool use_builtin_handlers;	/* passed through to smb_iconv_open_ex() */
  162         smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];	/* [from][to] cache; NULL = not opened yet, (smb_iconv_t)-1 = open failed */
  163 };
 164
  165 struct smb_iconv_handle *global_iconv_handle = NULL;	/* default handle, created on first call to get_iconv_handle() */
 166
 167 struct smb_iconv_handle *get_iconv_handle(void)
 168 {
 169         if (global_iconv_handle == NULL)
 170                 global_iconv_handle = smb_iconv_handle_reinit(talloc_autofree_context(),
 171                                                               "ASCII", "UTF-8", true, NULL);
 172         return global_iconv_handle;
 173 }
 174
 175 struct smb_iconv_handle *get_iconv_testing_handle(TALLOC_CTX *mem_ctx,
 176                                                   const char *dos_charset,
 177                                                   const char *unix_charset,
 178                                                   bool use_builtin_handlers)
 179 {
 180         return smb_iconv_handle_reinit(mem_ctx,
 181                                        dos_charset, unix_charset, use_builtin_handlers, NULL);
 182 }
 183
 184 /**
 185  * Return the name of a charset to give to iconv().
 186  **/
 187 const char *charset_name(struct smb_iconv_handle *ic, charset_t ch)
 188 {
 189         switch (ch) {
 190         case CH_UTF16: return "UTF-16LE";
 191         case CH_UNIX: return ic->unix_charset;
 192         case CH_DOS: return ic->dos_charset;
 193         case CH_UTF8: return "UTF8";
 194         case CH_UTF16BE: return "UTF-16BE";
 195         case CH_UTF16MUNGED: return "UTF16_MUNGED";
 196         default:
 197         return "ASCII";
 198         }
 199 }
 200
 201 /**
 202  re-initialize iconv conversion descriptors
 203 **/
 204 static int close_iconv_handle(struct smb_iconv_handle *data)
 205 {
 206         unsigned c1, c2;
 207         for (c1=0;c1<NUM_CHARSETS;c1++) {
 208                 for (c2=0;c2<NUM_CHARSETS;c2++) {
 209                         if (data->conv_handles[c1][c2] != NULL) {
 210                                 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
 211                                         smb_iconv_close(data->conv_handles[c1][c2]);
 212                                 }
 213                                 data->conv_handles[c1][c2] = NULL;
 214                         }
 215                 }
 216         }
 217
 218         return 0;
 219 }
 220
 221 /*
 222   the old_ic is passed in here as the smb_iconv_handle structure
 223   is used as a global pointer in some places (eg. python modules). We
 224   don't want to invalidate those global pointers, but we do want to
 225   update them with the right charset information when loadparm
 226   runs. To do that we need to re-use the structure pointer, but
 227   re-fill the elements in the structure with the updated values
 228  */
 229 _PUBLIC_ struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
 230                                                                     const char *dos_charset,
 231                                                                     const char *unix_charset,
 232                                                                     bool use_builtin_handlers,
 233                                                                     struct smb_iconv_handle *old_ic)
 234 {
 235         struct smb_iconv_handle *ret;
 236
 237         if (old_ic != NULL) {
 238                 ret = old_ic;
 239                 close_iconv_handle(ret);
 240                 talloc_free(ret->child_ctx);
 241                 ZERO_STRUCTP(ret);
 242         } else {
 243                 ret = talloc_zero(mem_ctx, struct smb_iconv_handle);
 244         }
 245         if (ret == NULL) {
 246                 return NULL;
 247         }
 248
 249         /* we use a child context to allow us to free all ptrs without
 250            freeing the structure itself */
 251         ret->child_ctx = talloc_new(ret);
 252         if (ret->child_ctx == NULL) {
 253                 return NULL;
 254         }
 255
 256         talloc_set_destructor(ret, close_iconv_handle);
 257
 258         if (strcasecmp(dos_charset, "UTF8") == 0 || strcasecmp(dos_charset, "UTF-8") == 0) {
 259                 DEBUG(0,("ERROR: invalid DOS charset: 'dos charset' must not be UTF8, using (default value) CP850 instead\n"));
 260                 dos_charset = "CP850";
 261         }
 262
 263         ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
 264         ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
 265         ret->use_builtin_handlers = use_builtin_handlers;
 266
 267         return ret;
 268 }
 269
 270 /*
 271   on-demand initialisation of conversion handles
 272 */
 273 smb_iconv_t get_conv_handle(struct smb_iconv_handle *ic,
 274                             charset_t from, charset_t to)
 275 {
 276         const char *n1, *n2;
 277
 278         if (ic->conv_handles[from][to]) {
 279                 return ic->conv_handles[from][to];
 280         }
 281
 282         n1 = charset_name(ic, from);
 283         n2 = charset_name(ic, to);
 284
 285         ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
 286                                                        ic->use_builtin_handlers);
 287
 288         if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
 289                 if ((from == CH_DOS || to == CH_DOS) &&
 290                     strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
 291                         DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
 292                                  charset_name(ic, CH_DOS)));
 293                         ic->dos_charset = "ASCII";
 294
 295                         n1 = charset_name(ic, from);
 296                         n2 = charset_name(ic, to);
 297
 298                         ic->conv_handles[from][to] =
 299                                 smb_iconv_open_ex(ic, n2, n1, ic->use_builtin_handlers);
 300                 }
 301         }
 302
 303         return ic->conv_handles[from][to];
 304 }
 305
 306 /**
 307  * Return the unicode codepoint for the next character in the input
 308  * string in the given src_charset.
 309  * The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
 310  *
 311  * Also return the number of bytes consumed (which tells the caller
 312  * how many bytes to skip to get to the next src_charset-character).
 313  *
 314  * This is implemented (in the non-ascii-case) by first converting the
 315  * next character in the input string to UTF16_LE and then calculating
 316  * the unicode codepoint from that.
 317  *
 318  * Return INVALID_CODEPOINT if the next character cannot be converted.
 319  */
 320 _PUBLIC_ codepoint_t next_codepoint_handle_ext(
 321                         struct smb_iconv_handle *ic,
 322                         const char *str, size_t len,
 323                         charset_t src_charset,
 324                         size_t *bytes_consumed)
 325 {
 326         /* it cannot occupy more than 4 bytes in UTF16 format */
 327         uint8_t buf[4];
 328         smb_iconv_t descriptor;
 329         size_t ilen_orig;
 330         size_t ilen;
 331         size_t olen;
 332         char *outbuf;
 333
 334         if ((str[0] & 0x80) == 0) {
 335                 *bytes_consumed = 1;
 336                 return (codepoint_t)str[0];
 337         }
 338
 339         /*
 340          * we assume that no multi-byte character can take more than 5 bytes.
 341          * This is OK as we only support codepoints up to 1M (U+100000)
 342          */
 343         ilen_orig = MIN(len, 5);
 344         ilen = ilen_orig;
 345
 346         descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
 347         if (descriptor == (smb_iconv_t)-1) {
 348                 *bytes_consumed = 1;
 349                 return INVALID_CODEPOINT;
 350         }
 351
 352         /*
 353          * this looks a little strange, but it is needed to cope with
 354          * codepoints above 64k (U+1000) which are encoded as per RFC2781.
 355          */
 356         olen = 2;
 357         outbuf = (char *)buf;
 358         smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
 359         if (olen == 2) {
 360                 olen = 4;
 361                 outbuf = (char *)buf;
 362                 smb_iconv(descriptor,  &str, &ilen, &outbuf, &olen);
 363                 if (olen == 4) {
 364                         /* we didn't convert any bytes */
 365                         *bytes_consumed = 1;
 366                         return INVALID_CODEPOINT;
 367                 }
 368                 olen = 4 - olen;
 369         } else {
 370                 olen = 2 - olen;
 371         }
 372
 373         *bytes_consumed = ilen_orig - ilen;
 374
 375         if (olen == 2) {
 376                 return (codepoint_t)SVAL(buf, 0);
 377         }
 378         if (olen == 4) {
 379                 /* decode a 4 byte UTF16 character manually */
 380                 return (codepoint_t)0x10000 +
 381                         (buf[2] | ((buf[3] & 0x3)<<8) |
 382                          (buf[0]<<10) | ((buf[1] & 0x3)<<18));
 383         }
 384
 385         /* no other length is valid */
 386         return INVALID_CODEPOINT;
 387 }
 388
 389 /*
 390   return the unicode codepoint for the next multi-byte CH_UNIX character
 391   in the string
 392
 393   also return the number of bytes consumed (which tells the caller
 394   how many bytes to skip to get to the next CH_UNIX character)
 395
 396   return INVALID_CODEPOINT if the next character cannot be converted
 397 */
 398 _PUBLIC_ codepoint_t next_codepoint_handle(struct smb_iconv_handle *ic,
 399                                            const char *str, size_t *size)
 400 {
 401         /*
 402          * We assume that no multi-byte character can take more than 5 bytes
 403          * thus avoiding walking all the way down a long string. This is OK as
 404          * Unicode codepoints only go up to (U+10ffff), which can always be
 405          * encoded in 4 bytes or less.
 406          */
 407         return next_codepoint_handle_ext(ic, str, strnlen(str, 5), CH_UNIX,
 408                                          size);
 409 }
 410
 411 /*
 412   push a single codepoint into a CH_UNIX string the target string must
 413   be able to hold the full character, which is guaranteed if it is at
 414   least 5 bytes in size. The caller may pass less than 5 bytes if they
 415   are sure the character will fit (for example, you can assume that
 416   uppercase/lowercase of a character will not add more than 1 byte)
 417
 418   return the number of bytes occupied by the CH_UNIX character, or
 419   -1 on failure
 420 */
 421 _PUBLIC_ ssize_t push_codepoint_handle(struct smb_iconv_handle *ic,
 422                                 char *str, codepoint_t c)
 423 {
 424         smb_iconv_t descriptor;
 425         uint8_t buf[4];
 426         size_t ilen, olen;
 427         const char *inbuf;
 428
 429         if (c < 128) {
 430                 *str = c;
 431                 return 1;
 432         }
 433
 434         descriptor = get_conv_handle(ic,
 435                                      CH_UTF16, CH_UNIX);
 436         if (descriptor == (smb_iconv_t)-1) {
 437                 return -1;
 438         }
 439
 440         if (c < 0x10000) {
 441                 ilen = 2;
 442                 olen = 5;
 443                 inbuf = (char *)buf;
 444                 SSVAL(buf, 0, c);
 445                 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
 446                 if (ilen != 0) {
 447                         return -1;
 448                 }
 449                 return 5 - olen;
 450         }
 451
 452         c -= 0x10000;
 453
 454         buf[0] = (c>>10) & 0xFF;
 455         buf[1] = (c>>18) | 0xd8;
 456         buf[2] = c & 0xFF;
 457         buf[3] = ((c>>8) & 0x3) | 0xdc;
 458
 459         ilen = 4;
 460         olen = 5;
 461         inbuf = (char *)buf;
 462
 463         smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
 464         if (ilen != 0) {
 465                 return -1;
 466         }
 467         return 5 - olen;
 468 }
 469
 470 _PUBLIC_ codepoint_t next_codepoint_ext(const char *str, size_t len,
 471                                         charset_t src_charset, size_t *size)
 472 {
 473         return next_codepoint_handle_ext(get_iconv_handle(), str, len,
 474                                          src_charset, size);
 475 }
 476
 477 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
 478 {
 479         return next_codepoint_handle(get_iconv_handle(), str, size);
 480 }
 481
 482 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
 483 {
 484         return push_codepoint_handle(get_iconv_handle(), str, c);
 485 }