util/rfc1738: simplify and fix rfc1738_escape_part()
authorDouglas Bagnall <douglas.bagnall@catalyst.net.nz>
Tue, 20 Feb 2018 10:56:11 +0000 (23:56 +1300)
committerDouglas Bagnall <dbagnall@samba.org>
Thu, 22 Feb 2018 00:04:18 +0000 (01:04 +0100)
We now encode according to RFC 3986 (section 2.1 - 2.3).

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Reviewed-by: Andrew Bartlett <abartlet@samba.org>
lib/util/rfc1738.c
selftest/knownfail.d/rfc1738 [deleted file]

index 4b376ac2c2530f3cde7f4871139d7f2a6b6a1299..3c806de92b26f278b53120205a5a7e4f7e7ee16d 100644 (file)
 #include "lib/util/samba_util.h"
 #include "lib/util/util_str_hex.h"
 
+#define RFC1738_ENCODE 1
+#define RFC1738_RESERVED 2
+
 /*
- *  RFC 1738 defines that these characters should be escaped, as well
- *  any non-US-ASCII character or anything between 0x00 - 0x1F.
+ * According to RFC 1738, "$-_.+!*'()," are not reserved or unsafe, but as
+ * that has been obsolete since 2004, we aim instead for RFC 3986, where:
+ *
+ *  reserved =    : / ? # [ ] @ ! $ & ' ( ) * + , ; =
+ *  unreserved = ALPHA DIGIT - . _ ~
+ *
+ * and whatever is not in either of those is what RFC 1738 called "unsafe",
+ * meaning that they are canonically but not mandatorily escaped.
+ *
+ * Characters below 0x20 or above 0x7E are always encoded.
  */
-static char rfc1738_unsafe_chars[] = {
-    (char) 0x3C,               /* < */
-    (char) 0x3E,               /* > */
-    (char) 0x22,               /* " */
-    (char) 0x23,               /* # */
-#if 0                          /* done in code */
-    (char) 0x25,               /* % */
-#endif
-    (char) 0x7B,               /* { */
-    (char) 0x7D,               /* } */
-    (char) 0x7C,               /* | */
-    (char) 0x5C,               /* \ */
-    (char) 0x5E,               /* ^ */
-    (char) 0x7E,               /* ~ */
-    (char) 0x5B,               /* [ */
-    (char) 0x5D,               /* ] */
-    (char) 0x60,               /* ` */
-    (char) 0x27,               /* ' */
-    (char) 0x20                        /* space */
-};
 
-static char rfc1738_reserved_chars[] = {
-    (char) 0x3b,               /* ; */
-    (char) 0x2f,               /* / */
-    (char) 0x3f,               /* ? */
-    (char) 0x3a,               /* : */
-    (char) 0x40,               /* @ */
-    (char) 0x3d,               /* = */
-    (char) 0x26                        /* & */
+static const unsigned char escapees[127] = {
+       [' '] = RFC1738_ENCODE,
+       ['"'] = RFC1738_ENCODE,
+       ['%'] = RFC1738_ENCODE,
+       ['<'] = RFC1738_ENCODE,
+       ['>'] = RFC1738_ENCODE,
+       ['\\'] = RFC1738_ENCODE,
+       ['^'] = RFC1738_ENCODE,
+       ['`'] = RFC1738_ENCODE,
+       ['{'] = RFC1738_ENCODE,
+       ['|'] = RFC1738_ENCODE,
+       ['}'] = RFC1738_ENCODE,
+       /* reserved : / ? # [ ] @ ! $ & ' ( ) * + , ; = */
+       [':'] = RFC1738_RESERVED,
+       ['/'] = RFC1738_RESERVED,
+       ['?'] = RFC1738_RESERVED,
+       ['#'] = RFC1738_RESERVED,
+       ['['] = RFC1738_RESERVED,
+       [']'] = RFC1738_RESERVED,
+       ['@'] = RFC1738_RESERVED,
+       ['!'] = RFC1738_RESERVED,
+       ['$'] = RFC1738_RESERVED,
+       ['&'] = RFC1738_RESERVED,
+       ['\''] = RFC1738_RESERVED,
+       ['('] = RFC1738_RESERVED,
+       [')'] = RFC1738_RESERVED,
+       ['*'] = RFC1738_RESERVED,
+       ['+'] = RFC1738_RESERVED,
+       [','] = RFC1738_RESERVED,
+       [';'] = RFC1738_RESERVED,
+       ['='] = RFC1738_RESERVED,
 };
 
 /*
- *  rfc1738_escape - Returns a static buffer contains the RFC 1738
- *  compliant, escaped version of the given url.
+ *  rfc1738_do_escape - fills a preallocated buffer with an escaped version of
+ *  the given string.
  *
+ *  For canonical escaping, mask should be RFC1738_ENCODE | RFC1738_RESERVED.
+ *  For mandatory escaping, mask should be RFC1738_RESERVED.
  */
 static char *
-rfc1738_do_escape(TALLOC_CTX *mem_ctx, const char *url, int encode_reserved)
+rfc1738_do_escape(char *buf, size_t bufsize,
+                 const char *url, size_t len, unsigned char mask)
 {
-    size_t bufsize = 0;
-    const char *p;
-    char *buf;
-    char *q;
-    unsigned int i, do_escape;
-
-    bufsize = strlen(url) * 3 + 1;
-    buf = talloc_array(mem_ctx, char, bufsize);
-    if (!buf) {
-           return NULL;
-    }
-
-    talloc_set_name_const(buf, buf);
-    buf[0] = '\0';
-
-    for (p = url, q = buf; *p != '\0' && q < (buf + bufsize - 1); p++, q++) {
-        do_escape = 0;
-
-        /* RFC 1738 defines these chars as unsafe */
-        for (i = 0; i < sizeof(rfc1738_unsafe_chars); i++) {
-            if (*p == rfc1738_unsafe_chars[i]) {
-                do_escape = 1;
-                break;
-            }
-        }
-        /* Handle % separately */
-        if (encode_reserved >= 0 && *p == '%')
-            do_escape = 1;
-        /* RFC 1738 defines these chars as reserved */
-        for (i = 0; i < sizeof(rfc1738_reserved_chars) && encode_reserved > 0; i++) {
-            if (*p == rfc1738_reserved_chars[i]) {
-                do_escape = 1;
-                break;
-            }
-        }
-        /* RFC 1738 says any control chars (0x00-0x1F) are encoded */
-        if ((unsigned char) *p <= (unsigned char) 0x1F) {
-            do_escape = 1;
-        }
-        /* RFC 1738 says 0x7f is encoded */
-        if (*p == (char) 0x7F) {
-            do_escape = 1;
-        }
-        /* RFC 1738 says any non-US-ASCII are encoded */
-        if (((unsigned char) *p >= (unsigned char) 0x80)) {
-            do_escape = 1;
-        }
-        /* Do the triplet encoding, or just copy the char */
-        /* note: while we do not need snprintf here as q is appropriately
-         * allocated, Samba does to avoid our macro banning it -- abartlet */
-
-        if (do_escape == 1) {
-               (void) snprintf(q, 4, "%%%02X", (unsigned char) *p);
-            q += sizeof(char) * 2;
-        } else {
-            *q = *p;
-        }
-    }
-    *q = '\0';
-    return (buf);
+       size_t i;
+       size_t j = 0;
+       for (i = 0; i < len; i++) {
+               unsigned int c = (unsigned char) url[i];
+               if (c > 126 || c < 32 || (escapees[c] & mask)) {
+                       if (j + 3 >= bufsize) {
+                               return NULL;
+                       }
+                       (void) snprintf(&buf[j], 4, "%%%02X", c);
+                       j += 3;
+               } else {
+                       if (j + 1 >= bufsize) {
+                               return NULL;
+                       }
+                       buf[j] = c;
+                       j++;
+               }
+       }
+       buf[j] = '\0';
+       return buf;
 }
 
 /*
- * rfc1738_escape_part - Returns a buffer that contains the RFC
- * 1738 compliant, escaped version of the given url segment. (escapes
- * unsafe, reserved and % chars) It would mangle the :// in http://,
- * and mangle paths (because of /).
+ * rfc1738_escape_part - Returns a talloced buffer that contains the RFC 3986
+ * compliant, escaped version of the given url segment.
  */
 char *
 rfc1738_escape_part(TALLOC_CTX *mem_ctx, const char *url)
 {
-       return rfc1738_do_escape(mem_ctx, url, 1);
+       size_t bufsize = 0;
+       char *buf = NULL;
+
+       size_t len = strlen(url);
+       if (len >= SIZE_MAX / 3) {
+               return NULL;
+       }
+
+       bufsize = len * 3 + 1;
+       buf = talloc_array(mem_ctx, char, bufsize);
+       if (buf == NULL) {
+               return NULL;
+       }
+
+       talloc_set_name_const(buf, buf);
+
+       return rfc1738_do_escape(buf, bufsize, url, len,
+                                RFC1738_ENCODE | RFC1738_RESERVED);
 }
 
 /*
diff --git a/selftest/knownfail.d/rfc1738 b/selftest/knownfail.d/rfc1738
deleted file mode 100644 (file)
index 3f5497e..0000000
+++ /dev/null
@@ -1 +0,0 @@
-^samba.unittests.rfc1738.test_escape