ldb: note a transitivity problem in ldb_comparison_fold
author Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Tue, 30 Apr 2024 00:41:25 +0000 (12:41 +1200)
committer Andrew Bartlett <abartlet@samba.org>
Tue, 7 May 2024 23:25:35 +0000 (23:25 +0000)
Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Reviewed-by: Andrew Bartlett <abartlet@samba.org>
lib/ldb/common/attrib_handlers.c

index 2344dc0c8afc8f890e126bafcd8b977b61229c40..e6d412bd3cfb8d0be50145d6ee1215fb47f22c35 100644 (file)
@@ -389,6 +389,9 @@ utf8str:
         * No need to recheck from the start, just from the first utf8 char
         * found. Note that the callback of ldb_casefold() needs to be ascii
         * compatible.
+        *
+        * Probably ldb_casefold() is wrap_casefold() which wraps
+        * strupper_talloc_n().
         */
        b1 = ldb_casefold(ldb, mem_ctx, s1, n1);
        b2 = ldb_casefold(ldb, mem_ctx, s2, n2);
@@ -397,6 +400,28 @@ utf8str:
                /*
                 * One of the strings was not UTF8, so we have no
                 * options but to do a binary compare.
+                *
+                * FIXME: this can be non-transitive.
+                *
+                * consider {
+                *           CA 8A  "ʊ"
+                *           C6 B1  "Ʊ"
+                *           C8 FE  invalid utf-8
+                *          }
+                *
+                * The byte "0xfe" is always invalid in utf-8, so the
+                * comparisons against that string end up coming this way,
+                * while the "Ʊ" vs "ʊ" comparison goes via the ldb_casefold
+                * branch. Then:
+                *
+                *  "ʊ" == "Ʊ"     by casefold.
+                *  "ʊ" > {c8 fe}  by byte comparison.
+                *  "Ʊ" < {c8 fe}  by byte comparison.
+                *
+                * In many cases there are no invalid encodings between the
+                * upper and lower case letters, but the string as a whole
+                * might also compare differently due to the space-eating in
+                * the other branch.
                 */
                talloc_free(b1);
                talloc_free(b2);