From 44ca84166e5a909f1acd77323a444ff2e14a8db8 Mon Sep 17 00:00:00 2001 From: Tim Beale Date: Fri, 11 Aug 2017 13:53:31 +1200 Subject: [PATCH] replmd: Allow missing targets if GET_TGT has already been set While running the selftests, I noticed a case where DC replication unexpectedly sends a linked attribute for a deleted object (created in the drs.ridalloc_exop tests). The problem is due to the msDS-NC-Replica-Locations attribute, which is a (known) one-way link. Because it is a one-way link, when the test demotes the DC and deletes the link target, there is no backlink to delete the link from the source object. After much debate and head-scratching, we decided that there wasn't an ideal way to resolve this problem. Any automated intervention could potentially do the wrong thing, especially if the link spans partitions. Running dbcheck will find this problem and is able to fix it (providing the deleted object is still a tombstone). So the recommendation is to run dbcheck on your DCs every 6 months (or more frequently if using a lower tombstone lifetime setting). However, it does highlight a problem with the current GET_TGT implementation. If the tombstone object had been expunged and you upgraded to 4.8, then you would be stuck - replication would fail because the target object can't be resolved, even with GET_TGT, and dbcheck would not be able to fix the hanging link. The solution is to not fail the replication for an unknown target if GET_TGT has already been set (i.e. the dsdb_repl_flags contains DSDB_REPL_FLAG_TARGETS_UPTODATE). It's debatable whether we should add a hanging link in this case or ignore/drop the link. Some cases to consider: - If you're talking to a DC that still sends all the links last, you could still get object deletion between processing the source object's links and sending the target (GET_TGT just restarts the replication cycle from scratch). Adding a hanging link in this case would be incorrect and would add spurious information to the DB. - Suppose there's a bug in Samba that incorrectly results in an object disappearing. If other DCs then remove any links that pointed to that object, it makes recovering from the problem harder. However, simply ignoring the link shouldn't result in data loss, i.e. replication won't remove the existing link information from other DCs. Data loss in this case would only occur if a new DC were brought online, or if it were a new link that was affected. Based on this, I think ignoring the link does the least harm. This problem also highlights that we should really be using the same logic in both the unknown target and the deleted target cases. Combining the logic and moving it into a common replmd_allow_missing_target() function fixes the problem. (This also has the side-effect of fixing another logic flaw - in the deleted object case we would unnecessarily retry with GET_TGT if the target object was in another partition. This is pointless work, because GET_TGT won't resolve the target). Signed-off-by: Tim Beale Reviewed-by: Andrew Bartlett BUG: https://bugzilla.samba.org/show_bug.cgi?id=12972 Reviewed-by: Garming Sam --- .../dsdb/samdb/ldb_modules/repl_meta_data.c | 156 +++++++++++------- source4/torture/drs/python/getncchanges.py | 24 ++- 2 files changed, 121 insertions(+), 59 deletions(-) diff --git a/source4/dsdb/samdb/ldb_modules/repl_meta_data.c b/source4/dsdb/samdb/ldb_modules/repl_meta_data.c index 68268ef5ee45..1e78045e7238 100644 --- a/source4/dsdb/samdb/ldb_modules/repl_meta_data.c +++ b/source4/dsdb/samdb/ldb_modules/repl_meta_data.c @@ -6725,6 +6725,89 @@ static int replmd_extended_replicated_objects(struct ldb_module *module, struct return replmd_replicated_apply_next(ar); } +/** + * Checks how to handle an missing target - either we need to fail the + * replication and retry with GET_TGT, ignore the link and continue, or try to + * add a partial link to an unknown target. + */ +static int replmd_allow_missing_target(struct ldb_module *module, + TALLOC_CTX *mem_ctx, + struct ldb_dn *target_dn, + struct ldb_dn *source_dn, + struct GUID *guid, + uint32_t dsdb_repl_flags, + bool *ignore_link, + const char * missing_str) +{ + struct ldb_context *ldb = ldb_module_get_ctx(module); + bool is_in_same_nc; + + /* + * we may not be able to resolve link targets properly when + * dealing with subsets of objects, e.g. the source is a + * critical object and the target isn't + * + * TODO: + * When we implement Trusted Domains we need to consider + * whether they get treated as an incomplete replica here or not + */ + if (dsdb_repl_flags & DSDB_REPL_FLAG_OBJECT_SUBSET) { + + /* + * Ignore the link. We don't increase the highwater-mark in + * the object subset cases, so subsequent replications should + * resolve any missing links + */ + DEBUG(2, ("%s target %s linked from %s\n", missing_str, + ldb_dn_get_linearized(target_dn), + ldb_dn_get_linearized(source_dn))); + *ignore_link = true; + return LDB_SUCCESS; + } + + if (dsdb_repl_flags & DSDB_REPL_FLAG_TARGETS_UPTODATE) { + + /* + * target should already be up-to-date so there's no point in + * retrying. This could be due to bad timing, or if a target + * on a one-way link was deleted. We ignore the link rather + * than failing the replication cycle completely + */ + *ignore_link = true; + DBG_WARNING("%s is %s but up to date. Ignoring link from %s\n", + ldb_dn_get_linearized(target_dn), missing_str, + ldb_dn_get_linearized(source_dn)); + return LDB_SUCCESS; + } + + is_in_same_nc = dsdb_objects_have_same_nc(ldb, + mem_ctx, + source_dn, + target_dn); + if (is_in_same_nc) { + /* fail the replication and retry with GET_TGT */ + ldb_asprintf_errstring(ldb, "%s target %s GUID %s linked from %s\n", + missing_str, + ldb_dn_get_linearized(target_dn), + GUID_string(mem_ctx, guid), + ldb_dn_get_linearized(source_dn)); + return LDB_ERR_NO_SUCH_OBJECT; + } + + /* + * The target of the cross-partition link is missing. Continue + * and try to at least add the forward-link. This isn't great, + * but a partial link can be fixed by dbcheck, so it's better + * than dropping the link completely. + */ + DBG_WARNING("%s cross-partition target %s linked from %s\n", + missing_str, ldb_dn_get_linearized(target_dn), + ldb_dn_get_linearized(source_dn)); + *ignore_link = false; + + return LDB_SUCCESS; +} + /** * Checks that the target object for a linked attribute exists. * @param guid returns the target object's GUID (is returned)if it exists) @@ -6807,46 +6890,14 @@ static int replmd_check_target_exists(struct ldb_module *module, if (target_res->count == 0) { /* - * we may not be able to resolve link targets properly when - * dealing with subsets of objects, e.g. the source is a - * critical object and the target isn't - * - * TODO: - * When we implement Trusted Domains we need to consider - * whether they get treated as an incomplete replica here or not + * target object is unknown. Check whether to ignore the link, + * fail the replication, or add a partial link */ - if (la_entry->dsdb_repl_flags & DSDB_REPL_FLAG_OBJECT_SUBSET) { - - /* - * We don't increase the highwater-mark in the object - * subset cases, so subsequent replications should - * resolve any missing links - */ - DEBUG(2,(__location__ - ": Failed to find target %s linked from %s\n", - ldb_dn_get_linearized(dsdb_dn->dn), - ldb_dn_get_linearized(source_dn))); - *ignore_link = true; - - } else if (dsdb_objects_have_same_nc(ldb, tmp_ctx, source_dn, - dsdb_dn->dn)) { - ldb_asprintf_errstring(ldb, "Unknown target %s GUID %s linked from %s\n", - ldb_dn_get_linearized(dsdb_dn->dn), - GUID_string(tmp_ctx, guid), - ldb_dn_get_linearized(source_dn)); - ret = LDB_ERR_NO_SUCH_OBJECT; - } else { + ret = replmd_allow_missing_target(module, tmp_ctx, dsdb_dn->dn, + source_dn, guid, + la_entry->dsdb_repl_flags, + ignore_link, "Unknown"); - /* - * The target of the cross-partition link is missing. - * Continue and try to at least add the forward-link. - * This isn't great, but if we can add a partial link - * then it's better than nothing. - */ - DEBUG(2,("Failed to resolve cross-partition link between %s and %s\n", - ldb_dn_get_linearized(source_dn), - ldb_dn_get_linearized(dsdb_dn->dn))); - } } else if (target_res->count != 1) { ldb_asprintf_errstring(ldb, "More than one object found matching objectGUID %s\n", GUID_string(tmp_ctx, guid)); @@ -6869,26 +6920,15 @@ static int replmd_check_target_exists(struct ldb_module *module, */ if (target_deletion_state >= OBJECT_RECYCLED) { - if (la_entry->dsdb_repl_flags & DSDB_REPL_FLAG_TARGETS_UPTODATE) { - - /* - * target should already be uptodate so there's no - * point retrying - it's probably just bad timing - */ - *ignore_link = true; - DEBUG(0, ("%s is deleted but up to date. " - "Ignoring link from %s\n", - ldb_dn_get_linearized(dsdb_dn->dn), - ldb_dn_get_linearized(source_dn))); - - } else { - ldb_asprintf_errstring(ldb, - "Deleted target %s GUID %s linked from %s", - ldb_dn_get_linearized(dsdb_dn->dn), - GUID_string(tmp_ctx, guid), - ldb_dn_get_linearized(source_dn)); - ret = LDB_ERR_NO_SUCH_OBJECT; - } + /* + * target object is deleted. Check whether to ignore the + * link, fail the replication, or add a partial link + */ + ret = replmd_allow_missing_target(module, tmp_ctx, + dsdb_dn->dn, + source_dn, guid, + la_entry->dsdb_repl_flags, + ignore_link, "Deleted"); } } diff --git a/source4/torture/drs/python/getncchanges.py b/source4/torture/drs/python/getncchanges.py index 9022f4a19095..b1a8f68c2c06 100644 --- a/source4/torture/drs/python/getncchanges.py +++ b/source4/torture/drs/python/getncchanges.py @@ -1042,8 +1042,30 @@ class DrsReplicaSyncIntegrityTestCase(drs_base.DrsBaseTestCase): self.assertFalse("managedBy" in res[0], "%s in DB still has managedBy attribute" % la_source) + # Check receiving a cross-partition link to a deleted target. + # Delete the target and make sure the deletion is sync'd between DCs + target_guid = self.get_object_guid(la_target) + self.test_ldb_dc.delete(la_target) + self.sync_DCs(nc_dn=self.config_dn) + self._disable_all_repl(self.dnsname_dc2) + + # re-animate the target + self.restore_deleted_object(target_guid, la_target) + self.modify_object(la_source, "managedBy", la_target) + + # now sync the link - because the target is in another partition, the + # peer DC receives a link for a deleted target, which it should accept + self.sync_DCs() + res = self.test_ldb_dc.search(ldb.Dn(self.ldb_dc1, la_source), + attrs=["managedBy"], + controls=['extended_dn:1:0'], + scope=ldb.SCOPE_BASE) + self.assertTrue("managedBy" in res[0], "%s in DB missing managedBy attribute" + % la_source) + # cleanup the server object we created in the Configuration partition - self.test_ldb_dc.delete(la_source) + self.test_ldb_dc.delete(la_target) + self._enable_all_repl(self.dnsname_dc2) def test_repl_get_tgt_multivalued_links(self): """Tests replication with multi-valued link attributes.""" -- 2.34.1