commit 4ff67682be16c48c098e0412aee07cabd74a1f27 from: Stefan Sperling date: Wed Sep 18 15:50:11 2024 UTC detect objects missing from pack file that should be present commit - 387c45f4c7dcbeea29112789e8c87b5bd973ce37 commit + 4ff67682be16c48c098e0412aee07cabd74a1f27 blob - 83e9e758949ae582b359469d41573bd1b3f44205 blob + ab9b86ab3ca21b10c0676ff6d55f269df436634e --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -723,12 +723,14 @@ class GitLoader(BaseGitLoader): tags = {} commits = {} commit_edges = [] + missing_objects = [] for ref_name, ref_object_hex in self.remote_refs.items(): if utils.ignore_branch_name(ref_name): continue try: obj = self.pack[ref_object_hex] except KeyError: + missing_objects.append(ref_object_hex) continue while obj.type_name == b"tag": @@ -739,7 +741,7 @@ class GitLoader(BaseGitLoader): try: tagged_obj = self.pack[tagged_object_hex] except KeyError: - logger.debug(f" pack is missing: {tagged_object_hex}") + missing_objects.append(tagged_object_hex) obj = None break else: @@ -748,9 +750,11 @@ class GitLoader(BaseGitLoader): obj = tagged_obj ref_object_hex = tagged_object_hex - # TODO: Allow tags pointing at blobs or trees? if obj is None: + # Object is missing from pack file. continue + + # TODO: Allow tags pointing at blobs or trees? if obj.type_name != b"commit": logger.debug( f" tag {ref_name} resolves to a {obj.type_name}, not a commit" @@ -766,7 +770,6 @@ class GitLoader(BaseGitLoader): try: commit = self.pack[commit_hex] except KeyError: - logger.debug(f" pack is missing: {commit_hex}") continue commit_hash = hashutil.bytehex_to_hash(commit_hex) if commit_hash in commits.keys(): @@ -781,6 +784,15 @@ class GitLoader(BaseGitLoader): if (commit_hash, parent_hash) not in commit_edges: commit_edges.append((commit_hash, parent_hash)) + archived_missing_objects = set( + self.storage.object_find_by_sha1_git(missing_objects).keys() + ) + if len(archived_missing_objects) != len(missing_objects): + raise NotFound( + "Referenced objects found in neither the fetched pack file " + f"nor in the archive: {missing_objects - archived_missing_objects}" + ) + def add_vertices(new_vertices, object_type): attributes = dict() attributes["object_type"] = [object_type for x in new_vertices] @@ -811,7 +823,6 @@ class GitLoader(BaseGitLoader): try: tree = self.pack[tree_hex] except KeyError: - logger.debug(f" pack is missing: {tree_hex}") continue for name, mode, entry_hex in tree.iteritems(): if mode & submodule_mode == submodule_mode: @@ -827,7 +838,7 @@ class GitLoader(BaseGitLoader): tree = self.pack[entry_hex] subtrees.append(entry_hash) except KeyError: - logger.debug(f" pack is missing: {entry_hex}") + pass # add new vertices and edges in batches for performance reasons if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 100000: if len(new_trees) > 0: