commit cbe1c11028d08815f6840cc4b5fdaeafca82e9ca from: Stefan Sperling date: Fri Sep 13 16:05:23 2024 UTC improve performance and debug log output commit - 91b0caf89ffbcd64c670ddc387741fe9e8901b5e commit + cbe1c11028d08815f6840cc4b5fdaeafca82e9ca blob - ffda1054c499b061edb13fc90d47c3550e64e74b blob + 034ae31642dea4c2774d19b1ecd4b3546d3967a8 --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -724,6 +724,7 @@ class GitLoader(BaseGitLoader): commits = {} commit_edges = [] for ref_name, ref_object_hex in self.remote_refs.items(): + logger.debug(f"Processing ref {ref_name}: {ref_object_hex}") if utils.ignore_branch_name(ref_name): continue try: @@ -751,9 +752,11 @@ class GitLoader(BaseGitLoader): # TODO: Allow tags pointing at blobs or trees? if obj is None or obj.type_name != b"commit": + logger.debug(f"{ref_object_hex} is a {obj.type_name}, not a commit") continue parents = [ref_object_hex] + while len(parents) > 0: commit_hex = parents.pop(0) try: @@ -761,14 +764,18 @@ class GitLoader(BaseGitLoader): except KeyError: logger.debug(f" pack is missing: {commit_hex}") continue - tree_hash = hashutil.bytehex_to_hash(commit.tree) commit_hash = hashutil.bytehex_to_hash(commit_hex) + if commit_hash in commits.keys(): + continue + tree_hash = hashutil.bytehex_to_hash(commit.tree) commits[commit_hash] = tree_hash for parent_hex in commit.parents: if parent_hex in self.pack: - parents.append(parent_hex) + if parent_hex not in parents: + parents.append(parent_hex) parent_hash = hashutil.bytehex_to_hash(parent_hex) - commit_edges.append((commit_hash, parent_hash)) + if (commit_hash, parent_hash) not in commit_edges: + commit_edges.append((commit_hash, parent_hash)) def add_vertices(new_vertices, object_type): attributes = dict() @@ -790,10 +797,6 @@ class GitLoader(BaseGitLoader): traversed_trees = set() submodule_mode = stat.S_IFDIR | stat.S_IFLNK for commit_hash, tree_hash in commits.items(): - logger.debug( - f"commit {hashutil.hash_to_hex(commit_hash)} " - f"tree {hashutil.hash_to_hex(tree_hash)}" - ) subtrees = [tree_hash] while len(subtrees) > 0: tree_hash = subtrees.pop(0) @@ -806,9 +809,7 @@ class GitLoader(BaseGitLoader): except KeyError: logger.debug(f" pack is missing: {tree_hex}") continue - logger.debug(f"Entries of {tree}:") for name, mode, entry_hex in tree.iteritems(): - logger.debug(f" {name} {mode} {entry_hex}") if mode & submodule_mode == submodule_mode: continue # ignore submodules entry_hash = hashutil.bytehex_to_hash(entry_hex) @@ -821,7 +822,6 @@ class GitLoader(BaseGitLoader): try: tree = self.pack[entry_hex] subtrees.append(entry_hash) - logger.debug(f" present in pack: {entry_hex}") except KeyError: logger.debug(f" pack is missing: {entry_hex}") # add new vertices and edges in batches for performance reasons @@ -841,18 +841,7 @@ class GitLoader(BaseGitLoader): add_vertices(new_blobs, GitObjectType.BLOB) self._object_graph.add_edges(new_edges) - for v in self._object_graph.vs: - name = hashutil.hash_to_bytehex(v["name"]) - successors = [] - vertices = [v] - object_type = v["object_type"] - while len(vertices) > 0: - v = vertices.pop(0) - for s in self._object_graph.successors(v): - sname = self._object_graph.vs["name"][s] - successors.append(hashutil.hash_to_bytehex(sname)) - vertices.append(s) - logger.debug(f"{object_type} {name} depends on {successors}") + logger.info("Commit graph has been built") def save_data(self) -> None: """Store a pack for archival""" @@ -914,12 +903,13 @@ class GitLoader(BaseGitLoader): "fetched pack file nor in local heads nor in the archive" ) - def get_dependencies(object_hash): - vertices = [self._object_graph.vs.find(name=object_hash)] + def get_dependencies(v): + vertices = [v] while len(vertices) > 0: v = vertices.pop(0) for s in self._object_graph.successors(v): - vertices.append(s) + if s not in vertices: + vertices.append(s) yield ( self._object_graph.vs[s]["name"], self._object_graph.vs[s]["object_type"], @@ -941,24 +931,38 @@ class GitLoader(BaseGitLoader): self.log.debug( "Number of packed blobs that are missing in storage: " - f"{len(missing_contents)}" + f"{len(missing_contents)} of {len(blob_vertices['name'])} packed blobs total" ) tree_hashes: List[bytes] = [] - if len(missing_contents) > 0: - missing_directories = set() + if len(missing_contents) == len(blob_vertices["name"]): + # If all blobs are missing then all trees are missing, too. + tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices] + missing_directories = set(tree_hashes) self.log.debug( - "Searching for packed trees which depend on missing packed blobs" + "Number of packed trees considered missing by implication: " + f"{len(missing_directories)}" ) + elif len(missing_contents) > 0: + missing_directories = set() + self.log.debug( + f"Searching {len(tree_vertices)} packed trees for trees which " + "depend on missing packed blobs" + ) + nsearched = 0 for t in tree_vertices: tree_hash = t["name"] + nsearched += 1 + if (nsearched % 5000) == 0: + self.log.debug( + f"Searched {int((nsearched * 100) / len(tree_vertices))}% " + f"of {len(tree_vertices)} packed trees..." + ) + if tree_hash in missing_directories: + continue have_dep = False - for dep_hash, dep_type in get_dependencies(tree_hash): + for dep_hash, dep_type in get_dependencies(t): have_dep = True if dep_type == GitObjectType.BLOB and dep_hash in missing_contents: - self.log.debug( - f"tree {hashutil.hash_to_bytehex(tree_hash)!r} depends on " - f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}" - ) # We can infer that the tree is also missing. missing_directories.add(tree_hash) break @@ -977,6 +981,10 @@ class GitLoader(BaseGitLoader): else: tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices] missing_directories = set(self.storage.directory_missing(tree_hashes)) + self.log.debug( + "Number of missing trees according to archive query: " + f"{len(missing_directories)}" + ) try: commit_vertices = self._object_graph.vs.select( @@ -989,14 +997,19 @@ class GitLoader(BaseGitLoader): self.log.debug( "Searching for packed commits which depend on missing packed blobs or trees" ) + nsearched = 0 for c in commit_vertices: commit_hash = c["name"] - for dep_hash, dep_type in get_dependencies(commit_hash): + nsearched += 1 + if (nsearched % 5000) == 0: + self.log.debug( + f"Searched {int((nsearched * 100) / len(commit_vertices))}% " + f"of {len(commit_vertices)} packed commits..." + ) + if commit_hash in missing_revisions: + continue + for dep_hash, dep_type in get_dependencies(c): if dep_hash in missing_contents or dep_hash in missing_directories: - self.log.debug( - f"commit {hashutil.hash_to_bytehex(commit_hash)!r} depends on " - f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}" - ) # We can infer that the commit is also missing. missing_revisions.add(commit_hash) break @@ -1035,16 +1048,12 @@ class GitLoader(BaseGitLoader): ) for t in tag_vertices: tag_hash = t["name"] - for dep_hash, dep_type in get_dependencies(tag_hash): + for dep_hash, dep_type in get_dependencies(t): if ( dep_hash in missing_revisions or dep_hash in missing_directories or dep_hash in missing_contents ): - self.log.debug( - f"tag {hashutil.hash_to_bytehex(tag_hash)!r} depends on " - f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}" - ) # We can infer that the tag is also missing. missing_releases.add(tag_hash) break