commit dc6d4f4cddb766959cc8d773a86d6e1cebc53265 from: Stefan Sperling date: Tue Sep 17 13:28:19 2024 UTC fix commit processing commit - cbe1c11028d08815f6840cc4b5fdaeafca82e9ca commit + dc6d4f4cddb766959cc8d773a86d6e1cebc53265 blob - 034ae31642dea4c2774d19b1ecd4b3546d3967a8 blob + dd1d00020b1d912022cc7442c498611140c3e375 --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -903,11 +903,14 @@ class GitLoader(BaseGitLoader): "fetched pack file nor in local heads nor in the archive" ) - def get_dependencies(v): + def get_successors(v, object_type=None): vertices = [v] while len(vertices) > 0: v = vertices.pop(0) for s in self._object_graph.successors(v): + if object_type is not None: + if self._object_graph.vs[s]["object_type"] != object_type: + continue if s not in vertices: vertices.append(s) yield ( @@ -915,6 +918,16 @@ class GitLoader(BaseGitLoader): self._object_graph.vs[s]["object_type"], ) + def get_neighbors(v, object_type=None): + for s in self._object_graph.neighbors(v): + if object_type is not None: + if self._object_graph.vs[s]["object_type"] != object_type: + continue + yield ( + self._object_graph.vs[s]["name"], + self._object_graph.vs[s]["object_type"], + ) + try: blob_vertices = self._object_graph.vs.select(object_type=GitObjectType.BLOB) except KeyError: @@ -960,7 +973,7 @@ class GitLoader(BaseGitLoader): if tree_hash in missing_directories: continue have_dep = False - for dep_hash, dep_type in get_dependencies(t): + for dep_hash, dep_type in get_successors(t): have_dep = True if dep_type == GitObjectType.BLOB and dep_hash in missing_contents: # We can infer that the tree is also missing. @@ -975,6 +988,10 @@ class GitLoader(BaseGitLoader): if len(missing_empty_tree): missing_directories.add(tree_hash) self.log.debug( + f"Searched {int((nsearched * 100) / len(tree_vertices))}% " + f"of {len(tree_vertices)} packed trees..." + ) + self.log.debug( "Number of packed trees considered missing by implication: " f"{len(missing_directories)}" ) @@ -1008,11 +1025,15 @@ class GitLoader(BaseGitLoader): ) if commit_hash in missing_revisions: continue - for dep_hash, dep_type in get_dependencies(c): + for dep_hash, dep_type in get_neighbors(c, GitObjectType.TREE): if dep_hash in missing_contents or dep_hash in missing_directories: # We can infer that the commit is also missing. missing_revisions.add(commit_hash) break + self.log.debug( + f"Searched {int((nsearched * 100) / len(commit_vertices))}% " + f"of {len(commit_vertices)} packed commits..." + ) self.log.debug( "Number of packed commits considered missing by implication: " f"{len(missing_revisions)}" @@ -1048,7 +1069,7 @@ class GitLoader(BaseGitLoader): ) for t in tag_vertices: tag_hash = t["name"] - for dep_hash, dep_type in get_dependencies(t): + for dep_hash, dep_type in get_neighbors(t): if ( dep_hash in missing_revisions or dep_hash in missing_directories