commit 4c3333066c370989f34d96881f559e3eee03391d from: Stefan Sperling date: Mon Oct 21 14:57:22 2024 UTC list missing trees more efficiently commit - fc5f2bb353e14a8a06d1cd6bda751d2e02ac8915 commit + 4c3333066c370989f34d96881f559e3eee03391d blob - 30c91308f9baf38141fb6b5d5cb280aab835b020 blob + 60b4e974f34091a8c67b4b796003d97e06477f43 --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -1124,12 +1124,6 @@ class GitLoader(BaseGitLoader): if graph.vs[s]["object_type"] != object_type: continue yield (graph.vs[s]["name"], graph.vs[s]["object_type"]) - - def iter_tree_deps(root_tree_vertices): - for t in root_tree_vertices: - tree_hash = t["name"] - tree_deps = get_tree_deps(tree_hash) - yield (tree_hash, tree_deps) def get_recursive_tree_dep_hashes(tree_hash, obj_type=None): recursive_deps = get_recursive_tree_deps(tree_hash) @@ -1165,22 +1159,13 @@ class GitLoader(BaseGitLoader): # This list is potentially quite big. del blob_hashes - root_tree_vertices = self._commit_graph.vs.select( - object_type=GitObjectType.TREE - ) missing_directories: Set[bytes] = set() - tree_hashes: Set[bytes] = set() if len(missing_contents) == num_blobs: # If all blobs are missing then all trees are missing, too. - for root_hash, tree_deps in iter_tree_deps(root_tree_vertices): - missing_directories.add(hashutil.hash_to_bytes(root_hash)) - try: - dep_hashes = get_recursive_tree_dep_hashes( - root_hash, obj_type=GitObjectType.TREE - ) - missing_directories.update(map(hashutil.hash_to_bytes, dep_hashes)) - except KeyError: - continue + for obj_pos in self._tree_types.search(1): + id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos)) + id_hash = hashutil.bytehex_to_hash(id_hex) + missing_directories.add(hashutil.hash_to_bytes(id_hash)) self.log.debug( "Number of packed trees considered missing by implication: " f"{len(missing_directories)}" @@ -1190,55 +1175,30 @@ class GitLoader(BaseGitLoader): self.log.debug( "Searching for packed trees which are missing from the archive..." ) - i = 0 - last_p = -1 - num_commits_total = len(root_tree_vertices) - for root_hash, tree_deps in iter_tree_deps(root_tree_vertices): - # Show some debug progress output for very large datasets - p = int(i * 100 / num_commits_total) - if p != last_p: - logger.debug( - "Searching for packed trees which are missing from the archive: " - f"{p}% of commits processed..." + for obj_pos in self._tree_types.search(1): + tree_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos)) + tree_hash = hashutil.bytehex_to_hash(tree_hex) + is_missing = False + tree_deps = get_tree_deps(tree_hash) + for dep_pos in tree_deps.search(1): + dep_hex = sha_to_hex(self.pack.index._unpack_name(dep_pos)) + dep_hash = hashutil.bytehex_to_hash(dep_hex) + if dep_hash in missing_contents: + # Content this tree depends on is missing. + # We can infer that the tree itself is missing. + missing_directories.add(hashutil.hash_to_bytes(tree_hash)) + is_missing = True + break + if not is_missing: + # This tree is either empty or has no known-missing dependencies. + # Determine if it is missing by searching the archive. + missing_tree = set( + self.storage.directory_missing( + [hashutil.hash_to_bytes(tree_hash)] + ) ) - last_p = p - i = i + 1 - subtrees = [root_hash] - while len(subtrees) > 0: - tree_hash = subtrees.pop(0) - if hashutil.hash_to_bytes(tree_hash) in missing_directories: - continue - is_missing = False - for dep_hash, dep_type in get_successors(tree_hash): - if dep_type == GitObjectType.TREE: - if dep_hash not in subtrees: - subtrees.append(dep_hash) - elif ( - not is_missing - and dep_type == GitObjectType.BLOB - and dep_hash in missing_contents - ): - # Content this tree depends on is missing. - # We can infer that the tree itself is missing. - missing_directories.add(hashutil.hash_to_bytes(tree_hash)) - is_missing = True - if not is_missing: - # This tree is either empty or has no known-missing dependencies. - # Determine if it is missing by searching the archive. - missing_tree = set( - self.storage.directory_missing( - [hashutil.hash_to_bytes(tree_hash)] - ) - ) - if len(missing_tree): - missing_directories.add(hashutil.hash_to_bytes(tree_hash)) - p = int(i * 100 / num_commits_total) - if p != last_p: - logger.debug( - "Searching for packed trees which are missing from the archive: " - f"{p}% of commits processed." - ) - last_p = p + if len(missing_tree): + missing_directories.add(hashutil.hash_to_bytes(tree_hash)) self.log.debug( f"Number of packed trees found missing: {len(missing_directories)}" ) @@ -1246,16 +1206,12 @@ class GitLoader(BaseGitLoader): # All blobs are present. Any of these blobs might have been added # to the archive via imports of other origins. We must query the # archive for missing trees specific to the origin we are importing. - for root_hash, tree_deps in iter_tree_deps(root_tree_vertices): - tree_hashes.add(root_hash) - try: - dep_hashes = get_recursive_tree_dep_hashes( - root_hash, obj_type=GitObjectType.TREE - ) - tree_hashes.add(dep_hashes) - except KeyError: - continue - missing_directories = set(self.storage.directory_missing(list(tree_hashes))) + tree_hashes: List[bytes] = list() + for obj_pos in self._tree_types.search(1): + tree_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos)) + tree_hash = hashutil.bytehex_to_hash(tree_hex) + tree_hashes.append(tree_hash) + missing_directories = set(self.storage.directory_missing(tree_hashes)) self.log.debug( "Number of missing trees according to archive query: " f"{len(missing_directories)}"