commit 4f62e31e79cb1498c62ca45b0b9d9d72ef03a334 from: Stefan Sperling date: Mon Oct 21 13:42:04 2024 UTC use a faster way to find all blob object IDs commit - d9e70c37d9c1f727bf5c52714f64189717706d01 commit + 4f62e31e79cb1498c62ca45b0b9d9d72ef03a334 blob - faa42ebc8315cb686415f0039a27d1119f5ab6ec blob + f25600eaa4b77f39953dd90dd1e4ddecea853efa --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -1147,44 +1147,30 @@ class GitLoader(BaseGitLoader): else: raise TypeError(f"Unexpected object type: {obj_type}") - # Load the set of all blob IDs found in the pack file. + # Load the set of all blob IDs found in the pack file + # and find out which blobs are missing from the archive. + blob_hashes: List[bytes] = list() + logger.debug("Querying archive for missing packed blobs...") + num_blobs = self._blob_types.count() + for obj_pos in self._blob_types.search(1): + id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos)) + id_hash = hashutil.bytehex_to_hash(id_hex) + blob_hashes.append(id_hash) + missing_contents = set(self.storage.content_missing_per_sha1_git(blob_hashes)) + self.log.debug( + "Number of packed blobs that are missing in storage: " + f"{len(missing_contents)} of {num_blobs} packed blobs total" + ) + + # This list is potentially quite big. + del blob_hashes + root_tree_vertices = self._commit_graph.vs.select( object_type=GitObjectType.TREE - ) - blob_hashes: Set[bytes] = set() - logger.debug("Loading blob object IDs...") - i = 0 - last_p = -1 - for root_hash, tree_deps in iter_tree_deps(root_tree_vertices): - p = int(i * 100 / len(root_tree_vertices)) - if p != last_p: - logger.debug(f"Loading blob object IDs: {p}% of commits processed...") - last_p = p - i = i + 1 - try: - dep_hashes = get_recursive_tree_dep_hashes( - root_hash, obj_type=GitObjectType.BLOB - ) - blob_hashes.update(dep_hashes) - except KeyError: - continue - - p = int(i * 100 / len(root_tree_vertices)) - if p != last_p: - logger.debug(f"Loading blob object IDs: {p}% of commits processed.") - - # Find out which blobs are missing from the archive. - missing_contents = set( - self.storage.content_missing_per_sha1_git(list(blob_hashes)) - ) - self.log.debug( - "Number of packed blobs that are missing in storage: " - f"{len(missing_contents)} of {len(blob_hashes)} packed blobs total" ) - missing_directories: Set[bytes] = set() tree_hashes: Set[bytes] = set() - if len(missing_contents) == len(blob_hashes): + if len(missing_contents) == num_blobs: # If all blobs are missing then all trees are missing, too. for root_hash, tree_deps in iter_tree_deps(root_tree_vertices): missing_directories.add(hashutil.hash_to_bytes(root_hash))