Commit Diff


commit - d9e70c37d9c1f727bf5c52714f64189717706d01
commit + 4f62e31e79cb1498c62ca45b0b9d9d72ef03a334
blob - faa42ebc8315cb686415f0039a27d1119f5ab6ec
blob + f25600eaa4b77f39953dd90dd1e4ddecea853efa
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -1147,44 +1147,30 @@ class GitLoader(BaseGitLoader):
                 else:
                     raise TypeError(f"Unexpected object type: {obj_type}")
 
-        # Load the set of all blob IDs found in the pack file.
+        # Collect the IDs of all blobs found in the pack file
+        # and find out which blobs are missing from the archive.
+        blob_hashes: List[bytes] = list()
+        logger.debug("Querying archive for missing packed blobs...")
+        num_blobs = self._blob_types.count()
+        for obj_pos in self._blob_types.search(1):
+            id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos))
+            id_hash = hashutil.bytehex_to_hash(id_hex)
+            blob_hashes.append(id_hash)
+        missing_contents = set(self.storage.content_missing_per_sha1_git(blob_hashes))
+        self.log.debug(
+            "Number of packed blobs that are missing in storage: "
+            f"{len(missing_contents)} of {num_blobs} packed blobs total"
+        )
+
+        # This list is potentially quite big; drop it now that the query is done.
+        del blob_hashes
+
         root_tree_vertices = self._commit_graph.vs.select(
             object_type=GitObjectType.TREE
-        )
-        blob_hashes: Set[bytes] = set()
-        logger.debug("Loading blob object IDs...")
-        i = 0
-        last_p = -1
-        for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
-            p = int(i * 100 / len(root_tree_vertices))
-            if p != last_p:
-                logger.debug(f"Loading blob object IDs: {p}% of commits processed...")
-                last_p = p
-            i = i + 1
-            try:
-                dep_hashes = get_recursive_tree_dep_hashes(
-                    root_hash, obj_type=GitObjectType.BLOB
-                )
-                blob_hashes.update(dep_hashes)
-            except KeyError:
-                continue
-
-        p = int(i * 100 / len(root_tree_vertices))
-        if p != last_p:
-            logger.debug(f"Loading blob object IDs: {p}% of commits processed.")
-
-        # Find out which blobs are missing from the archive.
-        missing_contents = set(
-            self.storage.content_missing_per_sha1_git(list(blob_hashes))
-        )
-        self.log.debug(
-            "Number of packed blobs that are missing in storage: "
-            f"{len(missing_contents)} of {len(blob_hashes)} packed blobs total"
         )
-
         missing_directories: Set[bytes] = set()
         tree_hashes: Set[bytes] = set()
-        if len(missing_contents) == len(blob_hashes):
+        if len(missing_contents) == num_blobs:
             # If all blobs are missing then all trees are missing, too.
             for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
                 missing_directories.add(hashutil.hash_to_bytes(root_hash))