commit - d9e70c37d9c1f727bf5c52714f64189717706d01
commit + 4f62e31e79cb1498c62ca45b0b9d9d72ef03a334
blob - faa42ebc8315cb686415f0039a27d1119f5ab6ec
blob + f25600eaa4b77f39953dd90dd1e4ddecea853efa
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
else:
raise TypeError(f"Unexpected object type: {obj_type}")
- # Load the set of all blob IDs found in the pack file.
+ # Collect the IDs of all blobs found in the pack file
+ # and find out which blobs are missing from the archive.
+ blob_hashes: List[bytes] = list()
+ logger.debug("Querying archive for missing packed blobs...")
+ num_blobs = self._blob_types.count()
+ for obj_pos in self._blob_types.search(1):
+ id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos))
+ id_hash = hashutil.bytehex_to_hash(id_hex)
+ blob_hashes.append(id_hash)
+ missing_contents = set(self.storage.content_missing_per_sha1_git(blob_hashes))
+ self.log.debug(
+ "Number of packed blobs that are missing in storage: "
+ f"{len(missing_contents)} of {num_blobs} packed blobs total"
+ )
+
+ # This list is potentially quite big.
+ del blob_hashes
+
root_tree_vertices = self._commit_graph.vs.select(
object_type=GitObjectType.TREE
- )
- blob_hashes: Set[bytes] = set()
- logger.debug("Loading blob object IDs...")
- i = 0
- last_p = -1
- for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
- p = int(i * 100 / len(root_tree_vertices))
- if p != last_p:
- logger.debug(f"Loading blob object IDs: {p}% of commits processed...")
- last_p = p
- i = i + 1
- try:
- dep_hashes = get_recursive_tree_dep_hashes(
- root_hash, obj_type=GitObjectType.BLOB
- )
- blob_hashes.update(dep_hashes)
- except KeyError:
- continue
-
- p = int(i * 100 / len(root_tree_vertices))
- if p != last_p:
- logger.debug(f"Loading blob object IDs: {p}% of commits processed.")
-
- # Find out which blobs are missing from the archive.
- missing_contents = set(
- self.storage.content_missing_per_sha1_git(list(blob_hashes))
- )
- self.log.debug(
- "Number of packed blobs that are missing in storage: "
- f"{len(missing_contents)} of {len(blob_hashes)} packed blobs total"
)
-
missing_directories: Set[bytes] = set()
tree_hashes: Set[bytes] = set()
- if len(missing_contents) == len(blob_hashes):
+ if len(missing_contents) == num_blobs:
# If all blobs are missing then all trees are missing, too.
for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
missing_directories.add(hashutil.hash_to_bytes(root_hash))