commit - 1d97ad9c20b782648ef07fe728e383ce961bf10f
commit + dbf83540eddcf41a2e363bf94cc276dfa3979658
blob - 2a70f0271a90a9aab9afec2d4363c0d2be9c5c03
blob + e74eb7a325c280b15419e396a916efd22d60e446
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
self.storage = storage
self.incremental = incremental
self.statsd = statsd
+ self.num_objects = 0
if base_snapshots and incremental:
self.base_snapshots: List[Snapshot] = base_snapshots
resolve_ext_ref=self._resolve_ext_ref,
)
packindex = load_pack_index(indexfile.name)
- logger.info(f"Indexing pack file: 100% ({len(packindex)} objects)")
+ self.num_objects = len(packindex)
+ logger.info(f"Indexing pack file: 100% ({self.num_objects} objects)")
self.pack = Pack.from_objects(packdata, packindex)
self.pack.resolve_ext_ref = self._resolve_ext_ref
logger.debug(f"Found {len(tags)} tags and {len(commits)} commits")
+ # Calculate how many more objects we still need to add to the graph.
+ num_objects_left = self.num_objects - len(tags) - len(commits)
+
archived_missing_objects = set(
self.storage.object_find_by_sha1_git(missing_objects).keys()
)
self._object_graph.add_vertices(new_vertices, attributes=attributes)
# Add vertices for any directly referenced trees and blobs.
+ num_objects_left -= len(new_trees) + len(new_blobs)
if len(new_trees) > 0:
add_vertices(new_trees, GitObjectType.TREE)
new_trees = []
seen_blobs = set()
submodule_mode = stat.S_IFDIR | stat.S_IFLNK
i = 0
+ num_objects_found = 0
+ num_tree_entries = 0
+ logger.info("Adding trees and blobs to the graph...")
for commit_hash, tree_hash in commits.items():
# Show some debug progress output for very large datasets
if i > 0 and i % 10000 == 0:
- logger.debug(f"Finding trees and blobs: {i} commits processed...")
+ logger.debug(f"Adding trees and blobs: {i} commits processed...")
i = i + 1
subtrees = [tree_hash]
while len(subtrees) > 0:
# add new vertices and edges in batches for performance reasons
if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 1000000:
- logger.debug(
- f"Adding {len(new_trees)} trees, {len(new_blobs)} blobs, "
- f"and {len(new_edges)} tree entries"
- )
if len(new_trees) > 0:
add_vertices(new_trees, GitObjectType.TREE)
if len(new_blobs) > 0:
add_vertices(new_blobs, GitObjectType.BLOB)
- self._object_graph.add_edges(new_edges)
+ if len(new_edges) > 0:
+ num_tree_entries += len(new_edges)
+ logger.debug(
+ f"Adding {len(new_edges)} tree entries to the graph "
+ f"({num_tree_entries} entries total)"
+ )
+ self._object_graph.add_edges(new_edges)
+ num_objects_found += len(new_trees) + len(new_blobs)
+ logger.debug(
+ f"Added {int((num_objects_found * 100) / num_objects_left)}% "
+ "of packed objects to the graph"
+ )
new_trees = []
new_blobs = []
new_edges = []
+ num_objects_found += len(new_trees) + len(new_blobs)
if len(new_trees) > 0:
add_vertices(new_trees, GitObjectType.TREE)
if len(new_blobs) > 0:
add_vertices(new_blobs, GitObjectType.BLOB)
- self._object_graph.add_edges(new_edges)
+ if len(new_edges) > 0:
+ num_tree_entries += len(new_edges)
+ logger.debug(
+ f"Adding {len(new_edges)} tree entries to the graph "
+ f"({num_tree_entries} entries total)"
+ )
+ self._object_graph.add_edges(new_edges)
- logger.info("Commit graph has been built")
+ logger.info(
+ f"Added {int((num_objects_found * 100) / num_objects_left)}% "
+ "of packed objects to the graph"
+ )
+ logger.info("Packed objects graph has been built")
def save_data(self) -> None:
"""Store a pack for archival"""