Commit Diff


commit - 1d97ad9c20b782648ef07fe728e383ce961bf10f
commit + dbf83540eddcf41a2e363bf94cc276dfa3979658
blob - 2a70f0271a90a9aab9afec2d4363c0d2be9c5c03
blob + e74eb7a325c280b15419e396a916efd22d60e446
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -116,6 +116,7 @@ class RepoRepresentation:
         self.storage = storage
         self.incremental = incremental
         self.statsd = statsd
+        self.num_objects = 0
 
         if base_snapshots and incremental:
             self.base_snapshots: List[Snapshot] = base_snapshots
@@ -707,7 +708,8 @@ class GitLoader(BaseGitLoader):
             resolve_ext_ref=self._resolve_ext_ref,
         )
         packindex = load_pack_index(indexfile.name)
-        logger.info(f"Indexing pack file: 100% ({len(packindex)} objects)")
+        self.num_objects = len(packindex)
+        logger.info(f"Indexing pack file: 100% ({self.num_objects} objects)")
         self.pack = Pack.from_objects(packdata, packindex)
         self.pack.resolve_ext_ref = self._resolve_ext_ref
 
@@ -802,6 +804,9 @@ class GitLoader(BaseGitLoader):
 
         logger.debug(f"Found {len(tags)} tags and {len(commits)} commits")
 
+        # Calculate how many more objects we still need to add to the graph.
+        num_objects_left = self.num_objects - len(tags) - len(commits)
+
         archived_missing_objects = set(
             self.storage.object_find_by_sha1_git(missing_objects).keys()
         )
@@ -817,6 +822,7 @@ class GitLoader(BaseGitLoader):
             self._object_graph.add_vertices(new_vertices, attributes=attributes)
 
         # Add vertices for any directly referenced trees and blobs.
+        num_objects_left -= len(new_trees) + len(new_blobs)
         if len(new_trees) > 0:
             add_vertices(new_trees, GitObjectType.TREE)
             new_trees = []
@@ -838,10 +844,13 @@ class GitLoader(BaseGitLoader):
         seen_blobs = set()
         submodule_mode = stat.S_IFDIR | stat.S_IFLNK
         i = 0
+        num_objects_found = 0
+        num_tree_entries = 0
+        logger.info("Adding trees and blobs to the graph...")
         for commit_hash, tree_hash in commits.items():
             # Show some debug progress output for very large datasets
             if i > 0 and i % 10000 == 0:
-                logger.debug(f"Finding trees and blobs: {i} commits processed...")
+                logger.debug(f"Adding trees and blobs: {i} commits processed...")
             i = i + 1
             subtrees = [tree_hash]
             while len(subtrees) > 0:
@@ -869,26 +878,47 @@ class GitLoader(BaseGitLoader):
 
                 # add new vertices and edges in batches for performance reasons
                 if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 1000000:
-                    logger.debug(
-                        f"Adding {len(new_trees)} trees, {len(new_blobs)} blobs, "
-                        f"and {len(new_edges)} tree entries"
-                    )
                     if len(new_trees) > 0:
                         add_vertices(new_trees, GitObjectType.TREE)
                     if len(new_blobs) > 0:
                         add_vertices(new_blobs, GitObjectType.BLOB)
-                    self._object_graph.add_edges(new_edges)
+                    if len(new_edges) > 0:
+                        num_tree_entries += len(new_edges)
+                        logger.debug(
+                            f"Adding {len(new_edges)} tree entries to the graph "
+                            f"({num_tree_entries} entries total)"
+                        )
+                        self._object_graph.add_edges(new_edges)
+                    num_objects_found += len(new_trees) + len(new_blobs)
+                    # Skip the percentage when no trees or blobs were expected;
+                    # num_objects_left is then zero and dividing would raise.
+                    if num_objects_left > 0:
+                        percent = int((num_objects_found * 100) / num_objects_left)
+                        logger.debug(
+                            f"Added {percent}% of packed objects to the graph"
+                        )
                     new_trees = []
                     new_blobs = []
                     new_edges = []
 
+        num_objects_found += len(new_trees) + len(new_blobs)
         if len(new_trees) > 0:
             add_vertices(new_trees, GitObjectType.TREE)
         if len(new_blobs) > 0:
             add_vertices(new_blobs, GitObjectType.BLOB)
-        self._object_graph.add_edges(new_edges)
+        if len(new_edges) > 0:
+            num_tree_entries += len(new_edges)
+            logger.debug(
+                f"Adding {len(new_edges)} tree entries to the graph "
+                f"({num_tree_entries} entries total)"
+            )
+            self._object_graph.add_edges(new_edges)
 
-        logger.info("Commit graph has been built")
+        # A pack holding only commits and tags leaves num_objects_left at zero.
+        if num_objects_left > 0:
+            percent = int((num_objects_found * 100) / num_objects_left)
+            logger.info(f"Added {percent}% of packed objects to the graph")
+        logger.info("Packed objects graph has been built")
 
     def save_data(self) -> None:
         """Store a pack for archival"""