Commit Diff


commit - b1e497a6d0bae8ab541b45d1ffcd4b58be2c5b25
commit + 607a92048c2b18c77d6930d181b8326ec9158074
blob - e78ebf760ea71e81d5e57ff1b70f68d2b3d7b405
blob + e4f2f184e4687f11df9528fd46539d7873359b12
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -920,7 +920,9 @@ class GitLoader(BaseGitLoader):
             # Show some debug progress output for very large datasets
             p = int(i * 100 / num_commits_total)
             if p != last_p:
-                logger.debug(f"Adding trees and blobs: {p}% of commits processed...")
+                logger.debug(
+                    f"Adding trees and blobs to the graph: {p}% of commits processed..."
+                )
                 last_p = p
             i = i + 1
             tree_hex = hashutil.hash_to_bytehex(root_tree_hash)
@@ -996,6 +998,8 @@ class GitLoader(BaseGitLoader):
             logger.debug("No objects to load")
             return
 
+        logger.debug("Loading objects in topological order...")
+
         if self.save_data_path:
             self.save_data()
 
@@ -1086,7 +1090,15 @@ class GitLoader(BaseGitLoader):
             object_type=GitObjectType.TREE
         )
         blob_hashes: Set[bytes] = set()
+        logger.debug("Loading blob object IDs...")
+        i = 0
+        last_p = -1
         for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
+            p = int(i * 100 / len(root_tree_vertices))
+            if p != last_p:
+                logger.debug(f"Loading blob object IDs: {p}% of commits processed...")
+                last_p = p
+            i = i + 1
             try:
                 dep_hashes = get_recursive_tree_deps(
                     tree_deps, obj_type=GitObjectType.BLOB
@@ -1095,6 +1107,14 @@ class GitLoader(BaseGitLoader):
             except KeyError:
                 continue
 
+        # Final progress line. Guard the division: if no tree vertices were
+        # selected, the loop above never ran and the denominator is zero,
+        # which would raise ZeroDivisionError for a purely cosmetic message.
+        num_root_trees = len(root_tree_vertices)
+        p = int(i * 100 / num_root_trees) if num_root_trees else 100
+        if p != last_p:
+            logger.debug(f"Loading blob object IDs: {p}% of commits processed.")
+
         # Find out which blobs are missing from the archive.
         missing_contents = set(
             self.storage.content_missing_per_sha1_git(list(blob_hashes))