Commit Diff


commit - d90389e678229d944617785754abf30f2f31ce80
commit + d30fbeafa2087c29c558a7e1f539f6aae480a2c2
blob - f26d6c96a73198205002c075fbced564d61970bd
blob + a66297ed71f87b2d38f96647b63a6c473c0534da
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -837,7 +837,12 @@ class GitLoader(BaseGitLoader):
         traversed_trees = set()
         seen_blobs = set()
         submodule_mode = stat.S_IFDIR | stat.S_IFLNK
+        i = 0
         for commit_hash, tree_hash in commits.items():
+            # Show some debug progress output for very large datasets
+            if i > 0 and i % 10000 == 0:
+                logger.debug(f"Finding trees and blobs: {i} commits processed...")
+            i = i + 1
             subtrees = [tree_hash]
             while len(subtrees) > 0:
                 tree_hash = subtrees.pop(0)
@@ -864,6 +869,10 @@ class GitLoader(BaseGitLoader):
 
                 # add new vertices and edges in batches for performance reasons
                 if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 100000:
+                    logger.debug(
+                        f"Found {len(new_trees)} new trees and {len(new_blobs)} new blobs "
+                        f"and {len(new_edges)} new tree entries"
+                    )
                     if len(new_trees) > 0:
                         add_vertices(new_trees, GitObjectType.TREE)
                     if len(new_blobs) > 0: