Commit Diff


commit - 4f62e31e79cb1498c62ca45b0b9d9d72ef03a334
commit + fc5f2bb353e14a8a06d1cd6bda751d2e02ac8915
blob - f25600eaa4b77f39953dd90dd1e4ddecea853efa
blob + 30c91308f9baf38141fb6b5d5cb280aab835b020
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -1188,9 +1188,21 @@ class GitLoader(BaseGitLoader):
         elif len(missing_contents) > 0:
             # If a subset of blobs is missing then a subset of trees are missing, too.
             self.log.debug(
-                "Searching for packed trees which are missing from the archive "
+                "Searching for packed trees which are missing from the archive..."
             )
+            i = 0
+            last_p = -1
+            num_commits_total = len(root_tree_vertices)
             for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
+                # Log progress (useful on very large datasets) once per percent change
+                p = int(i * 100 / num_commits_total)
+                if p != last_p:
+                    logger.debug(
+                        "Searching for packed trees which are missing from the archive: "
+                        f"{p}% of commits processed..."
+                    )
+                    last_p = p
+                i = i + 1
                 subtrees = [root_hash]
                 while len(subtrees) > 0:
                     tree_hash = subtrees.pop(0)
@@ -1220,6 +1232,13 @@ class GitLoader(BaseGitLoader):
                         )
                         if len(missing_tree):
                             missing_directories.add(hashutil.hash_to_bytes(tree_hash))
+            # Final report; guard the division since the commit set may be empty.
+            p = int(i * 100 / num_commits_total) if num_commits_total else 100
+            if p != last_p:
+                logger.debug(
+                    "Searching for packed trees which are missing from the archive: "
+                    f"{p}% of commits processed."
+                )
             self.log.debug(
                 f"Number of packed trees found missing: {len(missing_directories)}"
             )