Commit Diff


commit - 5a2803734b8c024d4ab4db8c4f7527547539775f
commit + c683aeaacfbbc2bac3b2e42f576a38d94dca218a
blob - 0dac21502f69d6cd4b68005d9403f4bea472b1b5
blob + 8ae4a5d93145b969b9664e6f0476392c7f159c0d
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -765,25 +765,33 @@ class GitLoader(BaseGitLoader):
                 self.ref_object_types[obj.id] = SnapshotTargetType.REVISION
 
             parents = [ref_object_hex]
+            i = 0
             while len(parents) > 0:
                 commit_hex = parents.pop(0)
+                commit_hash = hashutil.bytehex_to_hash(commit_hex)
+                if commit_hash in commits.keys():
+                    continue
                 try:
                     commit = self.pack[commit_hex]
                 except KeyError:
                     continue
-                commit_hash = hashutil.bytehex_to_hash(commit_hex)
-                if commit_hash in commits.keys():
-                    continue
+                # Log progress every 100000 commits when walking very large histories
+                if i > 0 and i % 100000 == 0:
+                    logger.debug(f"{ref_name}: {i} commits processed...")
+                i = i + 1
                 tree_hash = hashutil.bytehex_to_hash(commit.tree)
                 commits[commit_hash] = tree_hash
                 for parent_hex in commit.parents:
-                    if parent_hex in self.pack:
-                        if parent_hex not in parents:
-                            parents.append(parent_hex)
-                        parent_hash = hashutil.bytehex_to_hash(parent_hex)
-                        if (commit_hash, parent_hash) not in commit_edges:
-                            commit_edges.append((commit_hash, parent_hash))
+                    if parent_hex in parents:
+                        continue
+                    parent_hash = hashutil.bytehex_to_hash(parent_hex)
+                    if parent_hash in commits.keys():
+                        continue
+                    parents.append(parent_hex)
+                    commit_edges.append((commit_hash, parent_hash))
 
+        logger.debug(f"Found {len(tags)} tags and {len(commits)} commits")
+
         archived_missing_objects = set(
             self.storage.object_find_by_sha1_git(missing_objects).keys()
         )