Commit Diff


commit - 4ff67682be16c48c098e0412aee07cabd74a1f27
commit + e3840f7501a7dbea6e169bd9c0d0fa8a49476af0
blob - ab9b86ab3ca21b10c0676ff6d55f269df436634e
blob + bd5c4b65d0adb1db728a1c1dc33c23318d91840e
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -811,6 +811,7 @@ class GitLoader(BaseGitLoader):
         new_blobs = []
         new_edges = []
         traversed_trees = set()
+        seen_blobs = set()
         submodule_mode = stat.S_IFDIR | stat.S_IFLNK
         for commit_hash, tree_hash in commits.items():
             subtrees = [tree_hash]
@@ -829,16 +830,14 @@ class GitLoader(BaseGitLoader):
                         continue  # ignore submodules
                     entry_hash = hashutil.bytehex_to_hash(entry_hex)
                     if mode & stat.S_IFDIR:
-                        new_trees.append(entry_hash)
-                    else:
+                        if entry_hash not in traversed_trees:
+                            new_trees.append(entry_hash)
+                            subtrees.append(entry_hash)
+                    elif entry_hash not in seen_blobs:
                         new_blobs.append(entry_hash)
+                        seen_blobs.add(entry_hash)
                     new_edges.append((tree_hash, entry_hash))
-                    if mode & stat.S_IFDIR:
-                        try:
-                            tree = self.pack[entry_hex]
-                            subtrees.append(entry_hash)
-                        except KeyError:
-                            pass
+
                 # add new vertices and edges in batches for performance reasons
                 if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 100000:
                     if len(new_trees) > 0: