Commit Diff


commit - 2d1a69eeca244bc64d106fff5b1a4e25fa1e1728
commit + 310666ca1b675f5b9ea18bd71bbda18b9f0da97d
blob - d19645d3dafd42ba011a0f16fa7dfb189c115c9e
blob + dfc9bd15c4a5c3dba256baa055aadfa2b051ffdf
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -908,15 +908,28 @@ class GitLoader(BaseGitLoader):
         new_edges = []
         submodule_mode = stat.S_IFDIR | stat.S_IFLNK
         i = 0
+        last_p = -1
         num_objects_found = 0
         num_tree_entries = 0
+        num_commits_total = len(commits.items())
         logger.info("Adding trees and blobs to the graph...")
-        for commit_hash, tree_hash in commits.items():
-            if tree_hash in self._tree_graphs.keys():
-                # Keep recently used trees cached in memory.
-                self._tree_graphs.move_to_end(tree_hash, last=False)
+        for commit_hash, root_tree_hash in commits.items():
+            # Show some debug progress output for very large datasets
+            p = int(i * 100 / num_commits_total)
+            if p != last_p:
+                logger.debug(f"Adding trees and blobs: {p}% of commits processed...")
+                last_p = p
+            i = i + 1
+            tree_hex = hashutil.hash_to_bytehex(root_tree_hash)
+            t = have_traversed(traversed_objects, tree_hex)
+            if t is None:
                 continue
-            if tree_hash in self._swapped_graphs.keys():
+            if t is True:
+                try:
+                    # Keep graphs for recently used trees cached in memory.
+                    self._tree_graphs.move_to_end(root_tree_hash, last=False)
+                except KeyError:
+                    pass
                 continue
             if len(self._tree_graphs) >= self._max_trees_in_mem:
                 (other_tree_hash, other_tree_graph) = self._tree_graphs.popitem(
@@ -926,14 +939,10 @@ class GitLoader(BaseGitLoader):
                 self._swapped_graphs[other_tree_hash] = swapper
                 swapper.swap_out()
             tree_graph = Graph(directed=True)
-            self._tree_graphs[tree_hash] = tree_graph
-            self._tree_graphs.move_to_end(tree_hash, last=False)
-            # Show some debug progress output for very large datasets
-            if i > 0 and i % 10000 == 0:
-                logger.debug(f"Adding trees and blobs: {i} commits processed...")
-            i = i + 1
-            new_trees.append(tree_hash)
-            subtrees = [tree_hash]
+            self._tree_graphs[root_tree_hash] = tree_graph
+            self._tree_graphs.move_to_end(root_tree_hash, last=False)
+            new_trees.append(root_tree_hash)
+            subtrees = [root_tree_hash]
             while len(subtrees) > 0:
                 tree_hash = subtrees.pop(0)
                 tree_hex = hashutil.hash_to_bytehex(tree_hash)