Commit Diff


commit - 948ae59d4196b17d4098e50439b8cc7e728b308e
commit + 3f6425401a22f5010346a180a91b7d65d4715c43
blob - f48e919dad45136bc0cbb922a3fd62bce0738448
blob + 39567770d496ea1588e9b7371ee6760215c3c598
--- mypy.ini
+++ mypy.ini
@@ -21,3 +21,6 @@ ignore_missing_imports = True
 
 [mypy-swh.loader.*]
 ignore_missing_imports = True
+
+[mypy-igraph.*]
+ignore_missing_imports = True
blob - 9c56eae6ad1a72bc41731f4faf585c7b631bde47
blob + 20f92a7854e1890e0ceed4a25721253321f2aafb
--- requirements.txt
+++ requirements.txt
@@ -1,3 +1,4 @@
 dulwich >= 0.21.5
 retrying
 click
+igraph
blob - 0f2285d498628de6a1612b2b81a48554344f3de2
blob + 0ff1b4f9df05c766e3abee76e6306f8368a7e180
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -32,6 +32,7 @@ from dulwich.object_store import ObjectStoreGraphWalke
 from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree
 from dulwich.pack import Pack, PackData, PackInflater, load_pack_index
 import urllib3.util
+from igraph import Graph
 
 from swh.core.statsd import Statsd
 from swh.loader.exception import NotFound
@@ -700,7 +701,7 @@ class GitLoader(BaseGitLoader):
     def make_object_graph(self):
         logger.debug("Building packed objects graph")
 
-        self._object_graph = {}
+        self._object_graph = Graph(directed=True)
 
         # Find all commits and corresponding tree roots in the pack file
         commits = {}
@@ -726,42 +727,64 @@ class GitLoader(BaseGitLoader):
             tree_hash = hashutil.bytehex_to_hash(commit.tree)
             commit_hash = hashutil.bytehex_to_hash(ref_object_hex)
             commits[commit_hash] = tree_hash
+
+        # Add root commits and trees to the graph
+        self._object_graph.add_vertices(list(commits.keys()))
+        self._object_graph.add_vertices(list(commits.values()))
+        self._object_graph.add_edges(zip(commits.keys(), commits.values()))
 
+        # Populate the graph with trees and blobs
+        new_vertices = []
+        new_edges = []
         for commit_hash, tree_hash in commits.items():
             logger.debug(
                 f"commit {hashutil.hash_to_hex(commit_hash)} "
                 f"tree {hashutil.hash_to_hex(tree_hash)}"
             )
-            self._object_graph[commit_hash] = list()
-            self._object_graph[commit_hash].append(tree_hash)
-            tree_hex = hashutil.hash_to_bytehex(tree_hash)
-            try:
-                tree = self.pack[tree_hex]
-                self._object_graph[tree_hash] = list()
-            except KeyError:
-                continue
-            subtrees = [tree]
+            subtrees = [tree_hash]
             while len(subtrees) > 0:
-                tree = subtrees.pop(0)
+                tree_hash = subtrees.pop(0)
+                tree_hex = hashutil.hash_to_bytehex(tree_hash)
+                try:
+                    tree = self.pack[tree_hex]
+                except KeyError:
+                    logger.debug(f"  pack is missing: {tree_hex}")
+                    continue
                 logger.debug(f"Entries of {tree}:")
                 for (name, mode, entry_hex) in tree.iteritems():
                     logger.debug(f"  {name} {mode} {entry_hex}")
                     entry_hash = hashutil.bytehex_to_hash(entry_hex)
-                    self._object_graph[tree_hash].append(entry_hash)
+                    new_vertices.append(entry_hash)
+                    new_edges.append((tree_hash, entry_hash))
                     if mode & stat.S_IFDIR:
                         try:
                             tree = self.pack[entry_hex]
-                            subtrees.append(tree)
+                            subtrees.append(entry_hash)
                             logger.debug(f"  present in pack: {entry_hex}")
                         except KeyError:
                             logger.debug(f"  pack is missing: {entry_hex}")
-                            pass
+                # add new vertices and edges in batches for performance reasons
+                if len(new_vertices) > 100000 or len(new_edges) > 100000:
+                    self._object_graph.add_vertices(new_vertices)
+                    self._object_graph.add_edges(new_edges)
+                    new_vertices = []
+                    new_edges = []
 
-        for (k, l) in self._object_graph.items():
-            logger.debug(f"object {hashutil.hash_to_hex(k)}")
-            for v in l:
-                logger.debug(f" child {hashutil.hash_to_hex(v)}")
+        self._object_graph.add_vertices(new_vertices)
+        self._object_graph.add_edges(new_edges)
 
+        for v in self._object_graph.vs:
+            name = hashutil.hash_to_bytehex(v["name"])
+            predecessors = []
+            vertices = [v]
+            while len(vertices) > 0:
+                v = vertices.pop(0)
+                for p in self._object_graph.predecessors(v):
+                    pname = self._object_graph.vs["name"][p]
+                    predecessors.append(hashutil.hash_to_bytehex(pname))
+                    vertices.append(p)
+            logger.debug(f"object {name} has predecessors {predecessors}")
+
     def save_data(self) -> None:
         """Store a pack for archival"""
         assert isinstance(self.visit_date, datetime.datetime)