Commit Diff


commit - a77c88011f70493a154286133edefe6cf85fec7e
commit + 4623907a1182b8baea3f03b1c0217ba6b2df2dda
blob - 2327196ec86d098b8aa391e9660d795e0cde887b
blob + ff402a66bfa97770b352653809ff7cc47e21724b
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -53,6 +53,7 @@ from swh.model.model import (
     RawExtrinsicMetadata,
     Release,
     Revision,
+    Sha1Git,
     SkippedContent,
     Snapshot,
     SnapshotBranch,
@@ -261,6 +262,10 @@ class GitLoader(BaseGitLoader):
             self.urllib3_extra_kwargs["cert_reqs"] = "CERT_NONE"
             self.requests_extra_kwargs["verify"] = False
         self.objects_seen: Set[bytes] = set()
+
+        self.vertex_ids: Dict[int, int] = {}
+        self.packidx_slots: Dict[int, int] = {}
+        self.vertex_cur = 0
 
     def fetch_pack_from_origin(
         self,
@@ -716,30 +721,46 @@ class GitLoader(BaseGitLoader):
 
         self.make_object_graph()
         return False
+
+    def idx_to_hex_id(self, idx):
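+        """Return the hex object ID stored at entry ``idx`` of the pack index."""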
+        logger.debug(f"looking up object at index {idx} of {self.num_objects}")
+        return hashutil.hash_to_bytehex(self.pack.index._unpack_name(idx))
+
+    def hex_id_to_idx(self, id_hex):
+        """
+        Return the pack index table entry number for a given object ID.
+
+        Based on dulwich.pack.FilePackIndex._object_offset()
+        XXX: This reaches into Dulwich internals; need an API?
+        """
+        sha = hex_to_sha(id_hex)
+        idx = ord(sha[:1])
+        if idx == 0:
+            start = 0
+        else:
+            start = self.pack.index._fan_out_table[idx - 1]
+        end = self.pack.index._fan_out_table[idx]
+        return bisect_find_sha(start, end, sha, self.pack.index._unpack_name)
 
+    def next_vertex_id(self, hex_id):
+        """Map pack file index slots to consecutive igrap vertex IDs
+        and vice versa..."""
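+        # igraph addresses vertices by consecutive integer IDs, so we hand
+        # out our own sequence and remember slot -> vertex ID (vertex_ids)
+        # as well as vertex ID -> slot (packidx_slots) for reverse lookups.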
+        slot = self.hex_id_to_idx(hex_id)
+        self.vertex_ids[slot] = self.vertex_cur
+        self.packidx_slots[self.vertex_cur] = slot
+        self.vertex_cur = self.vertex_cur + 1
+        return self.vertex_cur - 1
+
+    def vertex_to_packidx_slot(self, vertex_id):
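+        """Return the pack index slot recorded for the given igraph vertex ID."""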
+        return self.packidx_slots[vertex_id]
+
     def make_object_graph(self):
         logger.info("Building packed objects graph")
 
         self._object_graph = Graph(directed=True)
 
-        def get_pos_in_index(id_hex):
-            """
-            Return the pack index table entry number for a given object ID.
-
-            Based on dulwich.pack.FilePackIndex._object_offset()
-            XXX: This reaches into Dulwich internals; need an API?
-            """
-            sha = hex_to_sha(id_hex)
-            idx = ord(sha[:1])
-            if idx == 0:
-                start = 0
-            else:
-                start = self.pack.index._fan_out_table[idx - 1]
-            end = self.pack.index._fan_out_table[idx]
-            return bisect_find_sha(start, end, sha, self.pack.index._unpack_name)
-
         def mark_as_traversed(bitstr, id_hex):
-            bitstr.set(True, get_pos_in_index(id_hex))
+            bitstr.set(True, self.hex_id_to_idx(id_hex))
 
         def have_traversed(bitstr, id_hex):
             """
@@ -747,7 +768,7 @@ class GitLoader(BaseGitLoader):
             been traversed (i.e. already has its vertex added to the graph).
             Return None if the object is not present in the pack file.
             """
-            n = get_pos_in_index(id_hex)
+            n = self.hex_id_to_idx(id_hex)
             if n is None:
                 return None
             return bitstr[n]
@@ -770,7 +791,6 @@ class GitLoader(BaseGitLoader):
             while obj.type_name == b"tag":
                 if obj.id in self.ref_object_types:
                     self.ref_object_types[obj.id] = SnapshotTargetType.RELEASE
-                tag_hash = hashutil.bytehex_to_hash(ref_object_hex)
                 tagged_object_hex = obj.object[1]
                 try:
                     tagged_obj = self.pack[tagged_object_hex]
@@ -778,8 +798,7 @@ class GitLoader(BaseGitLoader):
                     obj = None
                     break
                 else:
-                    tagged_hash = hashutil.bytehex_to_hash(tagged_object_hex)
-                    tags[tag_hash] = tagged_hash
+                    tags[ref_object_hex] = tagged_object_hex
                     obj = tagged_obj
                     ref_object_hex = tagged_object_hex
 
@@ -789,10 +808,10 @@ class GitLoader(BaseGitLoader):
 
             # Handle refs or tags pointing at blobs or trees...
             if obj.type_name == b"blob":
-                new_blobs.append(hashutil.bytehex_to_hash(ref_object_hex))
+                new_blobs.append(self.next_vertex_id(ref_object_hex))
                 continue
             if obj.type_name == b"tree":
-                new_trees.append(hashutil.bytehex_to_hash(ref_object_hex))
+                new_trees.append(self.next_vertex_id(ref_object_hex))
                 continue
 
             # From here on out, we handle commits only
@@ -809,7 +828,6 @@ class GitLoader(BaseGitLoader):
             i = 0
             while len(parents) > 0:
                 commit_hex = parents.pop(0)
-                commit_hash = hashutil.bytehex_to_hash(commit_hex)
                 t = have_traversed(traversed_objects, commit_hex)
                 if t is None or t is True:
                     continue
@@ -822,19 +840,20 @@ class GitLoader(BaseGitLoader):
                 if i > 0 and i % 100000 == 0:
                     logger.debug(f"{ref_name}: {i} commits processed...")
                 i = i + 1
-                tree_hash = hashutil.bytehex_to_hash(commit.tree)
-                commits[commit_hash] = tree_hash
+                commits[commit_hex] = commit.tree
                 for parent_hex in commit.parents:
                     t = have_traversed(traversed_objects, parent_hex)
                     if t is None or t is True:
                         continue
                     if parent_hex in parents:
                         continue
-                    if parent_hex not in self.pack:
-                        continue
                     parents.append(parent_hex)
-                    parent_hash = hashutil.bytehex_to_hash(parent_hex)
-                    commit_edges.append((commit_hash, parent_hash))
+                    commit_edges.append(
+                        (
+                            self.vertex_ids[self.hex_id_to_idx(commit_hex)],
+                            self.vertex_ids[self.hex_id_to_idx(parent_hex)],
+                        )
+                    )
 
         logger.debug(f"Found {len(tags)} tags and {len(commits)} commits")
 
@@ -844,6 +863,7 @@ class GitLoader(BaseGitLoader):
         def add_vertices(new_vertices, object_type):
             attributes = dict()
             attributes["object_type"] = [object_type for x in new_vertices]
+            logger.debug(f"adding vertices {new_vertices} of type {object_type}")
             self._object_graph.add_vertices(new_vertices, attributes=attributes)
 
         # Add vertices for any directly referenced trees and blobs.
@@ -856,11 +876,23 @@ class GitLoader(BaseGitLoader):
             new_blobs = []
 
         # Add tags, commits and root trees to the graph
-        add_vertices(list(tags.keys()), GitObjectType.TAG)
-        add_vertices(list(commits.keys()), GitObjectType.COMMIT)
-        add_vertices(list(commits.values()), GitObjectType.TREE)
-        self._object_graph.add_edges(zip(tags.keys(), tags.values()))
-        self._object_graph.add_edges(zip(commits.keys(), commits.values()))
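+        # Vertices must be added to the graph in the same order in which
+        # next_vertex_id() handed out their IDs, so that igraph's vertex
+        # indices line up with our consecutive IDs.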
+        tag_vertices = list(map(self.next_vertex_id, tags.keys()))
+        add_vertices(tag_vertices, GitObjectType.TAG)
+        commit_vertices = list(map(self.next_vertex_id, commits.keys()))
+        add_vertices(commit_vertices, GitObjectType.COMMIT)
+        tree_vertices = list(map(self.next_vertex_id, commits.values()))
+        add_vertices(tree_vertices, GitObjectType.TREE)
+        # The tagged objects were already assigned vertex IDs above; look
+        # them up rather than allocating new ones.
+        tagged_vertices = [
+            self.vertex_ids[self.hex_id_to_idx(h)] for h in tags.values()
+        ]
+        logger.debug(
+            f"adding tag-tagged edges: {list(zip(tag_vertices, tagged_vertices))}"
+        )
+        self._object_graph.add_edges(list(zip(tag_vertices, tagged_vertices)))
+        logger.debug(
+            f"adding commit-tree edges: {list(zip(commit_vertices, tree_vertices))}"
+        )
+        self._object_graph.add_edges(list(zip(commit_vertices, tree_vertices)))
+        logger.debug(f"adding commit-commit edges: {commit_edges}")
         self._object_graph.add_edges(commit_edges)
 
         # Populate the graph with trees and blobs
@@ -870,38 +902,38 @@ class GitLoader(BaseGitLoader):
         num_objects_found = 0
         num_tree_entries = 0
         logger.info("Adding trees and blobs to the graph...")
-        for commit_hash, tree_hash in commits.items():
+        for commit_hex, tree_hex in commits.items():
             # Show some debug progress output for very large datasets
             if i > 0 and i % 10000 == 0:
                 logger.debug(f"Adding trees and blobs: {i} commits processed...")
             i = i + 1
-            subtrees = [tree_hash]
+            subtrees = [tree_hex]
             while len(subtrees) > 0:
-                tree_hash = subtrees.pop(0)
-                tree_hex = hashutil.hash_to_bytehex(tree_hash)
+                tree_hex = subtrees.pop(0)
                 t = have_traversed(traversed_objects, tree_hex)
                 if t is None or t is True:
                     continue
                 mark_as_traversed(traversed_objects, tree_hex)
-                try:
-                    tree = self.pack[tree_hex]
-                except KeyError:
-                    continue
+                tree = self.pack[tree_hex]
                 for name, mode, entry_hex in tree.iteritems():
                     if mode & submodule_mode == submodule_mode:
                         continue  # ignore submodules
                     t = have_traversed(traversed_objects, entry_hex)
                     if t is None:
                         continue  # not present in pack file
-                    entry_hash = hashutil.bytehex_to_hash(entry_hex)
-                    new_edges.append((tree_hash, entry_hash))
                     if t is False:
                         if mode & stat.S_IFDIR:
-                            new_trees.append(entry_hash)
-                            subtrees.append(entry_hash)
+                            new_trees.append(self.next_vertex_id(entry_hex))
+                            subtrees.append(entry_hex)
                         else:
-                            new_blobs.append(entry_hash)
+                            new_blobs.append(self.next_vertex_id(entry_hex))
                             mark_as_traversed(traversed_objects, entry_hex)
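+                    # Record the tree -> entry edge regardless of whether the
+                    # entry was already traversed, so every tree vertex links
+                    # to all of its children present in the pack file.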
+                    new_edges.append(
+                        (
+                            self.vertex_ids[self.hex_id_to_idx(tree_hex)],
+                            self.vertex_ids[self.hex_id_to_idx(entry_hex)],
+                        )
+                    )
 
                 # add new vertices and edges in batches for performance reasons
                 if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 1000000:
@@ -915,6 +947,7 @@ class GitLoader(BaseGitLoader):
                             f"Adding {len(new_edges)} tree entries to the graph "
                             f"({num_tree_entries} entries total)"
                         )
+                        logger.debug(f"adding tree-entry edges: {new_edges}")
                         self._object_graph.add_edges(new_edges)
                     num_objects_found += len(new_trees) + len(new_blobs)
                     logger.debug(
@@ -936,6 +969,7 @@ class GitLoader(BaseGitLoader):
                 f"Adding {len(new_edges)} tree entries to the graph "
                 f"({num_tree_entries} entries total)"
             )
+            logger.debug(f"adding tree-entry edges: {new_edges}")
             self._object_graph.add_edges(new_edges)
 
         logger.info(
@@ -1015,42 +1049,66 @@ class GitLoader(BaseGitLoader):
                     if s not in vertices:
                         vertices.append(s)
                     yield (
-                        self._object_graph.vs[s]["name"],
+                        hashutil.bytehex_to_hash(
+                            self.idx_to_hex_id(self.packidx_slots[s])
+                        ),
                         self._object_graph.vs[s]["object_type"],
                     )
 
         def get_neighbors(v, object_type=None):
             for s in self._object_graph.neighbors(v):
+                logger.debug(f"neighbor {s}")
                 if object_type is not None:
                     if self._object_graph.vs[s]["object_type"] != object_type:
+                        logger.debug(f"object type expected: {object_type}")
+                        logger.debug(
+                            f"object type actual: {self._object_graph.vs[s]['object_type']}"
+                        )
                         continue
                 yield (
-                    self._object_graph.vs[s]["name"],
+                    hashutil.bytehex_to_hash(self.idx_to_hex_id(self.packidx_slots[s])),
                     self._object_graph.vs[s]["object_type"],
                 )
 
+        def map_idx_to_hex_id(idx):
+            logger.debug(f"asking for hex ID of idx {idx}")
+            return self.idx_to_hex_id(idx)
+
+        def map_vertex_to_packidx_slot(vertex):
+            return self.vertex_to_packidx_slot(vertex)
+
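+        # To query the archive, translate graph vertices back to object IDs:
+        # vertex "name" (our consecutive ID) -> pack index slot -> hex ID.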
+        blob_hex_ids: List[Sha1Git] = []
         try:
             blob_vertices = self._object_graph.vs.select(object_type=GitObjectType.BLOB)
+            blob_packidx_slots = list(
+                map(map_vertex_to_packidx_slot, blob_vertices["name"])
+            )
+            blob_hex_ids = list(map(map_idx_to_hex_id, blob_packidx_slots))
         except KeyError:
             missing_contents = set()
         else:
             missing_contents = set(
-                self.storage.content_missing_per_sha1_git(blob_vertices["name"])
+                self.storage.content_missing_per_sha1_git(
+                    [hashutil.bytehex_to_hash(b) for b in blob_hex_ids]
+                )
             )
+        logger.debug(f"missing contents: {missing_contents}")
 
+        tree_hex_ids: List[Sha1Git] = []
         try:
             tree_vertices = self._object_graph.vs.select(object_type=GitObjectType.TREE)
+            tree_packidx_slots = list(
+                map(map_vertex_to_packidx_slot, tree_vertices["name"])
+            )
+            tree_hex_ids = list(map(map_idx_to_hex_id, tree_packidx_slots))
         except KeyError:
-            tree_vertices = set()
+            tree_hex_ids = list()
 
         self.log.debug(
             "Number of packed blobs that are missing in storage: "
-            f"{len(missing_contents)} of {len(blob_vertices['name'])} packed blobs total"
+            f"{len(missing_contents)} of {len(blob_hex_ids)} packed blobs total"
         )
-        tree_hashes: List[bytes] = []
-        if len(missing_contents) == len(blob_vertices["name"]):
+        if len(missing_contents) == len(blob_hex_ids):
             # If all blobs are missing then all trees are missing, too.
-            tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
+            tree_hashes = [hashutil.bytehex_to_hash(t) for t in tree_hex_ids]
             missing_directories = set(tree_hashes)
             self.log.debug(
                 "Number of packed trees considered missing by implication: "
@@ -1063,8 +1121,8 @@ class GitLoader(BaseGitLoader):
                 "depend on missing packed blobs"
             )
             nsearched = 0
-            for t in tree_vertices:
-                tree_hash = t["name"]
+            for tree_hex in tree_hex_ids:
+                tree_hash = hashutil.bytehex_to_hash(tree_hex)
                 nsearched += 1
                 if (nsearched % 5000) == 0:
                     self.log.debug(
@@ -1074,7 +1132,8 @@ class GitLoader(BaseGitLoader):
                 if tree_hash in missing_directories:
                     continue
                 have_dep = False
-                for dep_hash, dep_type in get_successors(t):
+                v = self.vertex_ids[self.hex_id_to_idx(tree_hex)]
+                for dep_hash, dep_type in get_successors(v):
                     have_dep = True
                     if dep_type == GitObjectType.BLOB and dep_hash in missing_contents:
                         # We can infer that the tree is also missing.
@@ -1082,10 +1141,8 @@ class GitLoader(BaseGitLoader):
                         break
                 if not have_dep:
                     # An empty tree has no dependencies. Determine if it is missing.
-                    tree_hashes = [hashutil.hash_to_bytes(tree_hash)]
-                    missing_empty_tree = set(
-                        self.storage.directory_missing(tree_hashes)
-                    )
+                    tree_ids = [tree_hash]
+                    missing_empty_tree = set(self.storage.directory_missing(tree_ids))
                     if len(missing_empty_tree):
                         missing_directories.add(tree_hash)
             self.log.debug(
@@ -1097,27 +1154,33 @@ class GitLoader(BaseGitLoader):
                 f"{len(missing_directories)}"
             )
         else:
-            tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
+            tree_hashes = [hashutil.bytehex_to_hash(t) for t in tree_hex_ids]
             missing_directories = set(self.storage.directory_missing(tree_hashes))
             self.log.debug(
                 "Number of missing trees according to archive query: "
                 f"{len(missing_directories)}"
             )
 
+        commit_hex_ids: List[Sha1Git] = []
         try:
             commit_vertices = self._object_graph.vs.select(
                 object_type=GitObjectType.COMMIT
             )
+            commit_packidx_slots = list(
+                map(map_vertex_to_packidx_slot, commit_vertices["name"])
+            )
+            commit_hex_ids = list(map(map_idx_to_hex_id, commit_packidx_slots))
         except KeyError:
-            commit_vertices = set()
+            commit_hex_ids = list()
+
         missing_revisions = set()
         if len(missing_contents) > 0 or len(missing_directories) > 0:
             self.log.debug(
                 "Searching for packed commits which depend on missing packed blobs or trees"
             )
             nsearched = 0
-            for c in commit_vertices:
-                commit_hash = c["name"]
+            for commit_hex in commit_hex_ids:
+                commit_hash = hashutil.bytehex_to_hash(commit_hex)
                 nsearched += 1
                 if (nsearched % 5000) == 0:
                     self.log.debug(
@@ -1126,7 +1189,10 @@ class GitLoader(BaseGitLoader):
                     )
                 if commit_hash in missing_revisions:
                     continue
-                for dep_hash, dep_type in get_neighbors(c, GitObjectType.TREE):
+                v = self.vertex_ids[self.hex_id_to_idx(commit_hex)]
+                logger.debug(f"checking commit vertex {v}")
+                for dep_hash, dep_type in get_neighbors(v, GitObjectType.TREE):
+                    logger.debug(f"dep_hash: {dep_hash}")
                     if dep_hash in missing_contents or dep_hash in missing_directories:
                         # We can infer that the commit is also missing.
                         missing_revisions.add(commit_hash)
@@ -1140,11 +1206,11 @@ class GitLoader(BaseGitLoader):
                 f"{len(missing_revisions)}"
             )
         else:
-            commit_hashes = [c["name"] for c in commit_vertices]
+            commit_hashes = [hashutil.bytehex_to_hash(c) for c in commit_hex_ids]
             missing_revisions = set(self.storage.revision_missing(commit_hashes))
 
         for blob_hash in missing_contents:
-            obj = self.pack[hashutil.hash_to_bytehex(blob_hash)]
+            obj = self.pack[hashutil.hash_to_bytes(blob_hash)]
             if obj.id in self.ref_object_types:
                 self.ref_object_types[obj.id] = SnapshotTargetType.CONTENT
             content = converters.dulwich_blob_to_content(