Commit Diff


commit - 7b0e91db76ad97ace322f6ffbb46cfa524e6eb9e
commit + 6c6f6338fe67e530e3ed5171fc67ad4754545d37
blob - 20f92a7854e1890e0ceed4a25721253321f2aafb
blob + 01d8160cd8b522113836973aa5fae67ad62cf054
--- requirements.txt
+++ requirements.txt
@@ -1,3 +1,4 @@
+bitstring
 dulwich >= 0.21.5
 retrying
 click
blob - 9f4075086dd293ffa2c6d58a2052c947c69dc292
blob + 4214f5df61cd0b898384d1262b1aa87f0586f532
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -28,10 +28,11 @@ from typing import (
     Type,
 )
 
+from bitstring import BitArray
 import dulwich.client
 from dulwich.object_store import ObjectStoreGraphWalker
-from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree
-from dulwich.pack import Pack, PackData, PackInflater, load_pack_index
+from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, hex_to_sha
+from dulwich.pack import Pack, PackData, PackInflater, bisect_find_sha, load_pack_index
 from igraph import Graph
 import urllib3.util
 
@@ -708,10 +709,10 @@ class GitLoader(BaseGitLoader):
             resolve_ext_ref=self._resolve_ext_ref,
         )
         packindex = load_pack_index(indexfile.name)
-        self.num_objects = len(packindex)
-        logger.info(f"Indexing pack file: 100% ({self.num_objects} objects)")
         self.pack = Pack.from_objects(packdata, packindex)
         self.pack.resolve_ext_ref = self._resolve_ext_ref
+        self.num_objects = len(self.pack.index)
+        logger.info(f"Indexing pack file: 100% ({self.num_objects} objects)")
 
         self.make_object_graph()
         return False
@@ -720,11 +721,36 @@ class GitLoader(BaseGitLoader):
         logger.info("Building packed objects graph")
 
         self._object_graph = Graph(directed=True)
+
+        def get_pos_in_index(id_hex):
+            """
+            Return the pack index table entry number for a given object ID.
+
+            Based on dulwich.pack.FilePackIndex._object_offset()
+            XXX: This reaches into Dulwich internals; need an API?
+            """
+            sha = hex_to_sha(id_hex)
+            idx = ord(sha[:1])
+            if idx == 0:
+                start = 0
+            else:
+                start = self.pack.index._fan_out_table[idx - 1]
+            end = self.pack.index._fan_out_table[idx]
+            return bisect_find_sha(start, end, sha, self.pack.index._unpack_name)
+
+        def mark_as_traversed(bitstr, id_hex):
+            bitstr.set(True, get_pos_in_index(id_hex))
+
+        def have_traversed(bitstr, id_hex):
+            n = get_pos_in_index(id_hex)
+            if n is None:
+                return False
+            return bitstr[n]
 
         # Find all tags, commits and corresponding tree roots in the pack file
         tags = {}
         commits = {}
-        traversed_commits = set()
+        traversed_objects = BitArray(length=self.num_objects)
         commit_edges = []
         new_trees = []
         new_blobs = []
@@ -779,9 +805,9 @@ class GitLoader(BaseGitLoader):
             while len(parents) > 0:
                 commit_hex = parents.pop(0)
                 commit_hash = hashutil.bytehex_to_hash(commit_hex)
-                if commit_hash in traversed_commits:
+                if have_traversed(traversed_objects, commit_hex):
                     continue
-                traversed_commits.add(commit_hash)
+                mark_as_traversed(traversed_objects, commit_hex)
                 try:
                     commit = self.pack[commit_hex]
                 except KeyError:
@@ -793,14 +819,14 @@ class GitLoader(BaseGitLoader):
                 tree_hash = hashutil.bytehex_to_hash(commit.tree)
                 commits[commit_hash] = tree_hash
                 for parent_hex in commit.parents:
+                    if have_traversed(traversed_objects, parent_hex):
+                        continue
                     if parent_hex in parents:
                         continue
                     if parent_hex not in self.pack:
                         continue
-                    parent_hash = hashutil.bytehex_to_hash(parent_hex)
-                    if parent_hash in traversed_commits:
-                        continue
                     parents.append(parent_hex)
+                    parent_hash = hashutil.bytehex_to_hash(parent_hex)
                     commit_edges.append((commit_hash, parent_hash))
 
         logger.debug(f"Found {len(tags)} tags and {len(commits)} commits")
@@ -832,8 +858,6 @@ class GitLoader(BaseGitLoader):
 
         # Populate the graph with trees and blobs
         new_edges = []
-        traversed_trees = set()
-        seen_blobs = set()
         submodule_mode = stat.S_IFDIR | stat.S_IFLNK
         i = 0
         num_objects_found = 0
@@ -847,10 +871,10 @@ class GitLoader(BaseGitLoader):
             subtrees = [tree_hash]
             while len(subtrees) > 0:
                 tree_hash = subtrees.pop(0)
-                if tree_hash in traversed_trees:
+                tree_hex = hashutil.hash_to_bytehex(tree_hash)
+                if have_traversed(traversed_objects, tree_hex):
                     continue
-                traversed_trees.add(tree_hash)
-                tree_hex = hashutil.hash_to_bytehex(tree_hash)
+                mark_as_traversed(traversed_objects, tree_hex)
                 try:
                     tree = self.pack[tree_hex]
                 except KeyError:
@@ -862,12 +886,12 @@ class GitLoader(BaseGitLoader):
                         continue
                     entry_hash = hashutil.bytehex_to_hash(entry_hex)
                     if mode & stat.S_IFDIR:
-                        if entry_hash not in traversed_trees:
+                        if not have_traversed(traversed_objects, entry_hex):
                             new_trees.append(entry_hash)
                             subtrees.append(entry_hash)
-                    elif entry_hash not in seen_blobs:
+                    elif not have_traversed(traversed_objects, entry_hex):
                         new_blobs.append(entry_hash)
-                        seen_blobs.add(entry_hash)
+                        mark_as_traversed(traversed_objects, entry_hex)
                     new_edges.append((tree_hash, entry_hash))
 
                 # add new vertices and edges in batches for performance reasons