commit - 7b0e91db76ad97ace322f6ffbb46cfa524e6eb9e
commit + 6c6f6338fe67e530e3ed5171fc67ad4754545d37
blob - 20f92a7854e1890e0ceed4a25721253321f2aafb
blob + 01d8160cd8b522113836973aa5fae67ad62cf054
--- requirements.txt
+++ requirements.txt
+bitstring
dulwich >= 0.21.5
retrying
click
blob - 9f4075086dd293ffa2c6d58a2052c947c69dc292
blob + 4214f5df61cd0b898384d1262b1aa87f0586f532
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
Type,
)
+from bitstring import BitArray
import dulwich.client
from dulwich.object_store import ObjectStoreGraphWalker
-from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree
-from dulwich.pack import Pack, PackData, PackInflater, load_pack_index
+from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, hex_to_sha
+from dulwich.pack import Pack, PackData, PackInflater, bisect_find_sha, load_pack_index
from igraph import Graph
import urllib3.util
resolve_ext_ref=self._resolve_ext_ref,
)
packindex = load_pack_index(indexfile.name)
- self.num_objects = len(packindex)
- logger.info(f"Indexing pack file: 100% ({self.num_objects} objects)")
self.pack = Pack.from_objects(packdata, packindex)
self.pack.resolve_ext_ref = self._resolve_ext_ref
+ self.num_objects = len(self.pack.index)
+ logger.info(f"Indexing pack file: 100% ({self.num_objects} objects)")
self.make_object_graph()
return False
logger.info("Building packed objects graph")
self._object_graph = Graph(directed=True)
+
+ def get_pos_in_index(id_hex):
+ """
+ Return the pack index table entry number for a given object ID.
+
+ Based on dulwich.pack.FilePackIndex._object_offset()
+ XXX: This reaches into Dulwich internals; need an API?
+ """
+ sha = hex_to_sha(id_hex)
+ idx = ord(sha[:1])
+ if idx == 0:
+ start = 0
+ else:
+ start = self.pack.index._fan_out_table[idx - 1]
+ end = self.pack.index._fan_out_table[idx]
+ return bisect_find_sha(start, end, sha, self.pack.index._unpack_name)
+
+ def mark_as_traversed(bitstr, id_hex):
+ bitstr.set(True, get_pos_in_index(id_hex))
+
+ def have_traversed(bitstr, id_hex):
+ n = get_pos_in_index(id_hex)
+ if n is None:
+ return False
+ return bitstr[n]
# Find all tags, commits and corresponding tree roots in the pack file
tags = {}
commits = {}
- traversed_commits = set()
+ traversed_objects = BitArray(length=self.num_objects)
commit_edges = []
new_trees = []
new_blobs = []
while len(parents) > 0:
commit_hex = parents.pop(0)
commit_hash = hashutil.bytehex_to_hash(commit_hex)
- if commit_hash in traversed_commits:
+ if have_traversed(traversed_objects, commit_hex):
continue
- traversed_commits.add(commit_hash)
+ mark_as_traversed(traversed_objects, commit_hex)
try:
commit = self.pack[commit_hex]
except KeyError:
tree_hash = hashutil.bytehex_to_hash(commit.tree)
commits[commit_hash] = tree_hash
for parent_hex in commit.parents:
+ if have_traversed(traversed_objects, parent_hex):
+ continue
if parent_hex in parents:
continue
if parent_hex not in self.pack:
continue
- parent_hash = hashutil.bytehex_to_hash(parent_hex)
- if parent_hash in traversed_commits:
- continue
parents.append(parent_hex)
+ parent_hash = hashutil.bytehex_to_hash(parent_hex)
commit_edges.append((commit_hash, parent_hash))
logger.debug(f"Found {len(tags)} tags and {len(commits)} commits")
# Populate the graph with trees and blobs
new_edges = []
- traversed_trees = set()
- seen_blobs = set()
submodule_mode = stat.S_IFDIR | stat.S_IFLNK
i = 0
num_objects_found = 0
subtrees = [tree_hash]
while len(subtrees) > 0:
tree_hash = subtrees.pop(0)
- if tree_hash in traversed_trees:
+ tree_hex = hashutil.hash_to_bytehex(tree_hash)
+ if have_traversed(traversed_objects, tree_hex):
continue
- traversed_trees.add(tree_hash)
- tree_hex = hashutil.hash_to_bytehex(tree_hash)
+ mark_as_traversed(traversed_objects, tree_hex)
try:
tree = self.pack[tree_hex]
except KeyError:
continue
entry_hash = hashutil.bytehex_to_hash(entry_hex)
if mode & stat.S_IFDIR:
- if entry_hash not in traversed_trees:
+ if not have_traversed(traversed_objects, entry_hex):
new_trees.append(entry_hash)
subtrees.append(entry_hash)
- elif entry_hash not in seen_blobs:
+ elif not have_traversed(traversed_objects, entry_hex):
new_blobs.append(entry_hash)
- seen_blobs.add(entry_hash)
+ mark_as_traversed(traversed_objects, entry_hex)
new_edges.append((tree_hash, entry_hash))
# add new vertices and edges in batches for performance reasons