commit 6c6f6338fe67e530e3ed5171fc67ad4754545d37 from: Stefan Sperling date: Thu Oct 3 08:59:13 2024 UTC use a bitstring to keep track of traversed object IDs The aim is to reduce the memory footprint of object graph construction commit - 7b0e91db76ad97ace322f6ffbb46cfa524e6eb9e commit + 6c6f6338fe67e530e3ed5171fc67ad4754545d37 blob - 20f92a7854e1890e0ceed4a25721253321f2aafb blob + 01d8160cd8b522113836973aa5fae67ad62cf054 --- requirements.txt +++ requirements.txt @@ -1,3 +1,4 @@ +bitstring dulwich >= 0.21.5 retrying click blob - 9f4075086dd293ffa2c6d58a2052c947c69dc292 blob + 4214f5df61cd0b898384d1262b1aa87f0586f532 --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -28,10 +28,11 @@ from typing import ( Type, ) +from bitstring import BitArray import dulwich.client from dulwich.object_store import ObjectStoreGraphWalker -from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree -from dulwich.pack import Pack, PackData, PackInflater, load_pack_index +from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree, hex_to_sha +from dulwich.pack import Pack, PackData, PackInflater, bisect_find_sha, load_pack_index from igraph import Graph import urllib3.util @@ -708,10 +709,10 @@ class GitLoader(BaseGitLoader): resolve_ext_ref=self._resolve_ext_ref, ) packindex = load_pack_index(indexfile.name) - self.num_objects = len(packindex) - logger.info(f"Indexing pack file: 100% ({self.num_objects} objects)") self.pack = Pack.from_objects(packdata, packindex) self.pack.resolve_ext_ref = self._resolve_ext_ref + self.num_objects = len(self.pack.index) + logger.info(f"Indexing pack file: 100% ({self.num_objects} objects)") self.make_object_graph() return False @@ -720,11 +721,36 @@ class GitLoader(BaseGitLoader): logger.info("Building packed objects graph") self._object_graph = Graph(directed=True) + + def get_pos_in_index(id_hex): + """ + Return the pack index table entry number for a given object ID. + + Based on dulwich.pack.FilePackIndex._object_offset() + XXX: This reaches into Dulwich internals; need an API? + """ + sha = hex_to_sha(id_hex) + idx = ord(sha[:1]) + if idx == 0: + start = 0 + else: + start = self.pack.index._fan_out_table[idx - 1] + end = self.pack.index._fan_out_table[idx] + return bisect_find_sha(start, end, sha, self.pack.index._unpack_name) + + def mark_as_traversed(bitstr, id_hex): + bitstr.set(True, get_pos_in_index(id_hex)) + + def have_traversed(bitstr, id_hex): + n = get_pos_in_index(id_hex) + if n is None: + return False + return bitstr[n] # Find all tags, commits and corresponding tree roots in the pack file tags = {} commits = {} - traversed_commits = set() + traversed_objects = BitArray(length=self.num_objects) commit_edges = [] new_trees = [] new_blobs = [] @@ -779,9 +805,9 @@ class GitLoader(BaseGitLoader): while len(parents) > 0: commit_hex = parents.pop(0) commit_hash = hashutil.bytehex_to_hash(commit_hex) - if commit_hash in traversed_commits: + if have_traversed(traversed_objects, commit_hex): continue - traversed_commits.add(commit_hash) + mark_as_traversed(traversed_objects, commit_hex) try: commit = self.pack[commit_hex] except KeyError: @@ -793,14 +819,14 @@ class GitLoader(BaseGitLoader): tree_hash = hashutil.bytehex_to_hash(commit.tree) commits[commit_hash] = tree_hash for parent_hex in commit.parents: + if have_traversed(traversed_objects, parent_hex): + continue if parent_hex in parents: continue if parent_hex not in self.pack: continue - parent_hash = hashutil.bytehex_to_hash(parent_hex) - if parent_hash in traversed_commits: - continue parents.append(parent_hex) + parent_hash = hashutil.bytehex_to_hash(parent_hex) commit_edges.append((commit_hash, parent_hash)) logger.debug(f"Found {len(tags)} tags and {len(commits)} commits") @@ -832,8 +858,6 @@ class GitLoader(BaseGitLoader): # Populate the graph with trees and blobs new_edges = [] - traversed_trees = set() - seen_blobs = set() submodule_mode = stat.S_IFDIR | stat.S_IFLNK i = 0 num_objects_found = 0 @@ -847,10 +871,10 @@ class GitLoader(BaseGitLoader): subtrees = [tree_hash] while len(subtrees) > 0: tree_hash = subtrees.pop(0) - if tree_hash in traversed_trees: + tree_hex = hashutil.hash_to_bytehex(tree_hash) + if have_traversed(traversed_objects, tree_hex): continue - traversed_trees.add(tree_hash) - tree_hex = hashutil.hash_to_bytehex(tree_hash) + mark_as_traversed(traversed_objects, tree_hex) try: tree = self.pack[tree_hex] except KeyError: @@ -862,12 +886,12 @@ class GitLoader(BaseGitLoader): continue entry_hash = hashutil.bytehex_to_hash(entry_hex) if mode & stat.S_IFDIR: - if entry_hash not in traversed_trees: + if not have_traversed(traversed_objects, entry_hex): new_trees.append(entry_hash) subtrees.append(entry_hash) - elif entry_hash not in seen_blobs: + elif not have_traversed(traversed_objects, entry_hex): new_blobs.append(entry_hash) - seen_blobs.add(entry_hash) + mark_as_traversed(traversed_objects, entry_hex) new_edges.append((tree_hash, entry_hash)) # add new vertices and edges in batches for performance reasons