commit 3f6425401a22f5010346a180a91b7d65d4715c43 from: Stefan Sperling date: Wed Sep 11 11:16:55 2024 UTC use igraph to store our graph of packed git objects commit - 948ae59d4196b17d4098e50439b8cc7e728b308e commit + 3f6425401a22f5010346a180a91b7d65d4715c43 blob - f48e919dad45136bc0cbb922a3fd62bce0738448 blob + 39567770d496ea1588e9b7371ee6760215c3c598 --- mypy.ini +++ mypy.ini @@ -21,3 +21,6 @@ ignore_missing_imports = True [mypy-swh.loader.*] ignore_missing_imports = True + +[mypy-igraph.*] +ignore_missing_imports = True blob - 9c56eae6ad1a72bc41731f4faf585c7b631bde47 blob + 20f92a7854e1890e0ceed4a25721253321f2aafb --- requirements.txt +++ requirements.txt @@ -1,3 +1,4 @@ dulwich >= 0.21.5 retrying click +igraph blob - 0f2285d498628de6a1612b2b81a48554344f3de2 blob + 0ff1b4f9df05c766e3abee76e6306f8368a7e180 --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -32,6 +32,7 @@ from dulwich.object_store import ObjectStoreGraphWalke from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree from dulwich.pack import Pack, PackData, PackInflater, load_pack_index import urllib3.util +from igraph import Graph from swh.core.statsd import Statsd from swh.loader.exception import NotFound @@ -700,7 +701,7 @@ class GitLoader(BaseGitLoader): def make_object_graph(self): logger.debug("Building packed objects graph") - self._object_graph = {} + self._object_graph = Graph(directed=True) # Find all commits and corresponding tree roots in the pack file commits = {} @@ -726,42 +727,64 @@ class GitLoader(BaseGitLoader): tree_hash = hashutil.bytehex_to_hash(commit.tree) commit_hash = hashutil.bytehex_to_hash(ref_object_hex) commits[commit_hash] = tree_hash + + # Add root commits and trees to the graph + self._object_graph.add_vertices(list(commits.keys())) + self._object_graph.add_vertices(list(commits.values())) + self._object_graph.add_edges(zip(commits.keys(), commits.values())) + # Populate the graph with trees and blobs + new_vertices = [] + new_edges = [] for commit_hash, tree_hash in commits.items(): logger.debug( f"commit {hashutil.hash_to_hex(commit_hash)} " f"tree {hashutil.hash_to_hex(tree_hash)}" ) - self._object_graph[commit_hash] = list() - self._object_graph[commit_hash].append(tree_hash) - tree_hex = hashutil.hash_to_bytehex(tree_hash) - try: - tree = self.pack[tree_hex] - self._object_graph[tree_hash] = list() - except KeyError: - continue - subtrees = [tree] + subtrees = [tree_hash] while len(subtrees) > 0: - tree = subtrees.pop(0) + tree_hash = subtrees.pop(0) + tree_hex = hashutil.hash_to_bytehex(tree_hash) + try: + tree = self.pack[tree_hex] + except KeyError: + logger.debug(f" pack is missing: {tree_hex}") + continue logger.debug(f"Entries of {tree}:") for (name, mode, entry_hex) in tree.iteritems(): logger.debug(f" {name} {mode} {entry_hex}") entry_hash = hashutil.bytehex_to_hash(entry_hex) - self._object_graph[tree_hash].append(entry_hash) + new_vertices.append(entry_hash) + new_edges.append((tree_hash, entry_hash)) if mode & stat.S_IFDIR: try: tree = self.pack[entry_hex] - subtrees.append(tree) + subtrees.append(entry_hash) logger.debug(f" present in pack: {entry_hex}") except KeyError: logger.debug(f" pack is missing: {entry_hex}") - pass + # add new vertices and edges in batches for performance reasons + if len(new_vertices) > 100000 or len(new_edges) > 100000: + self._object_graph.add_vertices(new_vertices) + self._object_graph.add_edges(new_edges) + new_vertices = [] + new_edges = [] - for (k, l) in self._object_graph.items(): - logger.debug(f"object {hashutil.hash_to_hex(k)}") - for v in l: - logger.debug(f" child {hashutil.hash_to_hex(v)}") + self._object_graph.add_vertices(new_vertices) + self._object_graph.add_edges(new_edges) + for v in self._object_graph.vs: + name = hashutil.hash_to_bytehex(v["name"]) + predecessors = [] + vertices = [v] + while len(vertices) > 0: + v = vertices.pop(0) + for p in self._object_graph.predecessors(v): + pname = self._object_graph.vs["name"][p] + predecessors.append(hashutil.hash_to_bytehex(pname)) + vertices.append(p) + logger.debug(f"object {name} has predecessors {predecessors}") + def save_data(self) -> None: """Store a pack for archival""" assert isinstance(self.visit_date, datetime.datetime)