commit - 948ae59d4196b17d4098e50439b8cc7e728b308e
commit + 3f6425401a22f5010346a180a91b7d65d4715c43
blob - f48e919dad45136bc0cbb922a3fd62bce0738448
blob + 39567770d496ea1588e9b7371ee6760215c3c598
--- mypy.ini
+++ mypy.ini
[mypy-swh.loader.*]
ignore_missing_imports = True
+
+[mypy-igraph.*]
+ignore_missing_imports = True
blob - 9c56eae6ad1a72bc41731f4faf585c7b631bde47
blob + 20f92a7854e1890e0ceed4a25721253321f2aafb
--- requirements.txt
+++ requirements.txt
dulwich >= 0.21.5
retrying
click
+igraph
blob - 0f2285d498628de6a1612b2b81a48554344f3de2
blob + 0ff1b4f9df05c766e3abee76e6306f8368a7e180
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
from dulwich.objects import Blob, Commit, ShaFile, Tag, Tree
from dulwich.pack import Pack, PackData, PackInflater, load_pack_index
import urllib3.util
+from igraph import Graph
from swh.core.statsd import Statsd
from swh.loader.exception import NotFound
def make_object_graph(self):
logger.debug("Building packed objects graph")
- self._object_graph = {}
+ self._object_graph = Graph(directed=True)
# Find all commits and corresponding tree roots in the pack file
commits = {}
tree_hash = hashutil.bytehex_to_hash(commit.tree)
commit_hash = hashutil.bytehex_to_hash(ref_object_hex)
commits[commit_hash] = tree_hash
+
+ # Add root commits and trees to the graph
+ self._object_graph.add_vertices(list(commits.keys()))
+ self._object_graph.add_vertices(list(commits.values()))
+ self._object_graph.add_edges(zip(commits.keys(), commits.values()))
+ # Populate the graph with trees and blobs
+ new_vertices = []
+ new_edges = []
for commit_hash, tree_hash in commits.items():
logger.debug(
f"commit {hashutil.hash_to_hex(commit_hash)} "
f"tree {hashutil.hash_to_hex(tree_hash)}"
)
- self._object_graph[commit_hash] = list()
- self._object_graph[commit_hash].append(tree_hash)
- tree_hex = hashutil.hash_to_bytehex(tree_hash)
- try:
- tree = self.pack[tree_hex]
- self._object_graph[tree_hash] = list()
- except KeyError:
- continue
- subtrees = [tree]
+ subtrees = [tree_hash]
while len(subtrees) > 0:
- tree = subtrees.pop(0)
+ tree_hash = subtrees.pop(0)
+ tree_hex = hashutil.hash_to_bytehex(tree_hash)
+ try:
+ tree = self.pack[tree_hex]
+ except KeyError:
+ logger.debug(f" pack is missing: {tree_hex}")
+ continue
logger.debug(f"Entries of {tree}:")
for (name, mode, entry_hex) in tree.iteritems():
logger.debug(f" {name} {mode} {entry_hex}")
entry_hash = hashutil.bytehex_to_hash(entry_hex)
- self._object_graph[tree_hash].append(entry_hash)
+ new_vertices.append(entry_hash)
+ new_edges.append((tree_hash, entry_hash))
if mode & stat.S_IFDIR:
try:
tree = self.pack[entry_hex]
- subtrees.append(tree)
+ subtrees.append(entry_hash)
logger.debug(f" present in pack: {entry_hex}")
except KeyError:
logger.debug(f" pack is missing: {entry_hex}")
- pass
+ # add new vertices and edges in batches for performance reasons
+ if len(new_vertices) > 100000 or len(new_edges) > 100000:
+ self._object_graph.add_vertices(new_vertices)
+ self._object_graph.add_edges(new_edges)
+ new_vertices = []
+ new_edges = []
- for (k, l) in self._object_graph.items():
- logger.debug(f"object {hashutil.hash_to_hex(k)}")
- for v in l:
- logger.debug(f" child {hashutil.hash_to_hex(v)}")
+ self._object_graph.add_vertices(new_vertices)
+ self._object_graph.add_edges(new_edges)
+ for v in self._object_graph.vs:
+ name = hashutil.hash_to_bytehex(v["name"])
+ predecessors = []
+ vertices = [v]
+ while len(vertices) > 0:
+ v = vertices.pop(0)
+ for p in self._object_graph.predecessors(v):
+ pname = self._object_graph.vs["name"][p]
+ predecessors.append(hashutil.hash_to_bytehex(pname))
+ vertices.append(p)
+ logger.debug(f"object {name} has predecessors {predecessors}")
+
def save_data(self) -> None:
"""Store a pack for archival"""
assert isinstance(self.visit_date, datetime.datetime)