# Method of class GitLoader (swh/loader/git/loader.py). Per the commit that
# introduced it (948ae59d4196b17d4098e50439b8cc7e728b308e, Stefan Sperling,
# 2024-09-11, "build a graph of objects found in the pack file"), the graph is
# not used yet and the code is still WIP. That commit also adds the only call
# site, self.make_object_graph(), right after self.pack has been created and
# its resolve_ext_ref hook has been set.
def make_object_graph(self):
    """Build ``self._object_graph`` from the objects found in ``self.pack``.

    The graph is a dict mapping an object id to the list of its direct
    children:

    * a commit maps to a one-element list holding its root tree;
    * a tree that is present in the pack maps to the ids of its direct
      entries (files and subdirectories alike), in iteration order.

    Ids are whatever ``hashutil.bytehex_to_hash`` returns for the hex ids
    stored in the pack (presumably raw binary hashes -- confirm against
    ``hashutil``).

    A ref from ``self.remote_refs`` is skipped when
    ``utils.ignore_branch_name`` rejects its name, when its object is not in
    the pack, when a tag chain leads outside the pack, or when it does not
    resolve to a commit. Trees and subtrees missing from the pack are still
    listed as children but get no node of their own.

    Returns nothing; the result is stored on ``self._object_graph``.
    """
    # Function-scope imports keep this block self-contained; the module-level
    # import section is not visible from here.
    import logging
    from collections import deque

    logger.debug("Building packed objects graph")
    graph = {}
    self._object_graph = graph

    # Find all commits and corresponding tree roots in the pack file
    commits = {}
    for ref_name, ref_object_hex in self.remote_refs.items():
        if utils.ignore_branch_name(ref_name):
            continue
        try:
            obj = self.pack[ref_object_hex]
        except KeyError:
            continue
        logger.debug("Opened %s", obj)

        # Peel (possibly nested) annotated tags down to the tagged object.
        # obj.object is presumably the (type, hex id) pair of the tag target,
        # as exposed by dulwich's Tag -- the original code relies on this too.
        while obj is not None and obj.type_name == b"tag":
            try:
                obj = self.pack[obj.object[1]]
            except KeyError:
                obj = None
            else:
                logger.debug("Opened %s", obj)
        if obj is None or obj.type_name != b"commit":
            continue

        # Key the commit by its own id. The ref's id is wrong whenever the
        # ref points at an annotated tag that had to be peeled above.
        commit_hash = hashutil.bytehex_to_hash(obj.id)
        commits[commit_hash] = hashutil.bytehex_to_hash(obj.tree)

    for commit_hash, root_hash in commits.items():
        logger.debug(
            "commit %s tree %s",
            hashutil.hash_to_hex(commit_hash),
            hashutil.hash_to_hex(root_hash),
        )
        graph[commit_hash] = [root_hash]
        if root_hash in graph:
            # Root tree already walked for another commit.
            continue
        try:
            root_tree = self.pack[hashutil.hash_to_bytehex(root_hash)]
        except KeyError:
            continue
        graph[root_hash] = []

        # Breadth-first walk. Each queue item carries the tree's own hash so
        # that entries are recorded under the tree that directly contains
        # them rather than under the commit's root tree.
        pending = deque([(root_hash, root_tree)])
        while pending:
            parent_hash, tree = pending.popleft()
            logger.debug("Entries of %s:", tree)
            children = graph[parent_hash]
            for name, mode, entry_hex in tree.iteritems():
                logger.debug(" %s %s %s", name, mode, entry_hex)
                entry_hash = hashutil.bytehex_to_hash(entry_hex)
                children.append(entry_hash)
                # S_ISDIR compares the whole file-type field; a plain bit
                # test against S_IFDIR also matches gitlink (submodule)
                # entries, whose mode is 0o160000.
                if not stat.S_ISDIR(mode):
                    continue
                if entry_hash in graph:
                    # Subtree already walked or queued; identical directories
                    # share one node.
                    continue
                try:
                    subtree = self.pack[entry_hex]
                except KeyError:
                    logger.debug(" pack is missing: %s", entry_hex)
                    continue
                logger.debug(" present in pack: %s", entry_hex)
                graph[entry_hash] = []
                pending.append((entry_hash, subtree))

    # Dumping the whole graph is only worth the hex conversions when debug
    # logging is actually enabled.
    if logger.isEnabledFor(logging.DEBUG):
        for parent_hash, child_hashes in graph.items():
            logger.debug("object %s", hashutil.hash_to_hex(parent_hash))
            for child_hash in child_hashes:
                logger.debug(" child %s", hashutil.hash_to_hex(child_hash))