commit 39b88888999abc64dd5ea2dd56e48bcc639fab15 from: Stefan Sperling date: Wed Sep 11 11:16:55 2024 UTC keep track of Git object types in the graph of packed objects commit - aaff2b2add7c25bd5ab8c1ad8936e2c808b87e29 commit + 39b88888999abc64dd5ea2dd56e48bcc639fab15 blob - 7238aa354bc75c7c76d30de51c6aa8257b766e42 blob + c31af09d8a3d859fa7c78e67803100b9d74404ac --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -6,6 +6,7 @@ import collections from dataclasses import dataclass import datetime +from enum import Enum import json import logging import os @@ -86,6 +87,20 @@ def split_lines_and_remainder(buf: bytes) -> Tuple[Lis # The buffer didn't end with a newline, we need to keep the # last bit as the beginning of the next line return lines[:-1], lines[-1] + + +class GitObjectType(Enum): + UNKNOWN = 0 + COMMIT = 1 + TREE = 2 + BLOB = 3 + TAG = 4 + # 5 is reserved + OFFSET_DELTA = 6 + REF_DELTA = 7 + + def __str__(self): + return f"{self.name}".lower() class RepoRepresentation: @@ -742,17 +757,24 @@ class GitLoader(BaseGitLoader): parents.append(parent_hex) parent_hash = hashutil.bytehex_to_hash(parent_hex) commit_edges.append((commit_hash, parent_hash)) + + def add_vertices(new_vertices, object_type): + attributes = dict() + attributes["object_type"] = [object_type for x in new_vertices] + self._object_graph.add_vertices(new_vertices, attributes=attributes) # Add commits and root trees to the graph - self._object_graph.add_vertices(list(commits.keys())) - self._object_graph.add_vertices(list(commits.values())) + add_vertices(list(commits.keys()), GitObjectType.COMMIT) + add_vertices(list(commits.values()), GitObjectType.TREE) self._object_graph.add_edges(zip(commits.keys(), commits.values())) self._object_graph.add_edges(commit_edges) # Populate the graph with trees and blobs - new_vertices = [] + new_trees = [] + new_blobs = [] new_edges = [] traversed_trees = set() + submodule_mode = stat.S_IFDIR | stat.S_IFLNK for commit_hash, tree_hash in commits.items(): logger.debug( f"commit {hashutil.hash_to_hex(commit_hash)} " @@ -773,12 +795,15 @@ class GitLoader(BaseGitLoader): logger.debug(f"Entries of {tree}:") for (name, mode, entry_hex) in tree.iteritems(): logger.debug(f" {name} {mode} {entry_hex}") + if mode & submodule_mode == submodule_mode: + continue # ignore submodules entry_hash = hashutil.bytehex_to_hash(entry_hex) - new_vertices.append(entry_hash) + if mode & stat.S_IFDIR: + new_trees.append(entry_hash) + else: + new_blobs.append(entry_hash) new_edges.append((tree_hash, entry_hash)) if mode & stat.S_IFDIR: - if mode & stat.S_IFLNK: - continue # ignore submodules try: tree = self.pack[entry_hex] subtrees.append(entry_hash) @@ -786,26 +811,34 @@ class GitLoader(BaseGitLoader): except KeyError: logger.debug(f" pack is missing: {entry_hex}") # add new vertices and edges in batches for performance reasons - if len(new_vertices) > 100000 or len(new_edges) > 100000: - self._object_graph.add_vertices(new_vertices) + if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 100000: + if len(new_trees) > 0: + add_vertices(new_trees, GitObjectType.TREE) + if len(new_blobs) > 0: + add_vertices(new_blobs, GitObjectType.BLOB) self._object_graph.add_edges(new_edges) - new_vertices = [] + new_trees = [] + new_blobs = [] new_edges = [] - self._object_graph.add_vertices(new_vertices) + if len(new_trees) > 0: + add_vertices(new_trees, GitObjectType.TREE) + if len(new_blobs) > 0: + add_vertices(new_blobs, GitObjectType.BLOB) self._object_graph.add_edges(new_edges) for v in self._object_graph.vs: name = hashutil.hash_to_bytehex(v["name"]) successors = [] vertices = [v] + object_type = v["object_type"] while len(vertices) > 0: v = vertices.pop(0) for s in self._object_graph.successors(v): sname = self._object_graph.vs["name"][s] successors.append(hashutil.hash_to_bytehex(sname)) vertices.append(s) - logger.debug(f"object {name} depends on {successors}") + logger.debug(f"{object_type} {name} depends on {successors}") def save_data(self) -> None: """Store a pack for archival"""