commit - aaff2b2add7c25bd5ab8c1ad8936e2c808b87e29
commit + 39b88888999abc64dd5ea2dd56e48bcc639fab15
blob - 7238aa354bc75c7c76d30de51c6aa8257b766e42
blob + c31af09d8a3d859fa7c78e67803100b9d74404ac
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
import collections
from dataclasses import dataclass
import datetime
+from enum import Enum
import json
import logging
import os
# The buffer didn't end with a newline, we need to keep the
# last bit as the beginning of the next line
return lines[:-1], lines[-1]
+
+
+class GitObjectType(Enum):
+    """Kind of Git object that a vertex of the object graph stands for.
+
+    NOTE(review): the numbering looks like the object type IDs used by the
+    Git pack format (1-4 for plain objects, 6-7 for deltas, 5 unused);
+    confirm against the pack format documentation before relying on the
+    raw integer values.
+    """
+
+    UNKNOWN = 0
+    COMMIT = 1
+    TREE = 2
+    BLOB = 3
+    TAG = 4
+    # 5 is reserved
+    OFFSET_DELTA = 6
+    REF_DELTA = 7
+
+    def __str__(self) -> str:
+        """Return the member name in lowercase, e.g. ``"commit"``."""
+        return self.name.lower()
class RepoRepresentation:
parents.append(parent_hex)
parent_hash = hashutil.bytehex_to_hash(parent_hex)
commit_edges.append((commit_hash, parent_hash))
+
+        def add_vertices(new_vertices, object_type):
+            """Add a batch of vertices that all share one ``object_type``.
+
+            ``new_vertices`` is the list handed to the graph's own
+            ``add_vertices``; every vertex in the batch gets ``object_type``
+            stored under the "object_type" vertex attribute.  An empty batch
+            is a no-op.
+            """
+            if not new_vertices:
+                return  # nothing to add; skip the graph call entirely
+            # One attribute value per new vertex, in the same order.
+            attributes = {"object_type": [object_type] * len(new_vertices)}
+            self._object_graph.add_vertices(new_vertices, attributes=attributes)
# Add commits and root trees to the graph
- self._object_graph.add_vertices(list(commits.keys()))
- self._object_graph.add_vertices(list(commits.values()))
+ add_vertices(list(commits.keys()), GitObjectType.COMMIT)
+ add_vertices(list(commits.values()), GitObjectType.TREE)
self._object_graph.add_edges(zip(commits.keys(), commits.values()))
self._object_graph.add_edges(commit_edges)
# Populate the graph with trees and blobs
- new_vertices = []
+ new_trees = []
+ new_blobs = []
new_edges = []
traversed_trees = set()
+ submodule_mode = stat.S_IFDIR | stat.S_IFLNK
for commit_hash, tree_hash in commits.items():
logger.debug(
f"commit {hashutil.hash_to_hex(commit_hash)} "
logger.debug(f"Entries of {tree}:")
for (name, mode, entry_hex) in tree.iteritems():
logger.debug(f" {name} {mode} {entry_hex}")
+ if mode & submodule_mode == submodule_mode:
+ continue # ignore submodules
entry_hash = hashutil.bytehex_to_hash(entry_hex)
- new_vertices.append(entry_hash)
+ if mode & stat.S_IFDIR:
+ new_trees.append(entry_hash)
+ else:
+ new_blobs.append(entry_hash)
new_edges.append((tree_hash, entry_hash))
if mode & stat.S_IFDIR:
- if mode & stat.S_IFLNK:
- continue # ignore submodules
try:
tree = self.pack[entry_hex]
subtrees.append(entry_hash)
except KeyError:
logger.debug(f" pack is missing: {entry_hex}")
# add new vertices and edges in batches for performance reasons
- if len(new_vertices) > 100000 or len(new_edges) > 100000:
- self._object_graph.add_vertices(new_vertices)
+ if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 100000:
+ if len(new_trees) > 0:
+ add_vertices(new_trees, GitObjectType.TREE)
+ if len(new_blobs) > 0:
+ add_vertices(new_blobs, GitObjectType.BLOB)
self._object_graph.add_edges(new_edges)
- new_vertices = []
+ new_trees = []
+ new_blobs = []
new_edges = []
- self._object_graph.add_vertices(new_vertices)
+ if len(new_trees) > 0:
+ add_vertices(new_trees, GitObjectType.TREE)
+ if len(new_blobs) > 0:
+ add_vertices(new_blobs, GitObjectType.BLOB)
self._object_graph.add_edges(new_edges)
for v in self._object_graph.vs:
name = hashutil.hash_to_bytehex(v["name"])
successors = []
vertices = [v]
+ object_type = v["object_type"]
while len(vertices) > 0:
v = vertices.pop(0)
for s in self._object_graph.successors(v):
sname = self._object_graph.vs["name"][s]
successors.append(hashutil.hash_to_bytehex(sname))
vertices.append(s)
- logger.debug(f"object {name} depends on {successors}")
+ logger.debug(f"{object_type} {name} depends on {successors}")
def save_data(self) -> None:
"""Store a pack for archival"""