Commit Diff


commit - aaff2b2add7c25bd5ab8c1ad8936e2c808b87e29
commit + 39b88888999abc64dd5ea2dd56e48bcc639fab15
blob - 7238aa354bc75c7c76d30de51c6aa8257b766e42
blob + c31af09d8a3d859fa7c78e67803100b9d74404ac
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -6,6 +6,7 @@
 import collections
 from dataclasses import dataclass
 import datetime
+from enum import Enum
 import json
 import logging
 import os
@@ -86,6 +87,20 @@ def split_lines_and_remainder(buf: bytes) -> Tuple[Lis
         # The buffer didn't end with a newline, we need to keep the
         # last bit as the beginning of the next line
         return lines[:-1], lines[-1]
+
+
+class GitObjectType(Enum):  # object kind stored as the "object_type" vertex attribute
+    UNKNOWN = 0
+    COMMIT = 1
+    TREE = 2
+    BLOB = 3
+    TAG = 4
+    # 5 is reserved (values appear to mirror Git's pack object type codes -- TODO confirm)
+    OFFSET_DELTA = 6
+    REF_DELTA = 7
+
+    def __str__(self):  # lower-cased member name, e.g. "commit"; shown in debug logs
+        return f"{self.name}".lower()
 
 
 class RepoRepresentation:
@@ -742,17 +757,24 @@ class GitLoader(BaseGitLoader):
                         parents.append(parent_hex)
                         parent_hash = hashutil.bytehex_to_hash(parent_hex)
                         commit_edges.append((commit_hash, parent_hash))
+
+        def add_vertices(new_vertices, object_type):  # add a batch of vertices that all share one type
+            attributes = dict()
+            attributes["object_type"] = [object_type for x in new_vertices]  # one entry per vertex
+            self._object_graph.add_vertices(new_vertices, attributes=attributes)
 
         # Add commits and root trees to the graph
-        self._object_graph.add_vertices(list(commits.keys()))
-        self._object_graph.add_vertices(list(commits.values()))
+        add_vertices(list(commits.keys()), GitObjectType.COMMIT)
+        add_vertices(list(commits.values()), GitObjectType.TREE)
         self._object_graph.add_edges(zip(commits.keys(), commits.values()))
         self._object_graph.add_edges(commit_edges)
 
         # Populate the graph with trees and blobs
-        new_vertices = []
+        new_trees = []
+        new_blobs = []
         new_edges = []
         traversed_trees = set()
+        submodule_mode = stat.S_IFDIR | stat.S_IFLNK
         for commit_hash, tree_hash in commits.items():
             logger.debug(
                 f"commit {hashutil.hash_to_hex(commit_hash)} "
@@ -773,12 +795,15 @@ class GitLoader(BaseGitLoader):
                 logger.debug(f"Entries of {tree}:")
                 for (name, mode, entry_hex) in tree.iteritems():
                     logger.debug(f"  {name} {mode} {entry_hex}")
+                    if mode & submodule_mode == submodule_mode:
+                        continue  # ignore submodules
                     entry_hash = hashutil.bytehex_to_hash(entry_hex)
-                    new_vertices.append(entry_hash)
+                    if mode & stat.S_IFDIR:
+                        new_trees.append(entry_hash)
+                    else:
+                        new_blobs.append(entry_hash)
                     new_edges.append((tree_hash, entry_hash))
                     if mode & stat.S_IFDIR:
-                        if mode & stat.S_IFLNK:
-                            continue  # ignore submodules
                         try:
                             tree = self.pack[entry_hex]
                             subtrees.append(entry_hash)
@@ -786,26 +811,34 @@ class GitLoader(BaseGitLoader):
                         except KeyError:
                             logger.debug(f"  pack is missing: {entry_hex}")
                 # add new vertices and edges in batches for performance reasons
-                if len(new_vertices) > 100000 or len(new_edges) > 100000:
-                    self._object_graph.add_vertices(new_vertices)
+                if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 100000:
+                    if len(new_trees) > 0:
+                        add_vertices(new_trees, GitObjectType.TREE)
+                    if len(new_blobs) > 0:
+                        add_vertices(new_blobs, GitObjectType.BLOB)
                     self._object_graph.add_edges(new_edges)
-                    new_vertices = []
+                    new_trees = []
+                    new_blobs = []
                     new_edges = []
 
-        self._object_graph.add_vertices(new_vertices)
+        if len(new_trees) > 0:
+            add_vertices(new_trees, GitObjectType.TREE)
+        if len(new_blobs) > 0:
+            add_vertices(new_blobs, GitObjectType.BLOB)
         self._object_graph.add_edges(new_edges)
 
         for v in self._object_graph.vs:
             name = hashutil.hash_to_bytehex(v["name"])
             successors = []
             vertices = [v]
+            object_type = v["object_type"]
             while len(vertices) > 0:
                 v = vertices.pop(0)
                 for s in self._object_graph.successors(v):
                     sname = self._object_graph.vs["name"][s]
                     successors.append(hashutil.hash_to_bytehex(sname))
                     vertices.append(s)
-            logger.debug(f"object {name} depends on {successors}")
+            logger.debug(f"{object_type} {name} depends on {successors}")
 
     def save_data(self) -> None:
         """Store a pack for archival"""