Commit Diff


commit - 91b0caf89ffbcd64c670ddc387741fe9e8901b5e
commit + cbe1c11028d08815f6840cc4b5fdaeafca82e9ca
blob - ffda1054c499b061edb13fc90d47c3550e64e74b
blob + 034ae31642dea4c2774d19b1ecd4b3546d3967a8
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -724,6 +724,7 @@ class GitLoader(BaseGitLoader):
         commits = {}
         commit_edges = []
         for ref_name, ref_object_hex in self.remote_refs.items():
+            logger.debug(f"Processing ref {ref_name}: {ref_object_hex}")
             if utils.ignore_branch_name(ref_name):
                 continue
             try:
@@ -751,9 +752,11 @@ class GitLoader(BaseGitLoader):
 
             # TODO: Allow tags pointing at blobs or trees?
             if obj is None or obj.type_name != b"commit":
+                logger.debug(f"{ref_object_hex} is not a commit: {obj!r}")
                 continue
 
             parents = [ref_object_hex]
+
             while len(parents) > 0:
                 commit_hex = parents.pop(0)
                 try:
@@ -761,14 +764,18 @@ class GitLoader(BaseGitLoader):
                 except KeyError:
                     logger.debug(f"  pack is missing: {commit_hex}")
                     continue
-                tree_hash = hashutil.bytehex_to_hash(commit.tree)
                 commit_hash = hashutil.bytehex_to_hash(commit_hex)
+                if commit_hash in commits:
+                    continue
+                tree_hash = hashutil.bytehex_to_hash(commit.tree)
                 commits[commit_hash] = tree_hash
                 for parent_hex in commit.parents:
                     if parent_hex in self.pack:
-                        parents.append(parent_hex)
+                        if parent_hex not in parents:
+                            parents.append(parent_hex)
                         parent_hash = hashutil.bytehex_to_hash(parent_hex)
-                        commit_edges.append((commit_hash, parent_hash))
+                        if (commit_hash, parent_hash) not in commit_edges:
+                            commit_edges.append((commit_hash, parent_hash))
 
         def add_vertices(new_vertices, object_type):
             attributes = dict()
@@ -790,10 +797,6 @@ class GitLoader(BaseGitLoader):
         traversed_trees = set()
         submodule_mode = stat.S_IFDIR | stat.S_IFLNK
         for commit_hash, tree_hash in commits.items():
-            logger.debug(
-                f"commit {hashutil.hash_to_hex(commit_hash)} "
-                f"tree {hashutil.hash_to_hex(tree_hash)}"
-            )
             subtrees = [tree_hash]
             while len(subtrees) > 0:
                 tree_hash = subtrees.pop(0)
@@ -806,9 +809,7 @@ class GitLoader(BaseGitLoader):
                 except KeyError:
                     logger.debug(f"  pack is missing: {tree_hex}")
                     continue
-                logger.debug(f"Entries of {tree}:")
                 for name, mode, entry_hex in tree.iteritems():
-                    logger.debug(f"  {name} {mode} {entry_hex}")
                     if mode & submodule_mode == submodule_mode:
                         continue  # ignore submodules
                     entry_hash = hashutil.bytehex_to_hash(entry_hex)
@@ -821,7 +822,6 @@ class GitLoader(BaseGitLoader):
                         try:
                             tree = self.pack[entry_hex]
                             subtrees.append(entry_hash)
-                            logger.debug(f"  present in pack: {entry_hex}")
                         except KeyError:
                             logger.debug(f"  pack is missing: {entry_hex}")
                 # add new vertices and edges in batches for performance reasons
@@ -841,18 +841,7 @@ class GitLoader(BaseGitLoader):
             add_vertices(new_blobs, GitObjectType.BLOB)
         self._object_graph.add_edges(new_edges)
 
-        for v in self._object_graph.vs:
-            name = hashutil.hash_to_bytehex(v["name"])
-            successors = []
-            vertices = [v]
-            object_type = v["object_type"]
-            while len(vertices) > 0:
-                v = vertices.pop(0)
-                for s in self._object_graph.successors(v):
-                    sname = self._object_graph.vs["name"][s]
-                    successors.append(hashutil.hash_to_bytehex(sname))
-                    vertices.append(s)
-            logger.debug(f"{object_type} {name} depends on {successors}")
+        logger.info("Commit graph has been built")
 
     def save_data(self) -> None:
         """Store a pack for archival"""
@@ -914,12 +903,13 @@ class GitLoader(BaseGitLoader):
                     "fetched pack file nor in local heads nor in the archive"
                 )
 
-        def get_dependencies(object_hash):
-            vertices = [self._object_graph.vs.find(name=object_hash)]
+        def get_dependencies(v):
+            vertices = [v]
             while len(vertices) > 0:
                 v = vertices.pop(0)
                 for s in self._object_graph.successors(v):
-                    vertices.append(s)
+                    if s not in vertices:
+                        vertices.append(s)
                     yield (
                         self._object_graph.vs[s]["name"],
                         self._object_graph.vs[s]["object_type"],
@@ -941,24 +931,38 @@ class GitLoader(BaseGitLoader):
 
         self.log.debug(
             "Number of packed blobs that are missing in storage: "
-            f"{len(missing_contents)}"
+            f"{len(missing_contents)} of {len(blob_vertices['name'])} packed blobs total"
         )
         tree_hashes: List[bytes] = []
-        if len(missing_contents) > 0:
-            missing_directories = set()
+        if missing_contents and len(missing_contents) == len(blob_vertices["name"]):
+            # If all blobs (at least one) are missing then all trees are missing, too.
+            tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
+            missing_directories = set(tree_hashes)
             self.log.debug(
-                "Searching for packed trees which depend on missing packed blobs"
+                "Number of packed trees considered missing by implication: "
+                f"{len(missing_directories)}"
             )
+        elif len(missing_contents) > 0:
+            missing_directories = set()
+            self.log.debug(
+                f"Searching {len(tree_vertices)} packed trees for trees which "
+                "depend on missing packed blobs"
+            )
+            nsearched = 0
             for t in tree_vertices:
                 tree_hash = t["name"]
+                nsearched += 1
+                if (nsearched % 5000) == 0:
+                    self.log.debug(
+                        f"Searched {int((nsearched * 100) / len(tree_vertices))}% "
+                        f"of {len(tree_vertices)} packed trees..."
+                    )
+                if tree_hash in missing_directories:
+                    continue
                 have_dep = False
-                for dep_hash, dep_type in get_dependencies(tree_hash):
+                for dep_hash, dep_type in get_dependencies(t):
                     have_dep = True
                     if dep_type == GitObjectType.BLOB and dep_hash in missing_contents:
-                        self.log.debug(
-                            f"tree {hashutil.hash_to_bytehex(tree_hash)!r} depends on "
-                            f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
-                        )
                         # We can infer that the tree is also missing.
                         missing_directories.add(tree_hash)
                         break
@@ -977,6 +981,10 @@ class GitLoader(BaseGitLoader):
         else:
             tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
             missing_directories = set(self.storage.directory_missing(tree_hashes))
+            self.log.debug(
+                "Number of missing trees according to archive query: "
+                f"{len(missing_directories)}"
+            )
 
         try:
             commit_vertices = self._object_graph.vs.select(
@@ -989,14 +997,19 @@ class GitLoader(BaseGitLoader):
             self.log.debug(
                 "Searching for packed commits which depend on missing packed blobs or trees"
             )
+            nsearched = 0
             for c in commit_vertices:
                 commit_hash = c["name"]
-                for dep_hash, dep_type in get_dependencies(commit_hash):
+                nsearched += 1
+                if (nsearched % 5000) == 0:
+                    self.log.debug(
+                        f"Searched {int((nsearched * 100) / len(commit_vertices))}% "
+                        f"of {len(commit_vertices)} packed commits..."
+                    )
+                if commit_hash in missing_revisions:
+                    continue
+                for dep_hash, dep_type in get_dependencies(c):
                     if dep_hash in missing_contents or dep_hash in missing_directories:
-                        self.log.debug(
-                            f"commit {hashutil.hash_to_bytehex(commit_hash)!r} depends on "
-                            f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
-                        )
                         # We can infer that the commit is also missing.
                         missing_revisions.add(commit_hash)
                         break
@@ -1035,16 +1048,12 @@ class GitLoader(BaseGitLoader):
             )
             for t in tag_vertices:
                 tag_hash = t["name"]
-                for dep_hash, dep_type in get_dependencies(tag_hash):
+                for dep_hash, dep_type in get_dependencies(t):
                     if (
                         dep_hash in missing_revisions
                         or dep_hash in missing_directories
                         or dep_hash in missing_contents
                     ):
-                        self.log.debug(
-                            f"tag {hashutil.hash_to_bytehex(tag_hash)!r} depends on "
-                            f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
-                        )
                         # We can infer that the tag is also missing.
                         missing_releases.add(tag_hash)
                         break