Commit Diff


commit - fc5f2bb353e14a8a06d1cd6bda751d2e02ac8915
commit + 4c3333066c370989f34d96881f559e3eee03391d
blob - 30c91308f9baf38141fb6b5d5cb280aab835b020
blob + 60b4e974f34091a8c67b4b796003d97e06477f43
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -1124,12 +1124,6 @@ class GitLoader(BaseGitLoader):
                     if graph.vs[s]["object_type"] != object_type:
                         continue
                 yield (graph.vs[s]["name"], graph.vs[s]["object_type"])
-
-        def iter_tree_deps(root_tree_vertices):
-            for t in root_tree_vertices:
-                tree_hash = t["name"]
-                tree_deps = get_tree_deps(tree_hash)
-                yield (tree_hash, tree_deps)
 
         def get_recursive_tree_dep_hashes(tree_hash, obj_type=None):
             recursive_deps = get_recursive_tree_deps(tree_hash)
@@ -1165,22 +1159,13 @@ class GitLoader(BaseGitLoader):
         # This list is potentially quite big.
         del blob_hashes
 
-        root_tree_vertices = self._commit_graph.vs.select(
-            object_type=GitObjectType.TREE
-        )
         missing_directories: Set[bytes] = set()
-        tree_hashes: Set[bytes] = set()
         if len(missing_contents) == num_blobs:
             # If all blobs are missing then all trees are missing, too.
-            for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
-                missing_directories.add(hashutil.hash_to_bytes(root_hash))
-                try:
-                    dep_hashes = get_recursive_tree_dep_hashes(
-                        root_hash, obj_type=GitObjectType.TREE
-                    )
-                    missing_directories.update(map(hashutil.hash_to_bytes, dep_hashes))
-                except KeyError:
-                    continue
+            for obj_pos in self._tree_types.search(1):
+                id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos))
+                id_hash = hashutil.bytehex_to_hash(id_hex)
+                missing_directories.add(hashutil.hash_to_bytes(id_hash))
             self.log.debug(
                 "Number of packed trees considered missing by implication: "
                 f"{len(missing_directories)}"
@@ -1190,55 +1175,30 @@ class GitLoader(BaseGitLoader):
             self.log.debug(
                 "Searching for packed trees which are missing from the archive..."
             )
-            i = 0
-            last_p = -1
-            num_commits_total = len(root_tree_vertices)
-            for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
-                # Show some debug progress output for very large datasets
-                p = int(i * 100 / num_commits_total)
-                if p != last_p:
-                    logger.debug(
-                        "Searching for packed trees which are missing from the archive: "
-                        f"{p}% of commits processed..."
+            for obj_pos in self._tree_types.search(1):
+                tree_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos))
+                tree_hash = hashutil.bytehex_to_hash(tree_hex)
+                is_missing = False
+                tree_deps = get_tree_deps(tree_hash)
+                for dep_pos in tree_deps.search(1):
+                    dep_hex = sha_to_hex(self.pack.index._unpack_name(dep_pos))
+                    dep_hash = hashutil.bytehex_to_hash(dep_hex)
+                    if dep_hash in missing_contents:
+                        # Content this tree depends on is missing.
+                        # We can infer that the tree itself is missing.
+                        missing_directories.add(hashutil.hash_to_bytes(tree_hash))
+                        is_missing = True
+                        break
+                if not is_missing:
+                    # This tree is either empty or has no known-missing dependencies.
+                    # Determine if it is missing by searching the archive.
+                    missing_tree = set(
+                        self.storage.directory_missing(
+                            [hashutil.hash_to_bytes(tree_hash)]
+                        )
                     )
-                    last_p = p
-                i = i + 1
-                subtrees = [root_hash]
-                while len(subtrees) > 0:
-                    tree_hash = subtrees.pop(0)
-                    if hashutil.hash_to_bytes(tree_hash) in missing_directories:
-                        continue
-                    is_missing = False
-                    for dep_hash, dep_type in get_successors(tree_hash):
-                        if dep_type == GitObjectType.TREE:
-                            if dep_hash not in subtrees:
-                                subtrees.append(dep_hash)
-                        elif (
-                            not is_missing
-                            and dep_type == GitObjectType.BLOB
-                            and dep_hash in missing_contents
-                        ):
-                            # Content this tree depends on is missing.
-                            # We can infer that the tree itself is missing.
-                            missing_directories.add(hashutil.hash_to_bytes(tree_hash))
-                            is_missing = True
-                    if not is_missing:
-                        # This tree is either empty or has no known-missing dependencies.
-                        # Determine if it is missing by searching the archive.
-                        missing_tree = set(
-                            self.storage.directory_missing(
-                                [hashutil.hash_to_bytes(tree_hash)]
-                            )
-                        )
-                        if len(missing_tree):
-                            missing_directories.add(hashutil.hash_to_bytes(tree_hash))
-            p = int(i * 100 / num_commits_total)
-            if p != last_p:
-                logger.debug(
-                    "Searching for packed trees which are missing from the archive: "
-                    f"{p}% of commits processed."
-                )
-                last_p = p
+                    if len(missing_tree):
+                        missing_directories.add(hashutil.hash_to_bytes(tree_hash))
             self.log.debug(
                 f"Number of packed trees found missing: {len(missing_directories)}"
             )
@@ -1246,16 +1206,12 @@ class GitLoader(BaseGitLoader):
             # All blobs are present. Any of these blobs might have been added
             # to the archive via imports of other origins. We must query the
             # archive for missing trees specific to the origin we are importing.
-            for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
-                tree_hashes.add(root_hash)
-                try:
-                    dep_hashes = get_recursive_tree_dep_hashes(
-                        root_hash, obj_type=GitObjectType.TREE
-                    )
-                    tree_hashes.add(dep_hashes)
-                except KeyError:
-                    continue
-            missing_directories = set(self.storage.directory_missing(list(tree_hashes)))
+            tree_hashes: List[bytes] = list()
+            for obj_pos in self._tree_types.search(1):
+                tree_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos))
+                tree_hash = hashutil.bytehex_to_hash(tree_hex)
+                tree_hashes.append(tree_hash)
+            missing_directories = set(self.storage.directory_missing(tree_hashes))
             self.log.debug(
                 "Number of missing trees according to archive query: "
                 f"{len(missing_directories)}"