commit - fc5f2bb353e14a8a06d1cd6bda751d2e02ac8915
commit + 4c3333066c370989f34d96881f559e3eee03391d
blob - 30c91308f9baf38141fb6b5d5cb280aab835b020
blob + 60b4e974f34091a8c67b4b796003d97e06477f43
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
if graph.vs[s]["object_type"] != object_type:
continue
yield (graph.vs[s]["name"], graph.vs[s]["object_type"])
-
- def iter_tree_deps(root_tree_vertices):
- for t in root_tree_vertices:
- tree_hash = t["name"]
- tree_deps = get_tree_deps(tree_hash)
- yield (tree_hash, tree_deps)
def get_recursive_tree_dep_hashes(tree_hash, obj_type=None):
recursive_deps = get_recursive_tree_deps(tree_hash)
# This list is potentially quite big.
del blob_hashes
- root_tree_vertices = self._commit_graph.vs.select(
- object_type=GitObjectType.TREE
- )
missing_directories: Set[bytes] = set()
- tree_hashes: Set[bytes] = set()
if len(missing_contents) == num_blobs:
# If all blobs are missing then all trees are missing, too.
- for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
- missing_directories.add(hashutil.hash_to_bytes(root_hash))
- try:
- dep_hashes = get_recursive_tree_dep_hashes(
- root_hash, obj_type=GitObjectType.TREE
- )
- missing_directories.update(map(hashutil.hash_to_bytes, dep_hashes))
- except KeyError:
- continue
+ for obj_pos in self._tree_types.search(1):
+ id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos))
+ id_hash = hashutil.bytehex_to_hash(id_hex)
+ missing_directories.add(hashutil.hash_to_bytes(id_hash))
self.log.debug(
"Number of packed trees considered missing by implication: "
f"{len(missing_directories)}"
self.log.debug(
"Searching for packed trees which are missing from the archive..."
)
- i = 0
- last_p = -1
- num_commits_total = len(root_tree_vertices)
- for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
- # Show some debug progress output for very large datasets
- p = int(i * 100 / num_commits_total)
- if p != last_p:
- logger.debug(
- "Searching for packed trees which are missing from the archive: "
- f"{p}% of commits processed..."
+ for obj_pos in self._tree_types.search(1):
+ tree_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos))
+ tree_hash = hashutil.bytehex_to_hash(tree_hex)
+ is_missing = False
+ tree_deps = get_tree_deps(tree_hash)
+ for dep_pos in tree_deps.search(1):
+ dep_hex = sha_to_hex(self.pack.index._unpack_name(dep_pos))
+ dep_hash = hashutil.bytehex_to_hash(dep_hex)
+ if dep_hash in missing_contents:
+ # Content this tree depends on is missing.
+ # We can infer that the tree itself is missing.
+ missing_directories.add(hashutil.hash_to_bytes(tree_hash))
+ is_missing = True
+ break
+ if not is_missing:
+ # This tree is either empty or has no known-missing dependencies.
+ # Determine if it is missing by searching the archive.
+ missing_tree = set(
+ self.storage.directory_missing(
+ [hashutil.hash_to_bytes(tree_hash)]
+ )
)
- last_p = p
- i = i + 1
- subtrees = [root_hash]
- while len(subtrees) > 0:
- tree_hash = subtrees.pop(0)
- if hashutil.hash_to_bytes(tree_hash) in missing_directories:
- continue
- is_missing = False
- for dep_hash, dep_type in get_successors(tree_hash):
- if dep_type == GitObjectType.TREE:
- if dep_hash not in subtrees:
- subtrees.append(dep_hash)
- elif (
- not is_missing
- and dep_type == GitObjectType.BLOB
- and dep_hash in missing_contents
- ):
- # Content this tree depends on is missing.
- # We can infer that the tree itself is missing.
- missing_directories.add(hashutil.hash_to_bytes(tree_hash))
- is_missing = True
- if not is_missing:
- # This tree is either empty or has no known-missing dependencies.
- # Determine if it is missing by searching the archive.
- missing_tree = set(
- self.storage.directory_missing(
- [hashutil.hash_to_bytes(tree_hash)]
- )
- )
- if len(missing_tree):
- missing_directories.add(hashutil.hash_to_bytes(tree_hash))
- p = int(i * 100 / num_commits_total)
- if p != last_p:
- logger.debug(
- "Searching for packed trees which are missing from the archive: "
- f"{p}% of commits processed."
- )
- last_p = p
+ if len(missing_tree):
+ missing_directories.add(hashutil.hash_to_bytes(tree_hash))
self.log.debug(
f"Number of packed trees found missing: {len(missing_directories)}"
)
# All blobs are present. Any of these blobs might have been added
# to the archive via imports of other origins. We must query the
# archive for missing trees specific to the origin we are importing.
- for root_hash, tree_deps in iter_tree_deps(root_tree_vertices):
- tree_hashes.add(root_hash)
- try:
- dep_hashes = get_recursive_tree_dep_hashes(
- root_hash, obj_type=GitObjectType.TREE
- )
- tree_hashes.add(dep_hashes)
- except KeyError:
- continue
- missing_directories = set(self.storage.directory_missing(list(tree_hashes)))
+ tree_hashes: List[bytes] = list()
+ for obj_pos in self._tree_types.search(1):
+ tree_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos))
+ tree_hash = hashutil.bytehex_to_hash(tree_hex)
+ tree_hashes.append(tree_hash)
+ missing_directories = set(self.storage.directory_missing(tree_hashes))
self.log.debug(
"Number of missing trees according to archive query: "
f"{len(missing_directories)}"