commit d9e70c37d9c1f727bf5c52714f64189717706d01 from: Stefan Sperling date: Mon Oct 21 13:42:01 2024 UTC cache bitarrays for recursive tree dependencies commit - 607a92048c2b18c77d6930d181b8326ec9158074 commit + d9e70c37d9c1f727bf5c52714f64189717706d01 blob - e4f2f184e4687f11df9528fd46539d7873359b12 blob + faa42ebc8315cb686415f0039a27d1119f5ab6ec --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -750,11 +750,20 @@ class GitLoader(BaseGitLoader): self._commit_graph = Graph(directed=True) # Bitarrays recording direct dependencies of tree objects. - # The bitarray are stored as bytes rather than bitarray objects, + # The bitarray is stored as bytes rather than bitarray objects, # compressed with sc_encode() to save space. self._tree_deps: collections.OrderedDict[bytes, bytes] = ( collections.OrderedDict() ) + + # Bitarrays recording recursive dependencies of tree objects. + # The bitarray is stored as bytes rather than bitarray objects, + # compressed with sc_encode() to save space. + self._recursive_tree_deps: collections.OrderedDict[bytes, bytes] = ( + collections.OrderedDict() + ) + + # Threshold for swapping bitarrays stored in above collections to a file. self._max_trees_in_mem = 10000 # Object type markers. @@ -768,6 +777,8 @@ class GitLoader(BaseGitLoader): # BitarraySwapper class which covers I/O sequences involving seek + read/write). self._swapfile = SpooledTemporaryFile(max_size=self.temp_file_cutoff) self._swapped_tree_deps: Dict[bytes, BitarraySwapper] = dict() + self._swapfile_recursive = SpooledTemporaryFile(max_size=self.temp_file_cutoff) + self._swapped_recursive_tree_deps: Dict[bytes, BitarraySwapper] = dict() def get_pos_in_index(id_hex): """ @@ -929,6 +940,7 @@ class GitLoader(BaseGitLoader): t = have_traversed(traversed_objects, tree_hex) if t is None or t is True: continue + self._tree_types[get_pos_in_index(tree_hex)] = 1 if len(self._tree_deps) >= self._max_trees_in_mem: (other_tree_hash, other_tree_deps) = self._tree_deps.popitem(last=False) swapper = BitarraySwapper(other_tree_deps, self._swapfile) @@ -964,6 +976,13 @@ class GitLoader(BaseGitLoader): self._blob_types[obj_pos] = 1 self._tree_deps[tree_hash] = sc_encode(tree_deps) self._tree_deps.move_to_end(tree_hash, last=False) + + p = int(i * 100 / num_commits_total) + if p != last_p: + logger.debug( + f"Adding trees and blobs to the graph: {p}% of commits processed." + ) + last_p = p logger.info("Packed objects graph has been built") @@ -1037,9 +1056,54 @@ class GitLoader(BaseGitLoader): swapper = self._swapped_tree_deps[tree_hash] tree_deps = sc_decode(swapper.swap_in()) return tree_deps + + def find_recursive_tree_deps(tree_hash, tree_deps): + recursive_deps = bitarray(self.num_objects) + deps = [tree_deps] + while len(deps) > 0: + tree_deps = deps.pop(0) + for obj_pos in tree_deps.search(1): + recursive_deps[obj_pos] = 1 + if self._tree_types[obj_pos] == 1: + id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos)) + id_hash = hashutil.bytehex_to_hash(id_hex) + sub_deps = get_tree_deps(id_hash) + deps.append(sub_deps) + return recursive_deps + + def get_recursive_tree_deps(tree_hash): + try: + recursive_tree_deps = sc_decode(self._recursive_tree_deps[tree_hash]) + # Keep bitarrays for recently used trees cached in memory. + self._recursive_tree_deps.move_to_end(tree_hash, last=False) + except KeyError: + try: + swapper = self._swapped_recursive_tree_deps[tree_hash] + recursive_tree_deps = sc_decode(swapper.swap_in()) + except KeyError: + # Neither in memory nor swapped out. Compute recursive deps the hard way. + if len(self._recursive_tree_deps) >= self._max_trees_in_mem: + # Out of room in memory, swap something out. + (other_tree_hash, other_tree_deps) = ( + self._recursive_tree_deps.popitem(last=False) + ) + swapper = BitarraySwapper( + other_tree_deps, self._swapfile_recursive + ) + self._swapped_recursive_tree_deps[other_tree_hash] = swapper + swapper.swap_out() + direct_tree_deps = get_tree_deps(tree_hash) + recursive_tree_deps = find_recursive_tree_deps( + tree_hash, direct_tree_deps + ) + self._recursive_tree_deps[tree_hash] = sc_encode( + recursive_tree_deps + ) + self._recursive_tree_deps.move_to_end(tree_hash, last=False) + return recursive_tree_deps def get_successors(tree_hash): - tree_deps = get_tree_deps(tree_hash) + tree_deps = get_recursive_tree_deps(tree_hash) for obj_pos in tree_deps.search(1): dep_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos)) dep_hash = hashutil.bytehex_to_hash(dep_hex) @@ -1048,7 +1112,10 @@ class GitLoader(BaseGitLoader): elif self._tree_types[obj_pos] == 1: obj_type = GitObjectType.TREE else: - raise TypeError(f"Unexpected object type: {obj_type}") + raise TypeError( + "Unknown object type for dependency " + f"{hashutil.hash_to_bytehex(dep_hash)}" + ) yield (dep_hash, obj_type) def get_neighbors(graph, v, object_type=None): @@ -1064,26 +1131,21 @@ class GitLoader(BaseGitLoader): tree_deps = get_tree_deps(tree_hash) yield (tree_hash, tree_deps) - def get_recursive_tree_deps(tree_deps, obj_type=None): - deps = [tree_deps] - while len(deps) > 0: - tree_deps = deps.pop(0) - for obj_pos in tree_deps.search(1): - id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos)) - id_hash = hashutil.bytehex_to_hash(id_hex) - if obj_type is None: - yield id_hash - elif obj_type == GitObjectType.BLOB: - if self._blob_types[obj_pos] == 1: - yield id_hash - elif obj_type == GitObjectType.TREE: - if self._tree_types[obj_pos] == 1: - yield id_hash - else: - raise TypeError(f"Unexpected object type: {obj_type}") + def get_recursive_tree_dep_hashes(tree_hash, obj_type=None): + recursive_deps = get_recursive_tree_deps(tree_hash) + for obj_pos in recursive_deps.search(1): + id_hex = sha_to_hex(self.pack.index._unpack_name(obj_pos)) + id_hash = hashutil.bytehex_to_hash(id_hex) + if obj_type is None: + yield id_hash + elif obj_type == GitObjectType.BLOB: + if self._blob_types[obj_pos] == 1: + yield id_hash + elif obj_type == GitObjectType.TREE: if self._tree_types[obj_pos] == 1: - sub_deps = get_tree_deps(id_hash) - deps.append(sub_deps) + yield id_hash + else: + raise TypeError(f"Unexpected object type: {obj_type}") # Load the set of all blob IDs found in the pack file. root_tree_vertices = self._commit_graph.vs.select( @@ -1100,8 +1162,8 @@ class GitLoader(BaseGitLoader): last_p = p i = i + 1 try: - dep_hashes = get_recursive_tree_deps( - tree_deps, obj_type=GitObjectType.BLOB + dep_hashes = get_recursive_tree_dep_hashes( + root_hash, obj_type=GitObjectType.BLOB ) blob_hashes.update(dep_hashes) except KeyError: @@ -1127,8 +1189,8 @@ class GitLoader(BaseGitLoader): for root_hash, tree_deps in iter_tree_deps(root_tree_vertices): missing_directories.add(hashutil.hash_to_bytes(root_hash)) try: - dep_hashes = get_recursive_tree_deps( - tree_deps, obj_type=GitObjectType.TREE + dep_hashes = get_recursive_tree_dep_hashes( + root_hash, obj_type=GitObjectType.TREE ) missing_directories.update(map(hashutil.hash_to_bytes, dep_hashes)) except KeyError: @@ -1182,8 +1244,8 @@ class GitLoader(BaseGitLoader): for root_hash, tree_deps in iter_tree_deps(root_tree_vertices): tree_hashes.add(root_hash) try: - dep_hashes = get_recursive_tree_deps( - tree_deps, obj_type=GitObjectType.TREE + dep_hashes = get_recursive_tree_dep_hashes( + root_hash, obj_type=GitObjectType.TREE ) tree_hashes.add(dep_hashes) except KeyError: