commit 310666ca1b675f5b9ea18bd71bbda18b9f0da97d from: Stefan Sperling date: Fri Oct 04 14:26:57 2024 UTC improve tree crawl efficiency and tweak progress display commit - 2d1a69eeca244bc64d106fff5b1a4e25fa1e1728 commit + 310666ca1b675f5b9ea18bd71bbda18b9f0da97d blob - d19645d3dafd42ba011a0f16fa7dfb189c115c9e blob + dfc9bd15c4a5c3dba256baa055aadfa2b051ffdf --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -908,15 +908,28 @@ class GitLoader(BaseGitLoader): new_edges = [] submodule_mode = stat.S_IFDIR | stat.S_IFLNK i = 0 + last_p = -1 num_objects_found = 0 num_tree_entries = 0 + num_commits_total = len(commits.items()) logger.info("Adding trees and blobs to the graph...") - for commit_hash, tree_hash in commits.items(): - if tree_hash in self._tree_graphs.keys(): - # Keep recently used trees cached in memory. - self._tree_graphs.move_to_end(tree_hash, last=False) + for commit_hash, root_tree_hash in commits.items(): + # Show some debug progress output for very large datasets + p = int(i * 100 / num_commits_total) + if p != last_p: + logger.debug(f"Adding trees and blobs: {p}% of commits processed...") + last_p = p + i = i + 1 + tree_hex = hashutil.hash_to_bytehex(root_tree_hash) + t = have_traversed(traversed_objects, tree_hex) + if t is None: continue - if tree_hash in self._swapped_graphs.keys(): + if t is True: + try: + # Keep graphs for recently used trees cached in memory. + self._tree_graphs.move_to_end(root_tree_hash, last=False) + except KeyError: + pass continue if len(self._tree_graphs) >= self._max_trees_in_mem: (other_tree_hash, other_tree_graph) = self._tree_graphs.popitem( @@ -926,14 +939,10 @@ class GitLoader(BaseGitLoader): self._swapped_graphs[other_tree_hash] = swapper swapper.swap_out() tree_graph = Graph(directed=True) - self._tree_graphs[tree_hash] = tree_graph - self._tree_graphs.move_to_end(tree_hash, last=False) - # Show some debug progress output for very large datasets - if i > 0 and i % 10000 == 0: - logger.debug(f"Adding trees and blobs: {i} commits processed...") - i = i + 1 - new_trees.append(tree_hash) - subtrees = [tree_hash] + self._tree_graphs[root_tree_hash] = tree_graph + self._tree_graphs.move_to_end(root_tree_hash, last=False) + new_trees.append(root_tree_hash) + subtrees = [root_tree_hash] while len(subtrees) > 0: tree_hash = subtrees.pop(0) tree_hex = hashutil.hash_to_bytehex(tree_hash)