commit d30fbeafa2087c29c558a7e1f539f6aae480a2c2
from: Stefan Sperling
date: Wed Sep 25 12:46:42 2024 UTC

show debug progress output while searching blobs and trees

commit - d90389e678229d944617785754abf30f2f31ce80
commit + d30fbeafa2087c29c558a7e1f539f6aae480a2c2
blob - f26d6c96a73198205002c075fbced564d61970bd
blob + a66297ed71f87b2d38f96647b63a6c473c0534da
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -837,7 +837,12 @@ class GitLoader(BaseGitLoader):
         traversed_trees = set()
         seen_blobs = set()
         submodule_mode = stat.S_IFDIR | stat.S_IFLNK
+        i = 0
         for commit_hash, tree_hash in commits.items():
+            # Show some debug progress output for very large datasets
+            if i > 0 and i % 10000 == 0:
+                logger.debug(f"Finding trees and blobs: {i} commits processed...")
+            i = i + 1
             subtrees = [tree_hash]
             while len(subtrees) > 0:
                 tree_hash = subtrees.pop(0)
@@ -864,6 +869,10 @@ class GitLoader(BaseGitLoader):
 
             # add new vertices and edges in batches for performance reasons
             if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 100000:
+                logger.debug(
+                    f"Found {len(new_trees)} new trees and {len(new_blobs)} new blobs "
+                    f"and {len(new_edges)} new tree entries"
+                )
                 if len(new_trees) > 0:
                     add_vertices(new_trees, GitObjectType.TREE)
                 if len(new_blobs) > 0: