commit 607a92048c2b18c77d6930d181b8326ec9158074 from: Stefan Sperling date: Mon Oct 21 10:49:31 2024 UTC progress reporting improvements commit - b1e497a6d0bae8ab541b45d1ffcd4b58be2c5b25 commit + 607a92048c2b18c77d6930d181b8326ec9158074 blob - e78ebf760ea71e81d5e57ff1b70f68d2b3d7b405 blob + e4f2f184e4687f11df9528fd46539d7873359b12 --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -920,7 +920,9 @@ class GitLoader(BaseGitLoader): # Show some debug progress output for very large datasets p = int(i * 100 / num_commits_total) if p != last_p: - logger.debug(f"Adding trees and blobs: {p}% of commits processed...") + logger.debug( + f"Adding trees and blobs to the graph: {p}% of commits processed..." + ) last_p = p i = i + 1 tree_hex = hashutil.hash_to_bytehex(root_tree_hash) @@ -996,6 +998,8 @@ class GitLoader(BaseGitLoader): logger.debug("No objects to load") return + logger.debug("Loading objects in topological order...") + if self.save_data_path: self.save_data() @@ -1086,7 +1090,15 @@ class GitLoader(BaseGitLoader): object_type=GitObjectType.TREE ) blob_hashes: Set[bytes] = set() + logger.debug("Loading blob object IDs...") + i = 0 + last_p = -1 for root_hash, tree_deps in iter_tree_deps(root_tree_vertices): + p = int(i * 100 / len(root_tree_vertices)) + if p != last_p: + logger.debug(f"Loading blob object IDs: {p}% of commits processed...") + last_p = p + i = i + 1 try: dep_hashes = get_recursive_tree_deps( tree_deps, obj_type=GitObjectType.BLOB @@ -1095,6 +1107,10 @@ class GitLoader(BaseGitLoader): except KeyError: continue + p = int(i * 100 / len(root_tree_vertices)) + if p != last_p: + logger.debug(f"Loading blob object IDs: {p}% of commits processed.") + # Find out which blobs are missing from the archive. missing_contents = set( self.storage.content_missing_per_sha1_git(list(blob_hashes))