commit c683aeaacfbbc2bac3b2e42f576a38d94dca218a from: Stefan Sperling date: Wed Sep 25 12:32:17 2024 UTC prevent history walks from taking too long with many merge commits commit - 5a2803734b8c024d4ab4db8c4f7527547539775f commit + c683aeaacfbbc2bac3b2e42f576a38d94dca218a blob - 0dac21502f69d6cd4b68005d9403f4bea472b1b5 blob + 8ae4a5d93145b969b9664e6f0476392c7f159c0d --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -765,25 +765,33 @@ class GitLoader(BaseGitLoader): self.ref_object_types[obj.id] = SnapshotTargetType.REVISION parents = [ref_object_hex] + i = 0 while len(parents) > 0: commit_hex = parents.pop(0) + commit_hash = hashutil.bytehex_to_hash(commit_hex) + if commit_hash in commits.keys(): + continue try: commit = self.pack[commit_hex] except KeyError: continue - commit_hash = hashutil.bytehex_to_hash(commit_hex) - if commit_hash in commits.keys(): - continue + # Show some debug progress output for very large datasets + if i > 0 and i % 100000 == 0: + logger.debug(f"{ref_name}: {i} commits processed...") + i = i + 1 tree_hash = hashutil.bytehex_to_hash(commit.tree) commits[commit_hash] = tree_hash for parent_hex in commit.parents: - if parent_hex in self.pack: - if parent_hex not in parents: - parents.append(parent_hex) - parent_hash = hashutil.bytehex_to_hash(parent_hex) - if (commit_hash, parent_hash) not in commit_edges: - commit_edges.append((commit_hash, parent_hash)) + if parent_hex in parents: + continue + parent_hash = hashutil.bytehex_to_hash(parent_hex) + if parent_hash in commits.keys(): + continue + parents.append(parent_hex) + commit_edges.append((commit_hash, parent_hash)) + logger.debug(f"Found {len(tags)} tags and {len(commits)} commits") + archived_missing_objects = set( self.storage.object_find_by_sha1_git(missing_objects).keys() )