commit e3840f7501a7dbea6e169bd9c0d0fa8a49476af0
from: Stefan Sperling
date: Wed Sep 25 10:25:27 2024 UTC

speed up crawling of trees

commit - 4ff67682be16c48c098e0412aee07cabd74a1f27
commit + e3840f7501a7dbea6e169bd9c0d0fa8a49476af0
blob - ab9b86ab3ca21b10c0676ff6d55f269df436634e
blob + bd5c4b65d0adb1db728a1c1dc33c23318d91840e
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -811,6 +811,7 @@ class GitLoader(BaseGitLoader):
         new_blobs = []
         new_edges = []
         traversed_trees = set()
+        seen_blobs = set()
         submodule_mode = stat.S_IFDIR | stat.S_IFLNK
         for commit_hash, tree_hash in commits.items():
             subtrees = [tree_hash]
@@ -829,16 +830,14 @@ class GitLoader(BaseGitLoader):
                         continue  # ignore submodules
                     entry_hash = hashutil.bytehex_to_hash(entry_hex)
                     if mode & stat.S_IFDIR:
-                        new_trees.append(entry_hash)
-                    else:
+                        if entry_hash not in traversed_trees:
+                            new_trees.append(entry_hash)
+                            subtrees.append(entry_hash)
+                    elif entry_hash not in seen_blobs:
                         new_blobs.append(entry_hash)
+                        seen_blobs.add(entry_hash)
                     new_edges.append((tree_hash, entry_hash))
-                    if mode & stat.S_IFDIR:
-                        try:
-                            tree = self.pack[entry_hex]
-                            subtrees.append(entry_hash)
-                        except KeyError:
-                            pass
+
                 # add new vertices and edges in batches for performance reasons
                 if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 100000:
                     if len(new_trees) > 0: