commit a77c88011f70493a154286133edefe6cf85fec7e
from: Stefan Sperling
date: Thu Oct 3 09:38:16 2024 UTC

avoid multiple lookups of the same ID in the pack index

commit - 6c6f6338fe67e530e3ed5171fc67ad4754545d37
commit + a77c88011f70493a154286133edefe6cf85fec7e
blob - 4214f5df61cd0b898384d1262b1aa87f0586f532
blob + 2327196ec86d098b8aa391e9660d795e0cde887b
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -742,9 +742,14 @@ class GitLoader(BaseGitLoader):
             bitstr.set(True, get_pos_in_index(id_hex))
 
         def have_traversed(bitstr, id_hex):
+            """
+            Return True or False to indicate whether a given object has
+            been traversed (i.e. already has its vertex added to the graph).
+            Return None if the object is not present in the pack file.
+            """
             n = get_pos_in_index(id_hex)
             if n is None:
-                return False
+                return None
             return bitstr[n]
 
         # Find all tags, commits and corresponding tree roots in the pack file
@@ -805,7 +810,8 @@ class GitLoader(BaseGitLoader):
         while len(parents) > 0:
             commit_hex = parents.pop(0)
             commit_hash = hashutil.bytehex_to_hash(commit_hex)
-            if have_traversed(traversed_objects, commit_hex):
+            t = have_traversed(traversed_objects, commit_hex)
+            if t is None or t is True:
                 continue
             mark_as_traversed(traversed_objects, commit_hex)
             try:
@@ -819,7 +825,8 @@ class GitLoader(BaseGitLoader):
             tree_hash = hashutil.bytehex_to_hash(commit.tree)
             commits[commit_hash] = tree_hash
             for parent_hex in commit.parents:
-                if have_traversed(traversed_objects, parent_hex):
+                t = have_traversed(traversed_objects, parent_hex)
+                if t is None or t is True:
                     continue
                 if parent_hex in parents:
                     continue
@@ -872,7 +879,8 @@ class GitLoader(BaseGitLoader):
         while len(subtrees) > 0:
             tree_hash = subtrees.pop(0)
             tree_hex = hashutil.hash_to_bytehex(tree_hash)
-            if have_traversed(traversed_objects, tree_hex):
+            t = have_traversed(traversed_objects, tree_hex)
+            if t is None or t is True:
                 continue
             mark_as_traversed(traversed_objects, tree_hex)
             try:
@@ -882,17 +890,18 @@ class GitLoader(BaseGitLoader):
             for name, mode, entry_hex in tree.iteritems():
                 if mode & submodule_mode == submodule_mode:
                     continue  # ignore submodules
-                if entry_hex not in self.pack:
-                    continue
+                t = have_traversed(traversed_objects, entry_hex)
+                if t is None:
+                    continue  # not present in pack file
                 entry_hash = hashutil.bytehex_to_hash(entry_hex)
-                if mode & stat.S_IFDIR:
-                    if not have_traversed(traversed_objects, entry_hex):
+                new_edges.append((tree_hash, entry_hash))
+                if t is False:
+                    if mode & stat.S_IFDIR:
                         new_trees.append(entry_hash)
                         subtrees.append(entry_hash)
-                elif not have_traversed(traversed_objects, entry_hex):
-                    new_blobs.append(entry_hash)
-                    mark_as_traversed(traversed_objects, entry_hex)
-                new_edges.append((tree_hash, entry_hash))
+                    else:
+                        new_blobs.append(entry_hash)
+                        mark_as_traversed(traversed_objects, entry_hex)
 
             # add new vertices and edges in batches for performance reasons
             if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 1000000: