commit 4623907a1182b8baea3f03b1c0217ba6b2df2dda from: Stefan Sperling date: Thu Oct 03 14:19:33 2024 UTC WIP attempt to track vertex IDs with ints to save memory this does not seem to work as expected, igraph' internal vertex IDs seems to change as more node get added (index 1 was a tree, checking back later it's a commit...) commit - a77c88011f70493a154286133edefe6cf85fec7e commit + 4623907a1182b8baea3f03b1c0217ba6b2df2dda blob - 2327196ec86d098b8aa391e9660d795e0cde887b blob + ff402a66bfa97770b352653809ff7cc47e21724b --- swh/loader/git/loader.py +++ swh/loader/git/loader.py @@ -53,6 +53,7 @@ from swh.model.model import ( RawExtrinsicMetadata, Release, Revision, + Sha1Git, SkippedContent, Snapshot, SnapshotBranch, @@ -261,6 +262,10 @@ class GitLoader(BaseGitLoader): self.urllib3_extra_kwargs["cert_reqs"] = "CERT_NONE" self.requests_extra_kwargs["verify"] = False self.objects_seen: Set[bytes] = set() + + self.vertex_ids: Dict[int, int] = {} + self.packidx_slots: Dict[int, int] = {} + self.vertex_cur = 0 def fetch_pack_from_origin( self, @@ -716,30 +721,46 @@ class GitLoader(BaseGitLoader): self.make_object_graph() return False + + def idx_to_hex_id(self, idx): + logger.debug(f"looking up object at index {idx} of {self.num_objects}") + return hashutil.hash_to_bytehex(self.pack.index._unpack_name(idx)) + + def hex_id_to_idx(self, id_hex): + """ + Return the pack index table entry number for a given object ID. + + Based on dulwich.pack.FilePackIndex._object_offset() + XXX: This reaches into Dulwich internals; need an API? + """ + sha = hex_to_sha(id_hex) + idx = ord(sha[:1]) + if idx == 0: + start = 0 + else: + start = self.pack.index._fan_out_table[idx - 1] + end = self.pack.index._fan_out_table[idx] + return bisect_find_sha(start, end, sha, self.pack.index._unpack_name) + def next_vertex_id(self, hex_id): + """Map pack file index slots to consecutive igrap vertex IDs + and vice versa...""" + slot = self.hex_id_to_idx(hex_id) + self.vertex_ids[slot] = self.vertex_cur + self.packidx_slots[self.vertex_cur] = slot + self.vertex_cur = self.vertex_cur + 1 + return self.vertex_cur - 1 + + def vertex_to_packidx_slot(self, vertex_id): + return self.packidx_slots[vertex_id] + def make_object_graph(self): logger.info("Building packed objects graph") self._object_graph = Graph(directed=True) - def get_pos_in_index(id_hex): - """ - Return the pack index table entry number for a given object ID. - - Based on dulwich.pack.FilePackIndex._object_offset() - XXX: This reaches into Dulwich internals; need an API? - """ - sha = hex_to_sha(id_hex) - idx = ord(sha[:1]) - if idx == 0: - start = 0 - else: - start = self.pack.index._fan_out_table[idx - 1] - end = self.pack.index._fan_out_table[idx] - return bisect_find_sha(start, end, sha, self.pack.index._unpack_name) - def mark_as_traversed(bitstr, id_hex): - bitstr.set(True, get_pos_in_index(id_hex)) + bitstr.set(True, self.hex_id_to_idx(id_hex)) def have_traversed(bitstr, id_hex): """ @@ -747,7 +768,7 @@ class GitLoader(BaseGitLoader): been traversed (i.e. already has its vertex added to the graph). Return None if the object is not present in the pack file. """ - n = get_pos_in_index(id_hex) + n = self.hex_id_to_idx(id_hex) if n is None: return None return bitstr[n] @@ -770,7 +791,6 @@ class GitLoader(BaseGitLoader): while obj.type_name == b"tag": if obj.id in self.ref_object_types: self.ref_object_types[obj.id] = SnapshotTargetType.RELEASE - tag_hash = hashutil.bytehex_to_hash(ref_object_hex) tagged_object_hex = obj.object[1] try: tagged_obj = self.pack[tagged_object_hex] @@ -778,8 +798,7 @@ class GitLoader(BaseGitLoader): obj = None break else: - tagged_hash = hashutil.bytehex_to_hash(tagged_object_hex) - tags[tag_hash] = tagged_hash + tags[ref_object_hex] = tagged_object_hex obj = tagged_obj ref_object_hex = tagged_object_hex @@ -789,10 +808,10 @@ class GitLoader(BaseGitLoader): # Handle refs or tags pointing at blobs or trees... if obj.type_name == b"blob": - new_blobs.append(hashutil.bytehex_to_hash(ref_object_hex)) + new_blobs.append(self.next_vertex_id(ref_object_hex)) continue if obj.type_name == b"tree": - new_trees.append(hashutil.bytehex_to_hash(ref_object_hex)) + new_trees.append(self.next_vertex_id(ref_object_hex)) continue # From here on out, we handle commits only @@ -809,7 +828,6 @@ class GitLoader(BaseGitLoader): i = 0 while len(parents) > 0: commit_hex = parents.pop(0) - commit_hash = hashutil.bytehex_to_hash(commit_hex) t = have_traversed(traversed_objects, commit_hex) if t is None or t is True: continue @@ -822,19 +840,20 @@ class GitLoader(BaseGitLoader): if i > 0 and i % 100000 == 0: logger.debug(f"{ref_name}: {i} commits processed...") i = i + 1 - tree_hash = hashutil.bytehex_to_hash(commit.tree) - commits[commit_hash] = tree_hash + commits[commit_hex] = commit.tree for parent_hex in commit.parents: t = have_traversed(traversed_objects, parent_hex) if t is None or t is True: continue if parent_hex in parents: continue - if parent_hex not in self.pack: - continue parents.append(parent_hex) - parent_hash = hashutil.bytehex_to_hash(parent_hex) - commit_edges.append((commit_hash, parent_hash)) + commit_edges.append( + ( + self.vertex_ids[self.hex_id_to_idx(commit_hex)], + self.vertex_ids[self.hex_id_to_idx(parent_hex)], + ) + ) logger.debug(f"Found {len(tags)} tags and {len(commits)} commits") @@ -844,6 +863,7 @@ class GitLoader(BaseGitLoader): def add_vertices(new_vertices, object_type): attributes = dict() attributes["object_type"] = [object_type for x in new_vertices] + logger.debug(f"adding vertices {new_vertices} of type {object_type}") self._object_graph.add_vertices(new_vertices, attributes=attributes) # Add vertices for any directly referenced trees and blobs. @@ -856,11 +876,23 @@ class GitLoader(BaseGitLoader): new_blobs = [] # Add tags, commits and root trees to the graph - add_vertices(list(tags.keys()), GitObjectType.TAG) - add_vertices(list(commits.keys()), GitObjectType.COMMIT) - add_vertices(list(commits.values()), GitObjectType.TREE) - self._object_graph.add_edges(zip(tags.keys(), tags.values())) - self._object_graph.add_edges(zip(commits.keys(), commits.values())) + tag_vertices = list(map(self.next_vertex_id, tags.keys())) + add_vertices(tag_vertices, GitObjectType.TAG) + commit_vertices = list(map(self.next_vertex_id, commits.keys())) + add_vertices(commit_vertices, GitObjectType.COMMIT) + tree_vertices = list(map(self.next_vertex_id, commits.values())) + add_vertices(commit_vertices, GitObjectType.COMMIT) + add_vertices(tree_vertices, GitObjectType.TREE) + tagged_vertices = list(map(self.next_vertex_id, tags.values())) + logger.debug( + f"adding tag-tagged edges: {list(zip(tag_vertices, tagged_vertices))}" + ) + self._object_graph.add_edges(list(zip(tag_vertices, tagged_vertices))) + logger.debug( + f"adding commit-tree edges: {list(zip(commit_vertices, tree_vertices))}" + ) + self._object_graph.add_edges(list(zip(commit_vertices, tree_vertices))) + logger.debug(f"adding commit-commit edges: {commit_edges}") self._object_graph.add_edges(commit_edges) # Populate the graph with trees and blobs @@ -870,38 +902,38 @@ class GitLoader(BaseGitLoader): num_objects_found = 0 num_tree_entries = 0 logger.info("Adding trees and blobs to the graph...") - for commit_hash, tree_hash in commits.items(): + for commit_hex, tree_hex in commits.items(): # Show some debug progress output for very large datasets if i > 0 and i % 10000 == 0: logger.debug(f"Adding trees and blobs: {i} commits processed...") i = i + 1 - subtrees = [tree_hash] + subtrees = [tree_hex] while len(subtrees) > 0: - tree_hash = subtrees.pop(0) - tree_hex = hashutil.hash_to_bytehex(tree_hash) + tree_hex = subtrees.pop(0) t = have_traversed(traversed_objects, tree_hex) if t is None or t is True: continue mark_as_traversed(traversed_objects, tree_hex) - try: - tree = self.pack[tree_hex] - except KeyError: - continue + tree = self.pack[tree_hex] for name, mode, entry_hex in tree.iteritems(): if mode & submodule_mode == submodule_mode: continue # ignore submodules t = have_traversed(traversed_objects, entry_hex) if t is None: continue # not present in pack file - entry_hash = hashutil.bytehex_to_hash(entry_hex) - new_edges.append((tree_hash, entry_hash)) if t is False: if mode & stat.S_IFDIR: - new_trees.append(entry_hash) - subtrees.append(entry_hash) + new_trees.append(self.next_vertex_id(entry_hex)) + subtrees.append(entry_hex) else: - new_blobs.append(entry_hash) + new_blobs.append(self.next_vertex_id(entry_hex)) mark_as_traversed(traversed_objects, entry_hex) + new_edges.append( + ( + self.vertex_ids[self.hex_id_to_idx(tree_hex)], + self.vertex_ids[self.hex_id_to_idx(entry_hex)], + ) + ) # add new vertices and edges in batches for performance reasons if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 1000000: @@ -915,6 +947,7 @@ class GitLoader(BaseGitLoader): f"Adding {len(new_edges)} tree entries to the graph " f"({num_tree_entries} entries total)" ) + logger.debug(f"adding tree-entry edges: {new_edges}") self._object_graph.add_edges(new_edges) num_objects_found += len(new_trees) + len(new_blobs) logger.debug( @@ -936,6 +969,7 @@ class GitLoader(BaseGitLoader): f"Adding {len(new_edges)} tree entries to the graph " f"({num_tree_entries} entries total)" ) + logger.debug(f"adding tree-entry edges: {new_edges}") self._object_graph.add_edges(new_edges) logger.info( @@ -1015,42 +1049,66 @@ class GitLoader(BaseGitLoader): if s not in vertices: vertices.append(s) yield ( - self._object_graph.vs[s]["name"], + hashutil.bytehex_to_hash( + self.idx_to_hex_id(self.packidx_slots[s]) + ), self._object_graph.vs[s]["object_type"], ) def get_neighbors(v, object_type=None): for s in self._object_graph.neighbors(v): + logger.debug(f"neighbor {s}") if object_type is not None: if self._object_graph.vs[s]["object_type"] != object_type: + logger.debug(f"object type expected: {object_type}") + logger.debug( + f"object type actual: {self._object_graph.vs[s]['object_type']}" + ) continue yield ( - self._object_graph.vs[s]["name"], + hashutil.bytehex_to_hash(self.idx_to_hex_id(self.packidx_slots[s])), self._object_graph.vs[s]["object_type"], ) + def map_idx_to_hex_id(idx): + logger.debug(f"asking for hex ID of idx {idx}") + return self.idx_to_hex_id(idx) + + def map_vertex_to_packidx_slot(vertex): + return self.vertex_to_packidx_slot(vertex) + + blob_hex_ids: List[Sha1Git] = [] try: blob_vertices = self._object_graph.vs.select(object_type=GitObjectType.BLOB) + blob_packidx_slots = list( + map(map_vertex_to_packidx_slot, blob_vertices["name"]) + ) + blob_hex_ids = list(map(map_idx_to_hex_id, blob_packidx_slots)) except KeyError: missing_contents = set() else: missing_contents = set( - self.storage.content_missing_per_sha1_git(blob_vertices["name"]) + self.storage.content_missing_per_sha1_git(blob_hex_ids) ) + logger.debug(f"missing contents: {missing_contents}") + tree_hex_ids: List[Sha1Git] = [] try: tree_vertices = self._object_graph.vs.select(object_type=GitObjectType.TREE) + tree_packidx_slots = list( + map(map_vertex_to_packidx_slot, tree_vertices["name"]) + ) + tree_hex_ids = list(map(map_idx_to_hex_id, tree_packidx_slots)) except KeyError: - tree_vertices = set() + tree_hex_ids = list() self.log.debug( "Number of packed blobs that are missing in storage: " - f"{len(missing_contents)} of {len(blob_vertices['name'])} packed blobs total" + f"{len(missing_contents)} of {len(blob_hex_ids)} packed blobs total" ) - tree_hashes: List[bytes] = [] - if len(missing_contents) == len(blob_vertices["name"]): + if len(missing_contents) == len(blob_hex_ids): # If all blobs are missing then all trees are missing, too. - tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices] + tree_hashes = [hashutil.bytehex_to_hash(t) for t in tree_hex_ids] missing_directories = set(tree_hashes) self.log.debug( "Number of packed trees considered missing by implication: " @@ -1063,8 +1121,8 @@ class GitLoader(BaseGitLoader): "depend on missing packed blobs" ) nsearched = 0 - for t in tree_vertices: - tree_hash = t["name"] + for tree_hex in tree_hex_ids: + tree_hash = hashutil.bytehex_to_hash(tree_hex) nsearched += 1 if (nsearched % 5000) == 0: self.log.debug( @@ -1074,7 +1132,8 @@ class GitLoader(BaseGitLoader): if tree_hash in missing_directories: continue have_dep = False - for dep_hash, dep_type in get_successors(t): + v = self.vertex_ids[self.hex_id_to_idx(tree_hex)] + for dep_hash, dep_type in get_successors(v): have_dep = True if dep_type == GitObjectType.BLOB and dep_hash in missing_contents: # We can infer that the tree is also missing. @@ -1082,10 +1141,8 @@ class GitLoader(BaseGitLoader): break if not have_dep: # An empty tree has no dependencies. Determine if it is missing. - tree_hashes = [hashutil.hash_to_bytes(tree_hash)] - missing_empty_tree = set( - self.storage.directory_missing(tree_hashes) - ) + tree_ids = [tree_hex] + missing_empty_tree = set(self.storage.directory_missing(tree_ids)) if len(missing_empty_tree): missing_directories.add(tree_hash) self.log.debug( @@ -1097,27 +1154,33 @@ class GitLoader(BaseGitLoader): f"{len(missing_directories)}" ) else: - tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices] + tree_hashes = [hashutil.bytehex_to_hash(t) for t in tree_hex_ids] missing_directories = set(self.storage.directory_missing(tree_hashes)) self.log.debug( "Number of missing trees according to archive query: " f"{len(missing_directories)}" ) + commit_hex_ids: List[Sha1Git] = [] try: commit_vertices = self._object_graph.vs.select( object_type=GitObjectType.COMMIT ) + commit_packidx_slots = list( + map(map_vertex_to_packidx_slot, commit_vertices["name"]) + ) + commit_hex_ids = list(map(map_idx_to_hex_id, commit_packidx_slots)) except KeyError: - commit_vertices = set() + commit_hex_ids = list() + missing_revisions = set() if len(missing_contents) > 0 or len(missing_directories) > 0: self.log.debug( "Searching for packed commits which depend on missing packed blobs or trees" ) nsearched = 0 - for c in commit_vertices: - commit_hash = c["name"] + for commit_hex in commit_hex_ids: + commit_hash = hashutil.bytehex_to_hash(commit_hex) nsearched += 1 if (nsearched % 5000) == 0: self.log.debug( @@ -1126,7 +1189,10 @@ class GitLoader(BaseGitLoader): ) if commit_hash in missing_revisions: continue - for dep_hash, dep_type in get_neighbors(c, GitObjectType.TREE): + v = self.vertex_ids[self.hex_id_to_idx(commit_hex)] + logger.debug(f"checking commit vertex {v}") + for dep_hash, dep_type in get_neighbors(v, GitObjectType.TREE): + logger.debug(f"dep_hash: {dep_hash}") if dep_hash in missing_contents or dep_hash in missing_directories: # We can infer that the commit is also missing. missing_revisions.add(commit_hash) @@ -1140,11 +1206,11 @@ class GitLoader(BaseGitLoader): f"{len(missing_revisions)}" ) else: - commit_hashes = [c["name"] for c in commit_vertices] + commit_hashes = [hashutil.bytehex_to_hash(c) for c in commit_hex_ids] missing_revisions = set(self.storage.revision_missing(commit_hashes)) for blob_hash in missing_contents: - obj = self.pack[hashutil.hash_to_bytehex(blob_hash)] + obj = self.pack[hashutil.hash_to_bytes(blob_hash)] if obj.id in self.ref_object_types: self.ref_object_types[obj.id] = SnapshotTargetType.CONTENT content = converters.dulwich_blob_to_content(