commit - a77c88011f70493a154286133edefe6cf85fec7e
commit + 4623907a1182b8baea3f03b1c0217ba6b2df2dda
blob - 2327196ec86d098b8aa391e9660d795e0cde887b
blob + ff402a66bfa97770b352653809ff7cc47e21724b
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
RawExtrinsicMetadata,
Release,
Revision,
+ Sha1Git,
SkippedContent,
Snapshot,
SnapshotBranch,
self.urllib3_extra_kwargs["cert_reqs"] = "CERT_NONE"
self.requests_extra_kwargs["verify"] = False
self.objects_seen: Set[bytes] = set()
+
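+ # Bidirectional mapping between pack index slots and igraph vertex IDs;
+ # vertex_cur is the next vertex ID to be assigned.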
+ self.vertex_ids: Dict[int, int] = {}
+ self.packidx_slots: Dict[int, int] = {}
+ self.vertex_cur = 0
def fetch_pack_from_origin(
self,
self.make_object_graph()
return False
+
+ def idx_to_hex_id(self, idx):
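+ """Return the hex object ID stored at entry ``idx`` of the pack index."""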
+ logger.debug(f"looking up object at index {idx} of {self.num_objects}")
+ return hashutil.hash_to_bytehex(self.pack.index._unpack_name(idx))
+
+ def hex_id_to_idx(self, id_hex):
+ """
+ Return the pack index table entry number for a given object ID.
+
+ Based on dulwich.pack.FilePackIndex._object_offset()
+ XXX: This reaches into Dulwich internals; need an API?
+ """
+ sha = hex_to_sha(id_hex)
+ idx = ord(sha[:1])
+ if idx == 0:
+ start = 0
+ else:
+ start = self.pack.index._fan_out_table[idx - 1]
+ end = self.pack.index._fan_out_table[idx]
+ return bisect_find_sha(start, end, sha, self.pack.index._unpack_name)
+
+ def next_vertex_id(self, hex_id):
+ """Map pack file index slots to consecutive igraph vertex IDs
+ and vice versa. If the object already has a vertex ID, return
+ that ID instead of assigning a new one."""
+ slot = self.hex_id_to_idx(hex_id)
+ if slot in self.vertex_ids:
+ return self.vertex_ids[slot]
+ vertex_id = self.vertex_cur
+ self.vertex_ids[slot] = vertex_id
+ self.packidx_slots[vertex_id] = slot
+ self.vertex_cur = vertex_id + 1
+ return vertex_id
+
+ def vertex_to_packidx_slot(self, vertex_id):
+ return self.packidx_slots[vertex_id]
+
def make_object_graph(self):
logger.info("Building packed objects graph")
self._object_graph = Graph(directed=True)
- def get_pos_in_index(id_hex):
- """
- Return the pack index table entry number for a given object ID.
-
- Based on dulwich.pack.FilePackIndex._object_offset()
- XXX: This reaches into Dulwich internals; need an API?
- """
- sha = hex_to_sha(id_hex)
- idx = ord(sha[:1])
- if idx == 0:
- start = 0
- else:
- start = self.pack.index._fan_out_table[idx - 1]
- end = self.pack.index._fan_out_table[idx]
- return bisect_find_sha(start, end, sha, self.pack.index._unpack_name)
-
def mark_as_traversed(bitstr, id_hex):
- bitstr.set(True, get_pos_in_index(id_hex))
+ bitstr.set(True, self.hex_id_to_idx(id_hex))
def have_traversed(bitstr, id_hex):
"""
been traversed (i.e. already has its vertex added to the graph).
Return None if the object is not present in the pack file.
"""
- n = get_pos_in_index(id_hex)
+ n = self.hex_id_to_idx(id_hex)
if n is None:
return None
return bitstr[n]
while obj.type_name == b"tag":
if obj.id in self.ref_object_types:
self.ref_object_types[obj.id] = SnapshotTargetType.RELEASE
- tag_hash = hashutil.bytehex_to_hash(ref_object_hex)
tagged_object_hex = obj.object[1]
try:
tagged_obj = self.pack[tagged_object_hex]
obj = None
break
else:
- tagged_hash = hashutil.bytehex_to_hash(tagged_object_hex)
- tags[tag_hash] = tagged_hash
+ tags[ref_object_hex] = tagged_object_hex
obj = tagged_obj
ref_object_hex = tagged_object_hex
# Handle refs or tags pointing at blobs or trees...
if obj.type_name == b"blob":
- new_blobs.append(hashutil.bytehex_to_hash(ref_object_hex))
+ new_blobs.append(self.next_vertex_id(ref_object_hex))
continue
if obj.type_name == b"tree":
- new_trees.append(hashutil.bytehex_to_hash(ref_object_hex))
+ new_trees.append(self.next_vertex_id(ref_object_hex))
continue
# From here on out, we handle commits only
i = 0
while len(parents) > 0:
commit_hex = parents.pop(0)
- commit_hash = hashutil.bytehex_to_hash(commit_hex)
t = have_traversed(traversed_objects, commit_hex)
if t is None or t is True:
continue
if i > 0 and i % 100000 == 0:
logger.debug(f"{ref_name}: {i} commits processed...")
i = i + 1
- tree_hash = hashutil.bytehex_to_hash(commit.tree)
- commits[commit_hash] = tree_hash
+ commits[commit_hex] = commit.tree
for parent_hex in commit.parents:
t = have_traversed(traversed_objects, parent_hex)
if t is None or t is True:
continue
if parent_hex in parents:
continue
- if parent_hex not in self.pack:
- continue
parents.append(parent_hex)
- parent_hash = hashutil.bytehex_to_hash(parent_hex)
- commit_edges.append((commit_hash, parent_hash))
+ # Commits are not assigned vertex IDs until after the traversal,
+ # so record the edge as a pair of hex IDs for now; it is mapped
+ # to vertex IDs below, once all commit vertices exist.
+ commit_edges.append((commit_hex, parent_hex))
logger.debug(f"Found {len(tags)} tags and {len(commits)} commits")
def add_vertices(new_vertices, object_type):
attributes = dict()
attributes["object_type"] = [object_type for x in new_vertices]
+ logger.debug(f"adding vertices {new_vertices} of type {object_type}")
self._object_graph.add_vertices(new_vertices, attributes=attributes)
# Add vertices for any directly referenced trees and blobs.
new_blobs = []
# Add tags, commits and root trees to the graph
- add_vertices(list(tags.keys()), GitObjectType.TAG)
- add_vertices(list(commits.keys()), GitObjectType.COMMIT)
- add_vertices(list(commits.values()), GitObjectType.TREE)
- self._object_graph.add_edges(zip(tags.keys(), tags.values()))
- self._object_graph.add_edges(zip(commits.keys(), commits.values()))
+ tag_vertices = list(map(self.next_vertex_id, tags.keys()))
+ add_vertices(tag_vertices, GitObjectType.TAG)
+ commit_vertices = list(map(self.next_vertex_id, commits.keys()))
+ add_vertices(commit_vertices, GitObjectType.COMMIT)
+ # Several commits may share the same root tree; since next_vertex_id()
+ # returns existing IDs, deduplicate so each tree is added only once.
+ tree_vertices = list(dict.fromkeys(map(self.next_vertex_id, commits.values())))
+ add_vertices(tree_vertices, GitObjectType.TREE)
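+ # Tag targets were assigned vertex IDs above (commits and root trees
+ # here, directly referenced blobs and trees during the ref walk), so
+ # next_vertex_id() returns their existing IDs.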
+ tagged_vertices = list(map(self.next_vertex_id, tags.values()))
+ logger.debug(
+ f"adding tag-tagged edges: {list(zip(tag_vertices, tagged_vertices))}"
+ )
+ self._object_graph.add_edges(list(zip(tag_vertices, tagged_vertices)))
+ logger.debug(
+ f"adding commit-tree edges: {list(zip(commit_vertices, tree_vertices))}"
+ )
+ self._object_graph.add_edges(list(zip(commit_vertices, tree_vertices)))
+ logger.debug(f"adding commit-commit edges: {commit_edges}")
self._object_graph.add_edges(commit_edges)
# Populate the graph with trees and blobs
num_objects_found = 0
num_tree_entries = 0
logger.info("Adding trees and blobs to the graph...")
- for commit_hash, tree_hash in commits.items():
+ for commit_hex, tree_hex in commits.items():
# Show some debug progress output for very large datasets
if i > 0 and i % 10000 == 0:
logger.debug(f"Adding trees and blobs: {i} commits processed...")
i = i + 1
- subtrees = [tree_hash]
+ subtrees = [tree_hex]
while len(subtrees) > 0:
- tree_hash = subtrees.pop(0)
- tree_hex = hashutil.hash_to_bytehex(tree_hash)
+ tree_hex = subtrees.pop(0)
t = have_traversed(traversed_objects, tree_hex)
if t is None or t is True:
continue
mark_as_traversed(traversed_objects, tree_hex)
- try:
- tree = self.pack[tree_hex]
- except KeyError:
- continue
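+ # have_traversed() returned False above, so the tree is present in the pack.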
+ tree = self.pack[tree_hex]
for name, mode, entry_hex in tree.iteritems():
if mode & submodule_mode == submodule_mode:
continue # ignore submodules
t = have_traversed(traversed_objects, entry_hex)
if t is None:
continue # not present in pack file
- entry_hash = hashutil.bytehex_to_hash(entry_hex)
- new_edges.append((tree_hash, entry_hash))
if t is False:
if mode & stat.S_IFDIR:
- new_trees.append(entry_hash)
- subtrees.append(entry_hash)
+ new_trees.append(self.next_vertex_id(entry_hex))
+ subtrees.append(entry_hex)
else:
- new_blobs.append(entry_hash)
+ new_blobs.append(self.next_vertex_id(entry_hex))
mark_as_traversed(traversed_objects, entry_hex)
+ new_edges.append(
+ (
+ self.vertex_ids[self.hex_id_to_idx(tree_hex)],
+ self.vertex_ids[self.hex_id_to_idx(entry_hex)],
+ )
+ )
# add new vertices and edges in batches for performance reasons
if len(new_trees) + len(new_blobs) > 100000 or len(new_edges) > 1000000:
f"Adding {len(new_edges)} tree entries to the graph "
f"({num_tree_entries} entries total)"
)
+ logger.debug(f"adding tree-entry edges: {new_edges}")
self._object_graph.add_edges(new_edges)
num_objects_found += len(new_trees) + len(new_blobs)
logger.debug(
f"Adding {len(new_edges)} tree entries to the graph "
f"({num_tree_entries} entries total)"
)
+ logger.debug(f"adding tree-entry edges: {new_edges}")
self._object_graph.add_edges(new_edges)
logger.info(
if s not in vertices:
vertices.append(s)
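+ # Vertex names no longer hold hashes; recover the binary hash
+ # from the vertex's pack index slot.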
yield (
- self._object_graph.vs[s]["name"],
+ hashutil.bytehex_to_hash(
+ self.idx_to_hex_id(self.packidx_slots[s])
+ ),
self._object_graph.vs[s]["object_type"],
)
def get_neighbors(v, object_type=None):
for s in self._object_graph.neighbors(v):
+ logger.debug(f"neighbor {s}")
if object_type is not None:
if self._object_graph.vs[s]["object_type"] != object_type:
+ logger.debug(f"object type expected: {object_type}")
+ logger.debug(
+ f"object type actual: {self._object_graph.vs[s]['object_type']}"
+ )
continue
yield (
- self._object_graph.vs[s]["name"],
+ hashutil.bytehex_to_hash(self.idx_to_hex_id(self.packidx_slots[s])),
self._object_graph.vs[s]["object_type"],
)
+ def map_idx_to_hex_id(idx):
+ logger.debug(f"asking for hex ID of idx {idx}")
+ return self.idx_to_hex_id(idx)
+
+ blob_hex_ids: List[bytes] = []
try:
blob_vertices = self._object_graph.vs.select(object_type=GitObjectType.BLOB)
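+ # The "name" attribute of each vertex holds the vertex ID assigned by
+ # next_vertex_id(); map it back to a pack index slot, then to a hex ID.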
+ blob_packidx_slots = list(
+ map(self.vertex_to_packidx_slot, blob_vertices["name"])
+ )
+ blob_hex_ids = list(map(map_idx_to_hex_id, blob_packidx_slots))
except KeyError:
missing_contents = set()
else:
missing_contents = set(
- self.storage.content_missing_per_sha1_git(blob_vertices["name"])
+ self.storage.content_missing_per_sha1_git(
+ # content_missing_per_sha1_git() takes binary sha1_git hashes
+ [hashutil.bytehex_to_hash(h) for h in blob_hex_ids]
+ )
)
+ logger.debug(f"missing contents: {missing_contents}")
+ tree_hex_ids: List[bytes] = []
try:
tree_vertices = self._object_graph.vs.select(object_type=GitObjectType.TREE)
+ tree_packidx_slots = list(
+ map(self.vertex_to_packidx_slot, tree_vertices["name"])
+ )
+ tree_hex_ids = list(map(map_idx_to_hex_id, tree_packidx_slots))
except KeyError:
- tree_vertices = set()
+ tree_hex_ids = list()
self.log.debug(
"Number of packed blobs that are missing in storage: "
- f"{len(missing_contents)} of {len(blob_vertices['name'])} packed blobs total"
+ f"{len(missing_contents)} of {len(blob_hex_ids)} packed blobs total"
)
- tree_hashes: List[bytes] = []
- if len(missing_contents) == len(blob_vertices["name"]):
+ if len(missing_contents) == len(blob_hex_ids):
# If all blobs are missing then all trees are missing, too.
- tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
+ tree_hashes = [hashutil.bytehex_to_hash(t) for t in tree_hex_ids]
missing_directories = set(tree_hashes)
self.log.debug(
"Number of packed trees considered missing by implication: "
"depend on missing packed blobs"
)
nsearched = 0
- for t in tree_vertices:
- tree_hash = t["name"]
+ for tree_hex in tree_hex_ids:
+ tree_hash = hashutil.bytehex_to_hash(tree_hex)
nsearched += 1
if (nsearched % 5000) == 0:
self.log.debug(
if tree_hash in missing_directories:
continue
have_dep = False
- for dep_hash, dep_type in get_successors(t):
+ v = self.vertex_ids[self.hex_id_to_idx(tree_hex)]
+ for dep_hash, dep_type in get_successors(v):
have_dep = True
if dep_type == GitObjectType.BLOB and dep_hash in missing_contents:
# We can infer that the tree is also missing.
break
if not have_dep:
# An empty tree has no dependencies. Determine if it is missing.
- tree_hashes = [hashutil.hash_to_bytes(tree_hash)]
- missing_empty_tree = set(
- self.storage.directory_missing(tree_hashes)
- )
+ # directory_missing() takes binary hashes; use tree_hash computed above
+ tree_ids = [tree_hash]
+ missing_empty_tree = set(self.storage.directory_missing(tree_ids))
if len(missing_empty_tree):
missing_directories.add(tree_hash)
self.log.debug(
f"{len(missing_directories)}"
)
else:
- tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
+ tree_hashes = [hashutil.bytehex_to_hash(t) for t in tree_hex_ids]
missing_directories = set(self.storage.directory_missing(tree_hashes))
self.log.debug(
"Number of missing trees according to archive query: "
f"{len(missing_directories)}"
)
+ commit_hex_ids: List[bytes] = []
try:
commit_vertices = self._object_graph.vs.select(
object_type=GitObjectType.COMMIT
)
+ commit_packidx_slots = list(
+ map(self.vertex_to_packidx_slot, commit_vertices["name"])
+ )
+ commit_hex_ids = list(map(map_idx_to_hex_id, commit_packidx_slots))
except KeyError:
- commit_vertices = set()
+ commit_hex_ids = list()
+
missing_revisions = set()
if len(missing_contents) > 0 or len(missing_directories) > 0:
self.log.debug(
"Searching for packed commits which depend on missing packed blobs or trees"
)
nsearched = 0
- for c in commit_vertices:
- commit_hash = c["name"]
+ for commit_hex in commit_hex_ids:
+ commit_hash = hashutil.bytehex_to_hash(commit_hex)
nsearched += 1
if (nsearched % 5000) == 0:
self.log.debug(
)
if commit_hash in missing_revisions:
continue
- for dep_hash, dep_type in get_neighbors(c, GitObjectType.TREE):
+ v = self.vertex_ids[self.hex_id_to_idx(commit_hex)]
+ logger.debug(f"checking commit vertex {v}")
+ for dep_hash, dep_type in get_neighbors(v, GitObjectType.TREE):
+ logger.debug(f"dep_hash: {dep_hash}")
if dep_hash in missing_contents or dep_hash in missing_directories:
# We can infer that the commit is also missing.
missing_revisions.add(commit_hash)
f"{len(missing_revisions)}"
)
else:
- commit_hashes = [c["name"] for c in commit_vertices]
+ commit_hashes = [hashutil.bytehex_to_hash(c) for c in commit_hex_ids]
missing_revisions = set(self.storage.revision_missing(commit_hashes))
for blob_hash in missing_contents:
- obj = self.pack[hashutil.hash_to_bytehex(blob_hash)]
+ obj = self.pack[hashutil.hash_to_bytes(blob_hash)]
if obj.id in self.ref_object_types:
self.ref_object_types[obj.id] = SnapshotTargetType.CONTENT
content = converters.dulwich_blob_to_content(