commit - 39b88888999abc64dd5ea2dd56e48bcc639fab15
commit + 1abdec0ef115d8c29968fc2eeb22936bff743295
blob - c31af09d8a3d859fa7c78e67803100b9d74404ac
blob + bad93b27fbcbfcfe219bca5547cabae56b6d75e1
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
percent_done = p
if self.pack_size < 12: # No pack file header present
+ logger.debug("Pack file too small")
return False
packdata = PackData.from_file(self.pack_buffer, self.pack_size)
self._object_graph = Graph(directed=True)
- # Find all commits and corresponding tree roots in the pack file
+ # Find all tags, commits and corresponding tree roots in the pack file
+ tags = {}
commits = {}
commit_edges = []
for ref_name, ref_object_hex in self.remote_refs.items():
obj = self.pack[ref_object_hex]
except KeyError:
continue
- logger.debug(f"Opened {obj}")
- # Peel tags for now; consider adding them to the graph later
+
+ logger.debug(f"Opened {obj.type_name} {obj}")
+
while obj.type_name == b"tag":
+ tag_hash = hashutil.bytehex_to_hash(ref_object_hex)
+ tagged_object_hex = obj.object[1]
+ logger.debug(f"Opened tag {obj} pointing at {tagged_object_hex}")
try:
- ref_object_hex = obj.object[1]
- obj = self.pack[ref_object_hex]
- logger.debug(f"Opened {obj}")
+ tagged_obj = self.pack[tagged_object_hex]
except KeyError:
+ logger.debug(f" pack is missing: {tagged_object_hex}")
obj = None
break
+ else:
+ tagged_hash = hashutil.bytehex_to_hash(tagged_object_hex)
+ tags[tag_hash] = tagged_hash
+ obj = tagged_obj
+ ref_object_hex = tagged_object_hex
+
+ # TODO: Allow tags pointing at blobs or trees?
if obj is None or obj.type_name != b"commit":
continue
attributes["object_type"] = [object_type for x in new_vertices]
self._object_graph.add_vertices(new_vertices, attributes=attributes)
- # Add commits and root trees to the graph
+ # Add tags, commits and root trees to the graph
+ add_vertices(list(tags.keys()), GitObjectType.TAG)
add_vertices(list(commits.keys()), GitObjectType.COMMIT)
add_vertices(list(commits.values()), GitObjectType.TREE)
+ self._object_graph.add_edges(zip(tags.keys(), tags.values()))
self._object_graph.add_edges(zip(commits.keys(), commits.values()))
self._object_graph.add_edges(commit_edges)
def store_data_topological(self) -> None:
assert self.origin
+
+ # No object graph was created if the pack file was empty.
+ if not hasattr(self, "_object_graph"):
+ logger.debug("No objects to load")
+ return
+
if self.save_data_path:
self.save_data()
f"{ref_name!r}: {ref_object!r} was found in neither the "
"fetched pack file nor in local heads nor in the archive"
)
- for o in self.walk_ref(ref_name, ref_object, self.pack):
- obj = self.pack[hashutil.hash_to_bytehex(o)]
- logger.debug(f"Loading object {obj.id}")
- if obj.type_name == b"blob":
- if obj.id in self.ref_object_types:
- self.ref_object_types[obj.id] = SnapshotTargetType.CONTENT
- content = converters.dulwich_blob_to_content(
- obj, max_content_size=self.max_content_size
+
+ def get_dependencies(object_hash):
+ vertices = [self._object_graph.vs.find(name=object_hash)]
+ while len(vertices) > 0:
+ v = vertices.pop(0)
+ for s in self._object_graph.successors(v):
+ vertices.append(s)
+ yield (
+ self._object_graph.vs[s]["name"],
+ self._object_graph.vs[s]["object_type"],
)
- if isinstance(content, Content):
- self.counts["content"] += 1
- storage_summary.update(self.storage.content_add([content]))
- elif isinstance(content, SkippedContent):
- self.counts["skipped_content"] += 1
- storage_summary.update(
- self.storage.skipped_content_add([content])
+
+ try:
+ blob_vertices = self._object_graph.vs.select(object_type=GitObjectType.BLOB)
+ except KeyError:
+ missing_contents = set()
+ else:
+ missing_contents = set(
+ self.storage.content_missing_per_sha1_git(blob_vertices["name"])
+ )
+
+ try:
+ tree_vertices = self._object_graph.vs.select(object_type=GitObjectType.TREE)
+ except KeyError:
+ tree_vertices = set()
+
+ self.log.debug(
+ "Number of packed blobs that are missing in storage: "
+ f"{len(missing_contents)}"
+ )
+ tree_hashes: List[bytes] = []
+ if len(missing_contents) > 0:
+ missing_directories = set()
+ self.log.debug(
+ "Searching for packed trees which depend on missing packed blobs"
+ )
+ for t in tree_vertices:
+ tree_hash = t["name"]
+ have_dep = False
+ for (dep_hash, dep_type) in get_dependencies(tree_hash):
+ have_dep = True
+ if dep_type == GitObjectType.BLOB and dep_hash in missing_contents:
+ self.log.debug(
+ f"tree {hashutil.hash_to_bytehex(tree_hash)!r} depends on "
+ f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
)
- else:
- raise TypeError(f"Unexpected content type: {content}")
- elif obj.type_name == b"tree":
- if obj.id in self.ref_object_types:
- self.ref_object_types[obj.id] = SnapshotTargetType.DIRECTORY
- self.counts["directory"] += 1
- directory = converters.dulwich_tree_to_directory(obj)
- storage_summary.update(self.storage.directory_add([directory]))
- elif obj.type_name == b"commit":
- if obj.id in self.ref_object_types:
- self.ref_object_types[obj.id] = SnapshotTargetType.REVISION
- self.counts["revision"] += 1
- revision = converters.dulwich_commit_to_revision(obj)
- storage_summary.update(self.storage.revision_add([revision]))
- elif obj.type_name == b"tag":
- if obj.id in self.ref_object_types:
- self.ref_object_types[obj.id] = SnapshotTargetType.RELEASE
- self.counts["release"] += 1
- release = converters.dulwich_tag_to_release(obj)
- storage_summary.update(self.storage.release_add([release]))
- else:
- raise NotFound(f"object {obj} has bad type {obj.type}")
+ # We can infer that the tree is also missing.
+ missing_directories.add(tree_hash)
+ break
+ if not have_dep:
+ # An empty tree has no dependencies. Determine if it is missing.
+ tree_hashes = [hashutil.hash_to_bytes(tree_hash)]
+ missing_empty_tree = set(
+ self.storage.directory_missing(tree_hashes)
+ )
+ if len(missing_empty_tree):
+ missing_directories.add(tree_hash)
+ self.log.debug(
+ "Number of packed trees considered missing by implication: "
+ f"{len(missing_directories)}"
+ )
+ else:
+ tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
+ missing_directories = set(self.storage.directory_missing(tree_hashes))
+ try:
+ commit_vertices = self._object_graph.vs.select(
+ object_type=GitObjectType.COMMIT
+ )
+ except KeyError:
+ commit_vertices = set()
+ missing_revisions = set()
+ if len(missing_contents) > 0 or len(missing_directories) > 0:
+ self.log.debug(
+ "Searching for packed commits which depend on missing packed blobs or trees"
+ )
+ for c in commit_vertices:
+ commit_hash = c["name"]
+ for (dep_hash, dep_type) in get_dependencies(commit_hash):
+ if dep_hash in missing_contents or dep_hash in missing_directories:
+ self.log.debug(
+ f"commit {hashutil.hash_to_bytehex(commit_hash)!r} depends on "
+ f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
+ )
+ # We can infer that the commit is also missing.
+ missing_revisions.add(commit_hash)
+ break
+ self.log.debug(
+ "Number of packed commits considered missing by implication: "
+ f"{len(missing_revisions)}"
+ )
+ else:
+ commit_hashes = [c["name"] for c in commit_vertices]
+ missing_revisions = set(self.storage.revision_missing(commit_hashes))
+
+ for blob_hash in missing_contents:
+ obj = self.pack[hashutil.hash_to_bytehex(blob_hash)]
+ if obj.id in self.ref_object_types:
+ self.ref_object_types[obj.id] = SnapshotTargetType.CONTENT
+ content = converters.dulwich_blob_to_content(
+ obj, max_content_size=self.max_content_size
+ )
+ if isinstance(content, Content):
+ self.counts["content"] += 1
+ storage_summary.update(self.storage.content_add([content]))
+ elif isinstance(content, SkippedContent):
+ self.counts["skipped_content"] += 1
+ storage_summary.update(self.storage.skipped_content_add([content]))
+ else:
+ raise TypeError(f"Unexpected content type: {content}")
+
+ try:
+ tag_vertices = self._object_graph.vs.select(object_type=GitObjectType.TAG)
+ except KeyError:
+ tag_vertices = set()
+ missing_releases = set()
+ if len(missing_revisions) > 0:
+ self.log.debug(
+ "Searching for packed tags which depend on missing packed objects"
+ )
+ for t in tag_vertices:
+ tag_hash = t["name"]
+ for (dep_hash, dep_type) in get_dependencies(tag_hash):
+ if (
+ dep_hash in missing_revisions
+ or dep_hash in missing_directories
+ or dep_hash in missing_contents
+ ):
+ self.log.debug(
+ f"tag {hashutil.hash_to_bytehex(tag_hash)!r} depends on "
+ f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
+ )
+ # We can infer that the tag is also missing.
+ missing_releases.add(tag_hash)
+ break
+ self.log.debug(
+                "Number of packed tags considered missing by implication: "
+ f"{len(missing_releases)}"
+ )
+ else:
+ tag_hashes = [t["name"] for t in tag_vertices]
+ missing_releases = set(self.storage.release_missing(tag_hashes))
+
+ for tree_hash in missing_directories:
+ obj = self.pack[hashutil.hash_to_bytehex(tree_hash)]
+ if obj.id in self.ref_object_types:
+ self.ref_object_types[obj.id] = SnapshotTargetType.DIRECTORY
+ self.counts["directory"] += 1
+ directory = converters.dulwich_tree_to_directory(obj)
+ storage_summary.update(self.storage.directory_add([directory]))
+
+ for commit_hash in missing_revisions:
+ obj = self.pack[hashutil.hash_to_bytehex(commit_hash)]
+ if obj.id in self.ref_object_types:
+ self.ref_object_types[obj.id] = SnapshotTargetType.REVISION
+ self.counts["revision"] += 1
+ revision = converters.dulwich_commit_to_revision(obj)
+ storage_summary.update(self.storage.revision_add([revision]))
+
+ for tag_hash in missing_releases:
+ obj = self.pack[hashutil.hash_to_bytehex(tag_hash)]
+ if obj.id in self.ref_object_types:
+ self.ref_object_types[obj.id] = SnapshotTargetType.RELEASE
+ self.counts["release"] += 1
+ release = converters.dulwich_tag_to_release(obj)
+ storage_summary.update(self.storage.release_add([release]))
+
+ self.flush()
+
snapshot = self.get_snapshot()
self.counts["snapshot"] += 1
storage_summary.update(self.storage.snapshot_add([snapshot]))
"""The load was eventful if the current snapshot is different to
the one we retrieved at the beginning of the run"""
eventful = False
- if self.prev_snapshot and self.snapshot:
- eventful = self.snapshot.id != self.prev_snapshot.id
- elif self.snapshot:
- eventful = bool(self.snapshot.branches)
+
+ if hasattr(self, "snapshot"):
+ if self.prev_snapshot and self.snapshot:
+ eventful = self.snapshot.id != self.prev_snapshot.id
+ elif self.snapshot:
+ eventful = bool(self.snapshot.branches)
return {"status": ("eventful" if eventful else "uneventful")}