commit - 91b0caf89ffbcd64c670ddc387741fe9e8901b5e
commit + cbe1c11028d08815f6840cc4b5fdaeafca82e9ca
blob - ffda1054c499b061edb13fc90d47c3550e64e74b
blob + 034ae31642dea4c2774d19b1ecd4b3546d3967a8
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
commits = {}
commit_edges = []
for ref_name, ref_object_hex in self.remote_refs.items():
+ logger.debug(f"Processing ref {ref_name}: {ref_object_hex}")
if utils.ignore_branch_name(ref_name):
continue
try:
# TODO: Allow tags pointing at blobs or trees?
if obj is None or obj.type_name != b"commit":
+ # obj can be None in this branch (see the guard above), so do not
+ # dereference it unconditionally when building the log message.
+ type_name = obj.type_name if obj is not None else None
+ logger.debug(f"{ref_object_hex} is a {type_name}, not a commit")
continue
parents = [ref_object_hex]
+
while len(parents) > 0:
commit_hex = parents.pop(0)
try:
except KeyError:
logger.debug(f" pack is missing: {commit_hex}")
continue
- tree_hash = hashutil.bytehex_to_hash(commit.tree)
commit_hash = hashutil.bytehex_to_hash(commit_hex)
+ if commit_hash in commits.keys():
+ continue
+ tree_hash = hashutil.bytehex_to_hash(commit.tree)
commits[commit_hash] = tree_hash
for parent_hex in commit.parents:
if parent_hex in self.pack:
- parents.append(parent_hex)
+ if parent_hex not in parents:
+ parents.append(parent_hex)
parent_hash = hashutil.bytehex_to_hash(parent_hex)
- commit_edges.append((commit_hash, parent_hash))
+ if (commit_hash, parent_hash) not in commit_edges:
+ commit_edges.append((commit_hash, parent_hash))
def add_vertices(new_vertices, object_type):
attributes = dict()
traversed_trees = set()
submodule_mode = stat.S_IFDIR | stat.S_IFLNK
for commit_hash, tree_hash in commits.items():
- logger.debug(
- f"commit {hashutil.hash_to_hex(commit_hash)} "
- f"tree {hashutil.hash_to_hex(tree_hash)}"
- )
subtrees = [tree_hash]
while len(subtrees) > 0:
tree_hash = subtrees.pop(0)
except KeyError:
logger.debug(f" pack is missing: {tree_hex}")
continue
- logger.debug(f"Entries of {tree}:")
for name, mode, entry_hex in tree.iteritems():
- logger.debug(f" {name} {mode} {entry_hex}")
if mode & submodule_mode == submodule_mode:
continue # ignore submodules
entry_hash = hashutil.bytehex_to_hash(entry_hex)
try:
tree = self.pack[entry_hex]
subtrees.append(entry_hash)
- logger.debug(f" present in pack: {entry_hex}")
except KeyError:
logger.debug(f" pack is missing: {entry_hex}")
# add new vertices and edges in batches for performance reasons
add_vertices(new_blobs, GitObjectType.BLOB)
self._object_graph.add_edges(new_edges)
- for v in self._object_graph.vs:
- name = hashutil.hash_to_bytehex(v["name"])
- successors = []
- vertices = [v]
- object_type = v["object_type"]
- while len(vertices) > 0:
- v = vertices.pop(0)
- for s in self._object_graph.successors(v):
- sname = self._object_graph.vs["name"][s]
- successors.append(hashutil.hash_to_bytehex(sname))
- vertices.append(s)
- logger.debug(f"{object_type} {name} depends on {successors}")
+ logger.info("Commit graph has been built")
def save_data(self) -> None:
"""Store a pack for archival"""
"fetched pack file nor in local heads nor in the archive"
)
- def get_dependencies(object_hash):
- vertices = [self._object_graph.vs.find(name=object_hash)]
+ def get_dependencies(v):
+ vertices = [v]
while len(vertices) > 0:
v = vertices.pop(0)
for s in self._object_graph.successors(v):
- vertices.append(s)
+ if s not in vertices:
+ vertices.append(s)
yield (
self._object_graph.vs[s]["name"],
self._object_graph.vs[s]["object_type"],
self.log.debug(
"Number of packed blobs that are missing in storage: "
- f"{len(missing_contents)}"
+ f"{len(missing_contents)} of {len(blob_vertices['name'])} packed blobs total"
)
tree_hashes: List[bytes] = []
- if len(missing_contents) > 0:
- missing_directories = set()
+ if len(missing_contents) == len(blob_vertices["name"]):
+ # If all blobs are missing then all trees are missing, too.
+ tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
+ missing_directories = set(tree_hashes)
self.log.debug(
- "Searching for packed trees which depend on missing packed blobs"
+ "Number of packed trees considered missing by implication: "
+ f"{len(missing_directories)}"
)
+ elif len(missing_contents) > 0:
+ missing_directories = set()
+ self.log.debug(
+ f"Searching {len(tree_vertices)} packed trees for trees which "
+ "depend on missing packed blobs"
+ )
+ nsearched = 0
for t in tree_vertices:
tree_hash = t["name"]
+ nsearched += 1
+ if (nsearched % 5000) == 0:
+ self.log.debug(
+ f"Searched {int((nsearched * 100) / len(tree_vertices))}% "
+ f"of {len(tree_vertices)} packed trees..."
+ )
+ if tree_hash in missing_directories:
+ continue
have_dep = False
- for dep_hash, dep_type in get_dependencies(tree_hash):
+ for dep_hash, dep_type in get_dependencies(t):
have_dep = True
if dep_type == GitObjectType.BLOB and dep_hash in missing_contents:
- self.log.debug(
- f"tree {hashutil.hash_to_bytehex(tree_hash)!r} depends on "
- f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
- )
# We can infer that the tree is also missing.
missing_directories.add(tree_hash)
break
else:
tree_hashes = [hashutil.hash_to_bytes(t["name"]) for t in tree_vertices]
missing_directories = set(self.storage.directory_missing(tree_hashes))
+ self.log.debug(
+ "Number of missing trees according to archive query: "
+ f"{len(missing_directories)}"
+ )
try:
commit_vertices = self._object_graph.vs.select(
self.log.debug(
"Searching for packed commits which depend on missing packed blobs or trees"
)
+ nsearched = 0
for c in commit_vertices:
commit_hash = c["name"]
- for dep_hash, dep_type in get_dependencies(commit_hash):
+ nsearched += 1
+ if (nsearched % 5000) == 0:
+ self.log.debug(
+ f"Searched {int((nsearched * 100) / len(commit_vertices))}% "
+ f"of {len(commit_vertices)} packed commits..."
+ )
+ if commit_hash in missing_revisions:
+ continue
+ for dep_hash, dep_type in get_dependencies(c):
if dep_hash in missing_contents or dep_hash in missing_directories:
- self.log.debug(
- f"commit {hashutil.hash_to_bytehex(commit_hash)!r} depends on "
- f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
- )
# We can infer that the commit is also missing.
missing_revisions.add(commit_hash)
break
)
for t in tag_vertices:
tag_hash = t["name"]
- for dep_hash, dep_type in get_dependencies(tag_hash):
+ for dep_hash, dep_type in get_dependencies(t):
if (
dep_hash in missing_revisions
or dep_hash in missing_directories
or dep_hash in missing_contents
):
- self.log.debug(
- f"tag {hashutil.hash_to_bytehex(tag_hash)!r} depends on "
- f"missing {dep_type} {hashutil.hash_to_bytehex(dep_hash)!r}"
- )
# We can infer that the tag is also missing.
missing_releases.add(tag_hash)
break