commit f40a60968957e3982eee437f04b3e515bb2f5493 from: Jelmer Vernooij via: GitHub date: Fri Jan 13 20:22:45 2023 UTC Merge pull request #1113 from jelmer/pack-refactor Various bits of pack refactoring; add more typing commit - a96e356d1793b5a0fb4fb79679a194095fc2097e commit + f40a60968957e3982eee437f04b3e515bb2f5493 blob - a70755dd8c1a14357cc221578d15374a9c35c6b8 blob + 660e98925d9e6af12a6d56522f8d41d2bec52645 --- dulwich/client.py +++ dulwich/client.py @@ -1515,7 +1515,7 @@ class LocalGitClient(GitClient): pack_data, progress=None, depth=None, - ): + ) -> FetchPackResult: """Retrieve a pack from a git smart server. Args: @@ -1543,7 +1543,7 @@ class LocalGitClient(GitClient): # Note that the client still expects a 0-object pack in most cases. if objects_iter is None: return FetchPackResult(None, symrefs, agent) - write_pack_objects(pack_data, objects_iter) + write_pack_objects(pack_data, objects_iter, reuse_pack=r.object_store) return FetchPackResult(r.get_refs(), symrefs, agent) def get_refs(self, path): blob - a182b4c820f25ae8f5dabdd2f3e555117048f43d blob + e6fa896832f5911b5d0f0e9465b8e15bd56369f9 --- dulwich/fastexport.py +++ dulwich/fastexport.py @@ -30,6 +30,9 @@ from dulwich.objects import ( Tag, ZERO_SHA, ) +from dulwich.object_store import ( + iter_tree_contents, +) from fastimport import ( commands, errors as fastimport_errors, @@ -232,7 +235,7 @@ class GitImportProcessor(processor.ImportProcessor): path, mode, hexsha, - ) in self.repo.object_store.iter_tree_contents(tree_id): + ) in iter_tree_contents(self.repo.object_store, tree_id): self._contents[path] = (mode, hexsha) def reset_handler(self, cmd): blob - bf7396d286010e88d61dc0ccf41c79b2168961fc blob + 2bbb5e19fa548a4639fb279e18f7f24aa396a55b --- dulwich/greenthreads.py +++ dulwich/greenthreads.py @@ -31,12 +31,13 @@ from dulwich.objects import ( ) from dulwich.object_store import ( MissingObjectFinder, + _collect_ancestors, _collect_filetree_revs, ObjectStoreIterator, ) -def _split_commits_and_tags(obj_store, lst, ignore_unknown=False, pool=None): +def _split_commits_and_tags(obj_store, lst, *, ignore_unknown=False, pool=None): """Split object id list into two list with commit SHA1s and tag SHA1s. 
Same implementation as object_store._split_commits_and_tags @@ -90,11 +91,11 @@ class GreenThreadsMissingObjectFinder(MissingObjectFin self.object_store = object_store p = pool.Pool(size=concurrency) - have_commits, have_tags = _split_commits_and_tags(object_store, haves, True, p) - want_commits, want_tags = _split_commits_and_tags(object_store, wants, False, p) - all_ancestors = object_store._collect_ancestors(have_commits)[0] - missing_commits, common_commits = object_store._collect_ancestors( - want_commits, all_ancestors + have_commits, have_tags = _split_commits_and_tags(object_store, haves, ignore_unknown=True, pool=p) + want_commits, want_tags = _split_commits_and_tags(object_store, wants, ignore_unknown=False, pool=p) + all_ancestors = _collect_ancestors(object_store, have_commits)[0] + missing_commits, common_commits = _collect_ancestors( + object_store, want_commits, all_ancestors ) self.sha_done = set() blob - d875b4851077035a92c82421f2e8f1fc460ea433 blob + 4b13df9eb1147da3efc5e3906ceffad403b82581 --- dulwich/index.py +++ dulwich/index.py @@ -32,16 +32,12 @@ from typing import ( Dict, List, Optional, - TYPE_CHECKING, Iterable, Iterator, Tuple, Union, ) -if TYPE_CHECKING: - from dulwich.object_store import BaseObjectStore - from dulwich.file import GitFile from dulwich.objects import ( Blob, @@ -52,9 +48,11 @@ from dulwich.objects import ( sha_to_hex, ObjectID, ) +from dulwich.object_store import iter_tree_contents from dulwich.pack import ( SHA1Reader, SHA1Writer, + ObjectContainer, ) @@ -451,7 +449,7 @@ class Index: def commit_tree( - object_store: "BaseObjectStore", blobs: Iterable[Tuple[bytes, bytes, int]] + object_store: ObjectContainer, blobs: Iterable[Tuple[bytes, bytes, int]] ) -> bytes: """Commit a new tree. @@ -494,7 +492,7 @@ def commit_tree( return build_tree(b"") -def commit_index(object_store: "BaseObjectStore", index: Index) -> bytes: +def commit_index(object_store: ObjectContainer, index: Index) -> bytes: """Create a new tree from an index. 
Args: @@ -509,7 +507,7 @@ def commit_index(object_store: "BaseObjectStore", inde def changes_from_tree( names: Iterable[bytes], lookup_entry: Callable[[bytes], Tuple[bytes, int]], - object_store: "BaseObjectStore", + object_store: ObjectContainer, tree: Optional[bytes], want_unchanged=False, ) -> Iterable[ @@ -535,7 +533,7 @@ def changes_from_tree( other_names = set(names) if tree is not None: - for (name, mode, sha) in object_store.iter_tree_contents(tree): + for (name, mode, sha) in iter_tree_contents(object_store, tree): try: (other_sha, other_mode) = lookup_entry(name) except KeyError: @@ -686,7 +684,7 @@ def validate_path(path: bytes, def build_index_from_tree( root_path: Union[str, bytes], index_path: Union[str, bytes], - object_store: "BaseObjectStore", + object_store: ObjectContainer, tree_id: bytes, honor_filemode: bool = True, validate_path_element=validate_path_element_default, @@ -711,7 +709,7 @@ def build_index_from_tree( if not isinstance(root_path, bytes): root_path = os.fsencode(root_path) - for entry in object_store.iter_tree_contents(tree_id): + for entry in iter_tree_contents(object_store, tree_id): if not validate_path(entry.path, validate_path_element): continue full_path = _tree_to_fs_path(root_path, entry.path) @@ -727,6 +725,7 @@ def build_index_from_tree( # TODO(jelmer): record and return submodule paths else: obj = object_store[entry.sha] + assert isinstance(obj, Blob) st = build_file_from_blob( obj, entry.mode, full_path, honor_filemode=honor_filemode, @@ -927,7 +926,7 @@ def index_entry_from_directory(st, path: bytes) -> Opt def index_entry_from_path( - path: bytes, object_store: Optional["BaseObjectStore"] = None + path: bytes, object_store: Optional[ObjectContainer] = None ) -> Optional[IndexEntry]: """Create an index from a filesystem path. @@ -957,7 +956,7 @@ def index_entry_from_path( def iter_fresh_entries( paths: Iterable[bytes], root_path: bytes, - object_store: Optional["BaseObjectStore"] = None + object_store: Optional[ObjectContainer] = None ) -> Iterator[Tuple[bytes, Optional[IndexEntry]]]: """Iterate over current versions of index entries on disk. 
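The dulwich/index.py hunks above swap the string-quoted "BaseObjectStore" annotations for the new structural ObjectContainer protocol imported from dulwich.pack, so any object offering add_object/add_objects/__contains__/__getitem__ now satisfies the annotations. A minimal sketch of what that means for callers, not part of the commit itself; commit_tree, MemoryObjectStore and Blob are existing dulwich APIs, while make_demo_tree is a hypothetical helper name:

    from dulwich.index import commit_tree
    from dulwich.object_store import MemoryObjectStore
    from dulwich.objects import Blob
    from dulwich.pack import ObjectContainer


    def make_demo_tree(store: ObjectContainer) -> bytes:
        # Any store satisfying the protocol works here: MemoryObjectStore,
        # DiskObjectStore, or a user-supplied duck-typed container.
        blob = Blob.from_string(b"hello\n")
        store.add_object(blob)
        # blobs are (path, hexsha, mode) tuples, unchanged by the refactor
        return commit_tree(store, [(b"hello.txt", blob.id, 0o100644)])


    print(make_demo_tree(MemoryObjectStore()))
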
blob - 7060795636f2266217fae6be5c69bffcdd7387de blob + 44a028dfd1c096769c1965e49fb8cca8f246cfdb --- dulwich/line_ending.py +++ dulwich/line_ending.py @@ -136,6 +136,7 @@ Sources: - https://adaptivepatchwork.com/2012/03/01/mind-the-end-of-your-line/ """ +from dulwich.object_store import iter_tree_contents from dulwich.objects import Blob from dulwich.patch import is_binary @@ -290,7 +291,7 @@ class TreeBlobNormalizer(BlobNormalizer): if tree: self.existing_paths = { name - for name, _, _ in object_store.iter_tree_contents(tree) + for name, _, _ in iter_tree_contents(object_store, tree) } else: self.existing_paths = set() blob - 9e2eb72057ef6f2f61542ab5530dcbf6cca4fabd blob + d91771e9710ddfbf1e9db3650492dccaaec683c0 --- dulwich/object_store.py +++ dulwich/object_store.py @@ -26,13 +26,10 @@ from io import BytesIO import os import stat import sys +import warnings -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable, Dict, List, Optional, Tuple, Protocol, Union, Iterator, Set -from dulwich.diff_tree import ( - tree_changes, - walk_trees, -) from dulwich.errors import ( NotTreeError, ) @@ -48,10 +45,12 @@ from dulwich.objects import ( sha_to_hex, hex_to_filename, S_ISGITLINK, + TreeEntry, object_class, valid_hexsha, ) from dulwich.pack import ( + ObjectContainer, Pack, PackData, PackInflater, @@ -79,6 +78,14 @@ PACKDIR = "pack" PACK_MODE = 0o444 if sys.platform != "win32" else 0o644 +class PackContainer(Protocol): + + def add_pack( + self + ) -> Tuple[BytesIO, Callable[[], None], Callable[[], None]]: + """Add a new pack.""" + + class BaseObjectStore: """Object store interface.""" @@ -213,6 +220,8 @@ class BaseObjectStore: Returns: Iterator over tuples with (oldpath, newpath), (oldmode, newmode), (oldsha, newsha) """ + + from dulwich.diff_tree import tree_changes for change in tree_changes( self, source, @@ -239,11 +248,10 @@ class BaseObjectStore: Returns: Iterator over TreeEntry namedtuples for all the objects in a tree. """ - for entry, _ in walk_trees(self, tree_id, None): - if ( - entry.mode is not None and not stat.S_ISDIR(entry.mode) - ) or include_trees: - yield entry + warnings.warn( + "Please use dulwich.object_store.iter_tree_contents", + DeprecationWarning, stacklevel=2) + return iter_tree_contents(self, tree_id, include_trees=include_trees) def find_missing_objects( self, @@ -334,47 +342,10 @@ class BaseObjectStore: intermediate tags; if the original ref does not point to a tag, this will equal the original SHA1. """ - obj = self[sha] - obj_class = object_class(obj.type_name) - while obj_class is Tag: - obj_class, sha = obj.object - obj = self[sha] - return obj - - def _collect_ancestors( - self, - heads, - common=frozenset(), - shallow=frozenset(), - get_parents=lambda commit: commit.parents, - ): - """Collect all ancestors of heads up to (excluding) those in common. - - Args: - heads: commits to start from - common: commits to end at, or empty set to walk repository - completely - get_parents: Optional function for getting the parents of a - commit. 
- Returns: a tuple (A, B) where A - all commits reachable - from heads but not present in common, B - common (shared) elements - that are directly reachable from heads - """ - bases = set() - commits = set() - queue = [] - queue.extend(heads) - while queue: - e = queue.pop(0) - if e in common: - bases.add(e) - elif e not in commits: - commits.add(e) - if e in shallow: - continue - cmt = self[e] - queue.extend(get_parents(cmt)) - return (commits, bases) + warnings.warn( + "Please use dulwich.object_store.peel_sha()", + DeprecationWarning, stacklevel=2) + return peel_sha(self, sha) def _get_depth( self, head, get_parents=lambda commit: commit.parents, max_depth=None, @@ -588,6 +559,46 @@ class PackBasedObjectStore(BaseObjectStore): for alternate in self.alternates: try: return alternate.get_raw(hexsha) + except KeyError: + pass + raise KeyError(hexsha) + + def get_raw_unresolved(self, name: bytes) -> Tuple[int, Union[bytes, None], List[bytes]]: + """Obtain the unresolved data for an object. + + Args: + name: sha for the object. + """ + if name == ZERO_SHA: + raise KeyError(name) + if len(name) == 40: + sha = hex_to_sha(name) + hexsha = name + elif len(name) == 20: + sha = name + hexsha = None + else: + raise AssertionError("Invalid object name {!r}".format(name)) + for pack in self._iter_cached_packs(): + try: + return pack.get_raw_unresolved(sha) + except (KeyError, PackFileDisappeared): + pass + if hexsha is None: + hexsha = sha_to_hex(name) + ret = self._get_loose_object(hexsha) + if ret is not None: + return ret.type_num, None, ret.as_raw_chunks() + # Maybe something else has added a pack with the object + # in the mean time? + for pack in self._update_pack_cache(): + try: + return pack.get_raw_unresolved(sha) + except KeyError: + pass + for alternate in self.alternates: + try: + return alternate.get_raw_unresolved(hexsha) except KeyError: pass raise KeyError(hexsha) @@ -1083,10 +1094,10 @@ class MemoryObjectStore(BaseObjectStore): commit() -class ObjectIterator: +class ObjectIterator(Protocol): """Interface for iterating over objects.""" - def iterobjects(self): + def iterobjects(self) -> Iterator[ShaFile]: raise NotImplementedError(self.iterobjects) @@ -1178,7 +1189,7 @@ def tree_lookup_path(lookup_obj, root_sha, path): return tree.lookup_path(lookup_obj, path) -def _collect_filetree_revs(obj_store, tree_sha, kset): +def _collect_filetree_revs(obj_store: ObjectContainer, tree_sha: ObjectID, kset: Set[ObjectID]) -> None: """Collect SHA1s of files and directories for specified tree. Args: @@ -1187,6 +1198,7 @@ def _collect_filetree_revs(obj_store, tree_sha, kset): kset: set to fill with references to files and directories """ filetree = obj_store[tree_sha] + assert isinstance(filetree, Tree) for name, mode, sha in filetree.iteritems(): if not S_ISGITLINK(mode) and sha not in kset: kset.add(sha) @@ -1194,7 +1206,7 @@ def _collect_filetree_revs(obj_store, tree_sha, kset): _collect_filetree_revs(obj_store, sha, kset) -def _split_commits_and_tags(obj_store, lst, ignore_unknown=False): +def _split_commits_and_tags(obj_store: ObjectContainer, lst, *, ignore_unknown=False) -> Tuple[Set[bytes], Set[bytes], Set[bytes]]: """Split object id list into three lists with commit, tag, and other SHAs. Commits referenced by tags are included into commits @@ -1209,9 +1221,9 @@ def _split_commits_and_tags(obj_store, lst, ignore_unk silently. 
Returns: A tuple of (commits, tags, others) SHA1s """ - commits = set() - tags = set() - others = set() + commits: Set[bytes] = set() + tags: Set[bytes] = set() + others: Set[bytes] = set() for e in lst: try: o = obj_store[e] @@ -1224,12 +1236,12 @@ def _split_commits_and_tags(obj_store, lst, ignore_unk elif isinstance(o, Tag): tags.add(e) tagged = o.object[1] - c, t, o = _split_commits_and_tags( + c, t, os = _split_commits_and_tags( obj_store, [tagged], ignore_unknown=ignore_unknown ) commits |= c tags |= t - others |= o + others |= os else: others.add(e) return (commits, tags, others) @@ -1270,20 +1282,22 @@ class MissingObjectFinder: # wants shall list only known SHAs, and otherwise # _split_commits_and_tags fails with KeyError have_commits, have_tags, have_others = _split_commits_and_tags( - object_store, haves, True + object_store, haves, ignore_unknown=True ) want_commits, want_tags, want_others = _split_commits_and_tags( - object_store, wants, False + object_store, wants, ignore_unknown=False ) # all_ancestors is a set of commits that shall not be sent # (complete repository up to 'haves') - all_ancestors = object_store._collect_ancestors( + all_ancestors = _collect_ancestors( + object_store, have_commits, shallow=shallow, get_parents=self._get_parents )[0] # all_missing - complete set of commits between haves and wants # common - commits from all_ancestors we hit into while # traversing parent hierarchy of wants - missing_commits, common_commits = object_store._collect_ancestors( + missing_commits, common_commits = _collect_ancestors( + object_store, want_commits, all_ancestors, shallow=shallow, @@ -1606,3 +1620,85 @@ class BucketBasedObjectStore(PackBasedObjectStore): return final_pack return pf, commit, pf.close + + +def _collect_ancestors( + store: ObjectContainer, + heads, + common=frozenset(), + shallow=frozenset(), + get_parents=lambda commit: commit.parents, +): + """Collect all ancestors of heads up to (excluding) those in common. + + Args: + heads: commits to start from + common: commits to end at, or empty set to walk repository + completely + get_parents: Optional function for getting the parents of a + commit. + Returns: a tuple (A, B) where A - all commits reachable + from heads but not present in common, B - common (shared) elements + that are directly reachable from heads + """ + bases = set() + commits = set() + queue = [] + queue.extend(heads) + while queue: + e = queue.pop(0) + if e in common: + bases.add(e) + elif e not in commits: + commits.add(e) + if e in shallow: + continue + cmt = store[e] + queue.extend(get_parents(cmt)) + return (commits, bases) + + +def iter_tree_contents( + store: ObjectContainer, tree_id: bytes, *, include_trees: bool = False): + """Iterate the contents of a tree and all subtrees. + + Iteration is depth-first pre-order, as in e.g. os.walk. + + Args: + tree_id: SHA1 of the tree. + include_trees: If True, include tree objects in the iteration. + Returns: Iterator over TreeEntry namedtuples for all the objects in a + tree. + """ + # This could be fairly easily generalized to >2 trees if we find a use + # case. 
+ todo = [TreeEntry(b"", stat.S_IFDIR, tree_id)] + while todo: + entry = todo.pop() + if stat.S_ISDIR(entry.mode): + extra = [] + tree = store[entry.sha] + assert isinstance(tree, Tree) + for subentry in tree.iteritems(name_order=True): + extra.append(subentry.in_path(entry.path)) + todo.extend(reversed(extra)) + if not stat.S_ISDIR(entry.mode) or include_trees: + yield entry + + +def peel_sha(store: ObjectContainer, sha: bytes) -> ShaFile: + """Peel all tags from a SHA. + + Args: + sha: The object SHA to peel. + Returns: The fully-peeled SHA1 of a tag object, after peeling all + intermediate tags; if the original ref does not point to a tag, + this will equal the original SHA1. + """ + obj = store[sha] + obj_class = object_class(obj.type_name) + while obj_class is Tag: + assert isinstance(obj, Tag) + obj_class, sha = obj.object + obj = store[sha] + return obj blob - ee15fa1377a0669cab618f931812671b6c291338 blob + 1fd4feb6ac3adacc2e41fb828b530cd6a13028e5 --- dulwich/pack.py +++ dulwich/pack.py @@ -49,7 +49,7 @@ from itertools import chain import os import sys -from typing import Optional, Callable, Tuple, List, Deque, Union +from typing import Optional, Callable, Tuple, List, Deque, Union, Protocol, Iterable, Iterator import warnings from hashlib import sha1 @@ -94,6 +94,34 @@ DELTA_TYPES = (OFS_DELTA, REF_DELTA) DEFAULT_PACK_DELTA_WINDOW_SIZE = 10 + + +class ObjectContainer(Protocol): + + def add_object(self, obj: ShaFile) -> None: + """Add a single object to this object store.""" + + def add_objects( + self, objects: Iterable[Tuple[ShaFile, Optional[str]]], + progress: Optional[Callable[[str], None]] = None) -> None: + """Add a set of objects to this object store. + + Args: + objects: Iterable over a list of (object, path) tuples + """ + + def __contains__(self, sha1: bytes) -> bool: + """Check if a hex sha is present.""" + + def __getitem__(self, sha1: bytes) -> ShaFile: + """Retrieve an object.""" + + +class PackedObjectContainer(ObjectContainer): + + def get_raw_unresolved(self, sha1: bytes) -> Tuple[int, Union[bytes, None], List[bytes]]: + """Get a raw unresolved object.""" + raise NotImplementedError(self.get_raw_unresolved) def take_msb_bytes(read: Callable[[int], bytes], crc32: Optional[int] = None) -> Tuple[List[int], Optional[int]]: @@ -513,7 +541,7 @@ class FilePackIndex(PackIndex): self._contents, self._size = (contents, size) @property - def path(self): + def path(self) -> str: return self._filename def __eq__(self, other): @@ -526,16 +554,16 @@ class FilePackIndex(PackIndex): return super().__eq__(other) - def close(self): + def close(self) -> None: self._file.close() if getattr(self._contents, "close", None) is not None: self._contents.close() - def __len__(self): + def __len__(self) -> int: """Return the number of entries in this pack index.""" return self._fan_out_table[-1] - def _unpack_entry(self, i): + def _unpack_entry(self, i: int) -> Tuple[bytes, int, Optional[int]]: """Unpack the i-th entry in the index file. Returns: Tuple with object name (SHA), offset in pack file and CRC32 @@ -555,11 +583,11 @@ class FilePackIndex(PackIndex): """Unpack the crc32 checksum for the ith object from the index file.""" raise NotImplementedError(self._unpack_crc32_checksum) - def _itersha(self): + def _itersha(self) -> Iterator[bytes]: for i in range(len(self)): yield self._unpack_name(i) - def iterentries(self): + def iterentries(self) -> Iterator[Tuple[bytes, int, Optional[int]]]: """Iterate over the entries in this pack index. 
Returns: iterator over tuples with object name, offset in packfile and @@ -568,7 +596,7 @@ class FilePackIndex(PackIndex): for i in range(len(self)): yield self._unpack_entry(i) - def _read_fan_out_table(self, start_offset): + def _read_fan_out_table(self, start_offset: int): ret = [] for i in range(0x100): fanout_entry = self._contents[ @@ -577,35 +605,35 @@ class FilePackIndex(PackIndex): ret.append(struct.unpack(">L", fanout_entry)[0]) return ret - def check(self): + def check(self) -> None: """Check that the stored checksum matches the actual checksum.""" actual = self.calculate_checksum() stored = self.get_stored_checksum() if actual != stored: raise ChecksumMismatch(stored, actual) - def calculate_checksum(self): + def calculate_checksum(self) -> bytes: """Calculate the SHA1 checksum over this pack index. Returns: This is a 20-byte binary digest """ return sha1(self._contents[:-20]).digest() - def get_pack_checksum(self): + def get_pack_checksum(self) -> bytes: """Return the SHA1 checksum stored for the corresponding packfile. Returns: 20-byte binary digest """ return bytes(self._contents[-40:-20]) - def get_stored_checksum(self): + def get_stored_checksum(self) -> bytes: """Return the SHA1 checksum stored for this index. Returns: 20-byte binary digest """ return bytes(self._contents[-20:]) - def object_index(self, sha): + def object_index(self, sha: bytes) -> int: """Return the index in to the corresponding packfile for the object. Given the name of an object it will return the offset that object @@ -644,7 +672,7 @@ class FilePackIndex(PackIndex): class PackIndex1(FilePackIndex): """Version 1 Pack Index file.""" - def __init__(self, filename, file=None, contents=None, size=None): + def __init__(self, filename: str, file=None, contents=None, size=None): super().__init__(filename, file, contents, size) self.version = 1 self._fan_out_table = self._read_fan_out_table(0) @@ -669,7 +697,7 @@ class PackIndex1(FilePackIndex): class PackIndex2(FilePackIndex): """Version 2 Pack Index file.""" - def __init__(self, filename, file=None, contents=None, size=None): + def __init__(self, filename: str, file=None, contents=None, size=None): super().__init__(filename, file, contents, size) if self._contents[:4] != b"\377tOc": raise AssertionError("Not a v2 pack index file") @@ -707,7 +735,7 @@ class PackIndex2(FilePackIndex): return unpack_from(">L", self._contents, self._crc32_table_offset + i * 4)[0] -def read_pack_header(read): +def read_pack_header(read) -> Tuple[Optional[int], Optional[int]]: """Read the header of a pack file. Args: @@ -727,7 +755,7 @@ def read_pack_header(read): return (version, num_objects) -def chunks_length(chunks): +def chunks_length(chunks: Union[bytes, Iterable[bytes]]) -> int: if isinstance(chunks, bytes): return len(chunks) else: @@ -740,7 +768,7 @@ def unpack_object( compute_crc32=False, include_comp=False, zlib_bufsize=_ZLIB_BUFSIZE, -): +) -> Tuple[UnpackedObject, bytes]: """Unpack a Git object. Args: @@ -1596,12 +1624,13 @@ def write_pack_object(write, type, object, sha=None, c def write_pack( - filename, - objects, - deltify=None, - delta_window_size=None, - compression_level=-1, -): + filename, + objects, + *, + deltify: Optional[bool] = None, + delta_window_size: Optional[int] = None, + compression_level: int = -1, + reuse_pack: Optional[PackedObjectContainer] = None): """Write a new pack data file. 
Args: @@ -1619,6 +1648,7 @@ def write_pack( delta_window_size=delta_window_size, deltify=deltify, compression_level=compression_level, + reuse_pack=reuse_pack, ) entries = sorted([(k, v[0], v[1]) for (k, v) in entries.items()]) with GitFile(filename + ".idx", "wb") as f: @@ -1643,7 +1673,10 @@ def write_pack_header(write, num_objects): write(chunk) -def deltify_pack_objects(objects, window_size: Optional[int] = None, reuse_pack=None): +def deltify_pack_objects( + objects: Iterable[Tuple[ShaFile, str]], + window_size: Optional[int] = None, + reuse_pack: Optional[PackedObjectContainer] = None): """Generate deltas for pack objects. Args: @@ -1685,7 +1718,7 @@ def deltify_pack_objects(objects, window_size: Optiona magic.append((obj.type_num, path, -obj.raw_length(), obj)) magic.sort() - possible_bases: Deque[Tuple[bytes, int, bytes]] = deque() + possible_bases: Deque[Tuple[bytes, int, List[bytes]]] = deque() for type_num, path, neg_length, o in magic: raw = o.as_raw_chunks() @@ -1712,7 +1745,11 @@ def deltify_pack_objects(objects, window_size: Optiona possible_bases.pop() -def pack_objects_to_data(objects): +def pack_objects_to_data( + objects, + delta_window_size: Optional[int] = None, + deltify: Optional[bool] = None, + reuse_pack: Optional[PackedObjectContainer] = None): """Create pack data from objects Args: @@ -1720,17 +1757,30 @@ def pack_objects_to_data(objects): Returns: Tuples with (type_num, hexdigest, delta base, object chunks) """ count = len(objects) - return ( - count, - ( - (o.type_num, o.sha().digest(), None, o.as_raw_chunks()) - for (o, path) in objects - ), - ) + if deltify is None: + # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too + # slow at the moment. + deltify = False + if deltify: + pack_contents = deltify_pack_objects( + objects, window_size=delta_window_size, reuse_pack=reuse_pack) + return (count, pack_contents) + else: + return ( + count, + ( + (o.type_num, o.sha().digest(), None, o.as_raw_chunks()) + for (o, path) in objects + ), + ) def write_pack_objects( - write, objects, delta_window_size=None, deltify=None, reuse_pack=None, compression_level=-1 + write, objects, + delta_window_size: Optional[int] = None, + deltify: Optional[bool] = None, + reuse_pack: Optional[PackedObjectContainer] = None, + compression_level: int = -1 ): """Write a new pack data file. @@ -1751,16 +1801,10 @@ def write_pack_objects( DeprecationWarning, stacklevel=2) write = write.write - if deltify is None: - # PERFORMANCE/TODO(jelmer): This should be enabled but is *much* too - # slow at the moment. - deltify = False - if deltify: - pack_contents = deltify_pack_objects( - objects, window_size=delta_window_size, reuse_pack=reuse_pack) - pack_contents_count = len(objects) - else: - pack_contents_count, pack_contents = pack_objects_to_data(objects) + pack_contents_count, pack_contents = pack_objects_to_data( + objects, delta_window_size=delta_window_size, + deltify=deltify, + reuse_pack=reuse_pack) return write_pack_data( write, blob - bcae70e32166efc77b675b9fa5167f8c376101f1 blob + 3652e33424e0937c1a3cb683458755689c7d5951 --- dulwich/patch.py +++ dulwich/patch.py @@ -34,6 +34,7 @@ from dulwich.objects import ( Commit, S_ISGITLINK, ) +from dulwich.pack import ObjectContainer FIRST_FEW_BYTES = 8000 @@ -192,7 +193,7 @@ def patch_filename(p, root): return root + b"/" + p -def write_object_diff(f, store, old_file, new_file, diff_binary=False): +def write_object_diff(f, store: ObjectContainer, old_file, new_file, diff_binary=False): """Write the diff for an object. 
Args: blob - 850747722e33b4d032f087e7c1c18f0d22a6ff78 blob + 31d590f8a356e3a7829dfdf267e04c359bac0dcc --- dulwich/refs.py +++ dulwich/refs.py @@ -37,6 +37,7 @@ from dulwich.objects import ( Tag, ObjectID, ) +from dulwich.pack import ObjectContainer from dulwich.file import ( GitFile, ensure_dir_exists, @@ -1150,8 +1151,10 @@ def read_info_refs(f): return ret -def write_info_refs(refs, store): +def write_info_refs(refs, store: ObjectContainer): """Generate info refs.""" + # Avoid recursive import :( + from dulwich.object_store import peel_sha for name, sha in sorted(refs.items()): # get_refs() includes HEAD as a special case, but we don't want to # advertise it @@ -1161,7 +1164,7 @@ def write_info_refs(refs, store): o = store[sha] except KeyError: continue - peeled = store.peel_sha(sha) + peeled = peel_sha(store, sha) yield o.id + b"\t" + name + b"\n" if o.id != peeled.id: yield peeled.id + b"\t" + name + ANNOTATED_TAG_SUFFIX + b"\n" blob - f8ccfeedd838706b5d182b6eb7b18de95abe58f3 blob + 53d889c29e74271cc59631a4ae5bab62f8254bb0 --- dulwich/repo.py +++ dulwich/repo.py @@ -72,6 +72,7 @@ from dulwich.object_store import ( MemoryObjectStore, BaseObjectStore, ObjectStoreGraphWalker, + peel_sha, ) from dulwich.objects import ( check_hexsha, @@ -757,7 +758,7 @@ class BaseRepo: cached = self.refs.get_peeled(ref) if cached is not None: return cached - return self.object_store.peel_sha(self.refs[ref]).id + return peel_sha(self.object_store, self.refs[ref]).id def get_walker(self, include: Optional[List[bytes]] = None, *args, **kwargs): blob - 43b23fe1012a34267186a6935608f68b40cfd746 blob + 53376d3a5859dbed1a819b3d1f69ec089c1d1b01 --- dulwich/server.py +++ dulwich/server.py @@ -47,7 +47,7 @@ import os import socket import sys import time -from typing import List, Tuple, Dict, Optional, Iterable +from typing import List, Tuple, Dict, Optional, Iterable, Set import zlib import socketserver @@ -66,9 +66,13 @@ from dulwich import log_utils from dulwich.objects import ( Commit, valid_hexsha, +) +from dulwich.object_store import ( + peel_sha, ) from dulwich.pack import ( write_pack_objects, + ObjectContainer, ) from dulwich.protocol import ( BufferedPktLineWriter, @@ -456,7 +460,7 @@ def _split_proto_line(line, allowed): raise GitProtocolError("Received invalid line from client: %r" % line) -def _find_shallow(store, heads, depth): +def _find_shallow(store: ObjectContainer, heads, depth): """Find shallow commits according to a given depth. Args: @@ -468,7 +472,7 @@ def _find_shallow(store, heads, depth): considered shallow and unshallow according to the arguments. Note that these sets may overlap if a commit is reachable along multiple paths. 
""" - parents = {} + parents: Dict[bytes, List[bytes]] = {} def get_parents(sha): result = parents.get(sha, None) @@ -479,7 +483,7 @@ def _find_shallow(store, heads, depth): todo = [] # stack of (sha, depth) for head_sha in heads: - obj = store.peel_sha(head_sha) + obj = peel_sha(store, head_sha) if isinstance(obj, Commit): todo.append((obj.id, 1)) @@ -497,7 +501,7 @@ def _find_shallow(store, heads, depth): return shallow, not_shallow -def _want_satisfied(store, haves, want, earliest): +def _want_satisfied(store: ObjectContainer, haves, want, earliest): o = store[want] pending = collections.deque([o]) known = {want} @@ -505,7 +509,7 @@ def _want_satisfied(store, haves, want, earliest): commit = pending.popleft() if commit.id in haves: return True - if commit.type_name != b"commit": + if not isinstance(commit, Commit): # non-commit wants are assumed to be satisfied continue for parent in commit.parents: @@ -513,13 +517,14 @@ def _want_satisfied(store, haves, want, earliest): continue known.add(parent) parent_obj = store[parent] + assert isinstance(parent_obj, Commit) # TODO: handle parents with later commit times than children if parent_obj.commit_time >= earliest: pending.append(parent_obj) return False -def _all_wants_satisfied(store, haves, wants): +def _all_wants_satisfied(store: ObjectContainer, haves, wants): """Check whether all the current wants are satisfied by a set of haves. Args: @@ -531,7 +536,8 @@ def _all_wants_satisfied(store, haves, wants): """ haves = set(haves) if haves: - earliest = min([store[h].commit_time for h in haves]) + have_objs = [store[h] for h in haves] + earliest = min([h.commit_time for h in have_objs if isinstance(h, Commit)]) else: earliest = 0 for want in wants: @@ -555,20 +561,20 @@ class _ProtocolGraphWalker: any calls to next() or ack() are made. 
""" - def __init__(self, handler, object_store, get_peeled, get_symrefs): + def __init__(self, handler, object_store: ObjectContainer, get_peeled, get_symrefs): self.handler = handler - self.store = object_store + self.store: ObjectContainer = object_store self.get_peeled = get_peeled self.get_symrefs = get_symrefs self.proto = handler.proto self.stateless_rpc = handler.stateless_rpc self.advertise_refs = handler.advertise_refs - self._wants = [] - self.shallow = set() - self.client_shallow = set() - self.unshallow = set() + self._wants: List[bytes] = [] + self.shallow: Set[bytes] = set() + self.client_shallow: Set[bytes] = set() + self.unshallow: Set[bytes] = set() self._cached = False - self._cache = [] + self._cache: List[bytes] = [] self._cache_index = 0 self._impl = None @@ -1104,7 +1110,7 @@ class UploadArchiveHandler(Handler): prefix = b"" format = "tar" i = 0 - store = self.repo.object_store + store: ObjectContainer = self.repo.object_store while i < len(arguments): argument = arguments[i] if argument == b"--prefix": blob - dc98d8792450299ff187445bbf197eda758a6580 blob + 5bc44c27cdd444437b8d31b0fba3b9b086b65ed5 --- dulwich/submodule.py +++ dulwich/submodule.py @@ -22,6 +22,7 @@ """ from typing import Iterator, Tuple +from .object_store import iter_tree_contents from .objects import S_ISGITLINK @@ -35,6 +36,6 @@ def iter_cached_submodules(store, root_tree_id: bytes) Returns: Iterator over over (path, sha) tuples """ - for entry in store.iter_tree_contents(root_tree_id): + for entry in iter_tree_contents(store, root_tree_id): if S_ISGITLINK(entry.mode): yield entry.path, entry.sha blob - be067b5110c14e61c9bd63d26e469b5346930641 blob + e4f8c537dfcdf5f82ca0d9786cc77897e55fda70 --- dulwich/tests/test_object_store.py +++ dulwich/tests/test_object_store.py @@ -51,6 +51,8 @@ from dulwich.object_store import ( OverlayObjectStore, ObjectStoreGraphWalker, commit_tree_changes, + iter_tree_contents, + peel_sha, read_packs_file, tree_lookup_path, ) @@ -219,7 +221,7 @@ class ObjectStoreTests: tree_id = commit_tree(self.store, blobs) self.assertEqual( [TreeEntry(p, m, h) for (p, h, m) in blobs], - list(self.store.iter_tree_contents(tree_id)), + list(iter_tree_contents(self.store, tree_id)), ) def test_iter_tree_contents_include_trees(self): @@ -247,7 +249,7 @@ class ObjectStoreTests: TreeEntry(b"ad/bd", 0o040000, tree_bd.id), TreeEntry(b"ad/bd/c", 0o100755, blob_c.id), ] - actual = self.store.iter_tree_contents(tree_id, include_trees=True) + actual = iter_tree_contents(self.store, tree_id, include_trees=True) self.assertEqual(expected, list(actual)) def make_tag(self, name, obj): @@ -261,7 +263,7 @@ class ObjectStoreTests: tag2 = self.make_tag(b"2", testobject) tag3 = self.make_tag(b"3", testobject) for obj in [testobject, tag1, tag2, tag3]: - self.assertEqual(testobject, self.store.peel_sha(obj.id)) + self.assertEqual(testobject, peel_sha(self.store, obj.id)) def test_get_raw(self): self.store.add_object(testobject)