commit b7d16897c24c7d5a0e28d952b027401b37417f92
from: Antoine Lambert
date: Thu Feb 22 13:01:01 2024 UTC

dumb: Synchronize fetch_pack behavior with smart loader

As with the smart git loader, restrict the maximum size of the pack file
to download.

Move the code that writes pack data bytes and checks their size into a
utility class to avoid code duplication.

Add missing tests covering the cases where the pack size limit is reached.
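A minimal sketch (not part of the commit) of the size check now shared by both
loaders, assuming the PackWriter class added to swh/loader/git/utils.py further
below; the origin URL is a placeholder:

    import logging
    from tempfile import SpooledTemporaryFile

    from swh.loader.git.utils import PackWriter

    buffer = SpooledTemporaryFile(max_size=1024)
    writer = PackWriter(
        pack_buffer=buffer,
        size_limit=10,  # deliberately tiny limit, as in the new tests
        origin_url="https://example.org/repo.git",  # placeholder origin
        fetch_pack_logger=logging.getLogger("fetch_pack"),
    )
    writer.write(b"small")          # 5 bytes so far: forwarded to the buffer
    writer.write(b"way too large")  # would reach 18 bytes: raises IOError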
commit - 64ac020485d0968eeeff96292515aa8bb2d858c7
commit + b7d16897c24c7d5a0e28d952b027401b37417f92
blob - 2a3180fb40a74e28797cd4bb48ca91fd79561f01
blob + 6e3006e035fa06720e40ef205e8e7ef151f1863e
--- swh/loader/git/dumb.py
+++ swh/loader/git/dumb.py
@@ -11,7 +11,17 @@ import logging
 import stat
 import struct
 from tempfile import SpooledTemporaryFile
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Set, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Protocol,
+    Set,
+    cast,
+)
 import urllib.parse
 
 from dulwich.errors import NotGitRepository
@@ -21,14 +31,20 @@ import requests
 from tenacity.before_sleep import before_sleep_log
 
 from swh.core.retry import http_retry
-from swh.loader.git.utils import HexBytes
+from swh.loader.git.utils import HexBytes, PackWriter
 
 if TYPE_CHECKING:
     from .loader import RepoRepresentation
 
 logger = logging.getLogger(__name__)
 
+fetch_pack_logger = logger.getChild("fetch_pack")
+
+
+class BytesWriter(Protocol):
+    def write(self, data: bytes):
+        ...
+
 
 def requests_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
     """Inject User-Agent header in the requests kwargs"""
     ret = copy.deepcopy(kwargs)
@@ -91,12 +107,14 @@ class GitObjectsFetcher:
         self,
         repo_url: str,
         base_repo: RepoRepresentation,
+        pack_size_limit: int,
         requests_extra_kwargs: Dict[str, Any] = {},
     ):
         self._session = requests.Session()
         self.requests_extra_kwargs = requests_extra_kwargs
         self.repo_url = repo_url
         self.base_repo = base_repo
+        self.pack_size_limit = pack_size_limit
         self.objects: Dict[bytes, Set[bytes]] = defaultdict(set)
         self.refs = self._get_refs()
         self.head = self._get_head() if self.refs else {}
@@ -153,8 +171,16 @@ class GitObjectsFetcher:
         )
         response.raise_for_status()
         buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
+        bytes_writer: BytesWriter = buffer
+        if path.startswith("objects/pack/") and path.endswith(".pack"):
+            bytes_writer = PackWriter(
+                pack_buffer=buffer,
+                size_limit=self.pack_size_limit,
+                origin_url=self.repo_url,
+                fetch_pack_logger=fetch_pack_logger,
+            )
         for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):
-            buffer.write(chunk)
+            bytes_writer.write(chunk)
         buffer.flush()
         buffer.seek(0)
         return buffer
blob - 25713898777ac2449410cc2a0203ab2175175064
blob + 1a5d8d1c87bccb651ce57639a1c1564bcf757451
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2022 The Software Heritage developers
+# Copyright (C) 2016-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -59,17 +59,14 @@ from swh.storage.interface import StorageInterface
 from . import converters, dumb, utils
 from .base import BaseGitLoader
-from .utils import HexBytes
+from .utils import LOGGING_INTERVAL, HexBytes, PackWriter
 
 logger = logging.getLogger(__name__)
 heads_logger = logger.getChild("refs")
 remote_logger = logger.getChild("remote")
 fetch_pack_logger = logger.getChild("fetch_pack")
 
-# How often to log messages for long-running operations, in seconds
-LOGGING_INTERVAL = 30
-
 
 def split_lines_and_remainder(buf: bytes) -> Tuple[List[bytes], bytes]:
     """Get newline-terminated (``b"\\r"`` or ``b"\\n"``) lines from `buf`, and
     the beginning of the last line if it isn't terminated."""
@@ -274,38 +271,18 @@ class GitLoader(BaseGitLoader):
 
         logger.debug("Client %s to fetch pack at %s", client, path)
 
-        size_limit = self.pack_size_bytes
-
-        last_line_logged = time.monotonic()
-
-        def do_pack(data: bytes) -> None:
-            nonlocal last_line_logged
-
-            cur_size = pack_buffer.tell()
-            would_write = len(data)
-            fetched = cur_size + would_write
-            if fetched > size_limit:
-                raise IOError(
-                    f"Pack file too big for repository {origin_url}, "
-                    f"limit is {size_limit} bytes, current size is {cur_size}, "
-                    f"would write {would_write}"
-                )
-
-            if time.monotonic() > last_line_logged + LOGGING_INTERVAL:
-                fetch_pack_logger.info(
-                    "Fetched %s packfile bytes so far (%.2f%% of configured limit)",
-                    fetched,
-                    100 * fetched / size_limit,
-                )
-                last_line_logged = time.monotonic()
-
-            pack_buffer.write(data)
+        pack_writer = PackWriter(
+            pack_buffer=pack_buffer,
+            size_limit=self.pack_size_bytes,
+            origin_url=origin_url,
+            fetch_pack_logger=fetch_pack_logger,
+        )
 
         pack_result = client.fetch_pack(
             path,
             base_repo.determine_wants,
             base_repo.graph_walker(),
-            do_pack,
+            pack_writer.write,
             progress=do_activity,
         )
@@ -490,8 +467,9 @@ class GitLoader(BaseGitLoader):
         )
         if self.dumb:
             self.dumb_fetcher = dumb.GitObjectsFetcher(
-                self.origin.url,
-                base_repo,
+                repo_url=self.origin.url,
+                base_repo=base_repo,
+                pack_size_limit=self.pack_size_bytes,
                 requests_extra_kwargs=self.requests_extra_kwargs,
             )
            self.dumb_fetcher.fetch_object_ids()
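The dumb loader hunk above leans on structural typing: BytesWriter is a
typing.Protocol, so the chunk-download loop keeps a single call site and writes
either to the SpooledTemporaryFile directly or through the size-checking
PackWriter. A sketch of that idea, with a hypothetical drain() helper standing
in for the response.iter_content loop of GitObjectsFetcher:

    from tempfile import SpooledTemporaryFile
    from typing import Iterable, Protocol


    class BytesWriter(Protocol):
        def write(self, data: bytes):
            ...


    def drain(chunks: Iterable[bytes], sink: BytesWriter) -> None:
        # Hypothetical helper: it only needs the protocol, not a concrete type.
        for chunk in chunks:
            sink.write(chunk)


    buffer = SpooledTemporaryFile(max_size=1024)
    drain([b"abc", b"def"], buffer)   # plain buffer: no size check
    # drain(chunks, PackWriter(...))  # same call site, limit enforced while streaming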
blob - 25c460ee36f0f78b3c27962a9d05df9edb660190
blob + 1858a71402a362f799ebe7a8390325d4dca7a6f2
--- swh/loader/git/tests/test_loader.py
+++ swh/loader/git/tests/test_loader.py
@@ -23,6 +23,7 @@ import dulwich.repo
 from dulwich.tests.utils import build_pack
 import pytest
 from requests import HTTPError
+import sentry_sdk
 
 from swh.loader.git import converters, dumb
 from swh.loader.git.loader import FetchPackReturn, GitLoader, split_lines_and_remainder
@@ -435,6 +436,17 @@ class TestGitLoader(FullGitLoaderTests, CommonGitLoade
             call(statsd_metric, "c", 1, {"type": "revision", "result": "found"}, 1),
         ]
 
+    def test_load_pack_size_limit(self, sentry_events):
+        # set max pack size to a really small value
+        self.loader.pack_size_bytes = 10
+        res = self.loader.load()
+        assert res == {"status": "failed"}
+        assert sentry_events
+        assert sentry_events[0]["level"] == "error"
+        assert sentry_events[0]["exception"]["values"][0]["value"].startswith(
+            "Pack file too big for repository"
+        )
+
 
 class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
     """Mostly the same loading scenario but with a ``parent_origin`` different from the
@@ -1014,7 +1026,22 @@ class TestDumbGitLoaderWithPack(DumbGitLoaderTestBase)
             assert dumb.check_protocol(self.repo_url)
         sleep.assert_has_calls([mocker.call(1)])
 
+    def test_load_pack_size_limit(self, sentry_events):
+        # without that hack, the following error is raised when running test
+        # AttributeError: 'TestTransport' object has no attribute 'parsed_dsn'
+        sentry_sdk.Hub.current.client.integrations.pop("stdlib", None)
+        # set max pack size to a really small value
+        self.loader.pack_size_bytes = 10
+        res = self.loader.load()
+        assert res == {"status": "failed"}
+        assert sentry_events
+        assert sentry_events[0]["level"] == "error"
+        assert sentry_events[0]["exception"]["values"][0]["value"].startswith(
+            "Pack file too big for repository"
+        )
+
+
 class TestDumbGitLoaderWithoutPack(DumbGitLoaderTestBase):
     @classmethod
     def setup_class(cls):
blob - 0c3e7983ba2677043e5afbd95aedf5795086da1c
blob + d6b77dd2a8d35f6f38a24835c1d64c40e0bc69f4
--- swh/loader/git/utils.py
+++ swh/loader/git/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2023 The Software Heritage developers
+# Copyright (C) 2017-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,6 +11,7 @@ import logging
 import os
 import shutil
 import tempfile
+import time
 from typing import Dict, Mapping, NewType, Optional, Union
 
 from dulwich.client import HTTPUnauthorized
@@ -161,3 +162,46 @@ def raise_not_found_repository():
         raise NotFound(e)
     # otherwise transmit the error
     raise
+
+
+# How often to log messages for long-running operations, in seconds
+LOGGING_INTERVAL = 30
+
+
+class PackWriter:
+    """Helper class to abort git loading if pack file currently downloaded
+    has a size in bytes that exceeds a given threshold."""
+
+    def __init__(
+        self,
+        pack_buffer: tempfile.SpooledTemporaryFile,
+        size_limit: int,
+        origin_url: str,
+        fetch_pack_logger: logging.Logger,
+    ):
+        self.pack_buffer = pack_buffer
+        self.size_limit = size_limit
+        self.origin_url = origin_url
+        self.fetch_pack_logger = fetch_pack_logger
+        self.last_time_logged = time.monotonic()
+
+    def write(self, data: bytes):
+        cur_size = self.pack_buffer.tell()
+        would_write = len(data)
+        fetched = cur_size + would_write
+        if fetched > self.size_limit:
+            raise IOError(
+                f"Pack file too big for repository {self.origin_url}, "
+                f"limit is {self.size_limit} bytes, current size is {cur_size}, "
+                f"would write {would_write}"
+            )
+
+        if time.monotonic() > self.last_time_logged + LOGGING_INTERVAL:
+            self.fetch_pack_logger.info(
+                "Fetched %s packfile bytes so far (%.2f%% of configured limit)",
+                fetched,
+                100 * fetched / self.size_limit,
+            )
+            self.last_time_logged = time.monotonic()
+
+        self.pack_buffer.write(data)
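Besides aborting oversized downloads, PackWriter keeps the throttled progress
logging that previously lived in the do_pack closure: a log line is emitted at
most once every LOGGING_INTERVAL (30) seconds. A rough sketch, not part of the
commit, assuming the class above; last_time_logged is reset here only so the
next write logs immediately:

    import logging
    import time
    from tempfile import SpooledTemporaryFile

    from swh.loader.git.utils import LOGGING_INTERVAL, PackWriter

    logging.basicConfig(level=logging.INFO)
    writer = PackWriter(
        pack_buffer=SpooledTemporaryFile(max_size=1024),
        size_limit=10_000,
        origin_url="https://example.org/repo.git",  # placeholder origin
        fetch_pack_logger=logging.getLogger("fetch_pack"),
    )
    # Pretend the last log line was emitted long ago so the next write logs now.
    writer.last_time_logged = time.monotonic() - LOGGING_INTERVAL - 1
    writer.write(b"chunk")
    # -> "Fetched 5 packfile bytes so far (0.05% of configured limit)"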