commit - 64ac020485d0968eeeff96292515aa8bb2d858c7
commit + b7d16897c24c7d5a0e28d952b027401b37417f92
blob - 2a3180fb40a74e28797cd4bb48ca91fd79561f01
blob + 6e3006e035fa06720e40ef205e8e7ef151f1863e
--- swh/loader/git/dumb.py
+++ swh/loader/git/dumb.py
import stat
import struct
from tempfile import SpooledTemporaryFile
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Set, cast
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Dict,
+ Iterable,
+ List,
+ Protocol,
+ Set,
+ cast,
+)
import urllib.parse
from dulwich.errors import NotGitRepository
from tenacity.before_sleep import before_sleep_log
from swh.core.retry import http_retry
-from swh.loader.git.utils import HexBytes
+from swh.loader.git.utils import HexBytes, PackWriter
if TYPE_CHECKING:
from .loader import RepoRepresentation
logger = logging.getLogger(__name__)
+fetch_pack_logger = logger.getChild("fetch_pack")
+class BytesWriter(Protocol):
+ def write(self, data: bytes):
+ ...
+
+
def requests_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
"""Inject User-Agent header in the requests kwargs"""
ret = copy.deepcopy(kwargs)
self,
repo_url: str,
base_repo: RepoRepresentation,
+ pack_size_limit: int,
requests_extra_kwargs: Dict[str, Any] = {},
):
self._session = requests.Session()
self.requests_extra_kwargs = requests_extra_kwargs
self.repo_url = repo_url
self.base_repo = base_repo
+ self.pack_size_limit = pack_size_limit
self.objects: Dict[bytes, Set[bytes]] = defaultdict(set)
self.refs = self._get_refs()
self.head = self._get_head() if self.refs else {}
)
response.raise_for_status()
buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
+ bytes_writer: BytesWriter = buffer
+ if path.startswith("objects/pack/") and path.endswith(".pack"):
+ bytes_writer = PackWriter(
+ pack_buffer=buffer,
+ size_limit=self.pack_size_limit,
+ origin_url=self.repo_url,
+ fetch_pack_logger=fetch_pack_logger,
+ )
for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):
- buffer.write(chunk)
+ bytes_writer.write(chunk)
buffer.flush()
buffer.seek(0)
return buffer
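
Aside (not part of the patch): the BytesWriter protocol above relies on typing.Protocol
structural typing, so the SpooledTemporaryFile buffer and the PackWriter wrapper are
interchangeable sinks as long as each exposes write(bytes). A minimal standalone sketch
of that idea; the CountingWriter class and the sample chunks are made up for illustration:

from tempfile import SpooledTemporaryFile
from typing import Iterable, Protocol


class BytesWriter(Protocol):
    def write(self, data: bytes):
        ...


class CountingWriter:
    """Made-up writer that only counts bytes, standing in for PackWriter."""

    def __init__(self) -> None:
        self.count = 0

    def write(self, data: bytes):
        self.count += len(data)


def copy_chunks(chunks: Iterable[bytes], bytes_writer: BytesWriter) -> None:
    # Both SpooledTemporaryFile and CountingWriter satisfy BytesWriter
    # structurally, without inheriting from it.
    for chunk in chunks:
        bytes_writer.write(chunk)


copy_chunks([b"abc", b"def"], SpooledTemporaryFile())
copy_chunks([b"abc", b"def"], CountingWriter())
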
blob - 25713898777ac2449410cc2a0203ab2175175064
blob + 1a5d8d1c87bccb651ce57639a1c1564bcf757451
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
-# Copyright (C) 2016-2022 The Software Heritage developers
+# Copyright (C) 2016-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from . import converters, dumb, utils
from .base import BaseGitLoader
-from .utils import HexBytes
+from .utils import LOGGING_INTERVAL, HexBytes, PackWriter
logger = logging.getLogger(__name__)
heads_logger = logger.getChild("refs")
remote_logger = logger.getChild("remote")
fetch_pack_logger = logger.getChild("fetch_pack")
-# How often to log messages for long-running operations, in seconds
-LOGGING_INTERVAL = 30
-
def split_lines_and_remainder(buf: bytes) -> Tuple[List[bytes], bytes]:
"""Get newline-terminated (``b"\\r"`` or ``b"\\n"``) lines from `buf`,
and the beginning of the last line if it isn't terminated."""
logger.debug("Client %s to fetch pack at %s", client, path)
- size_limit = self.pack_size_bytes
-
- last_line_logged = time.monotonic()
-
- def do_pack(data: bytes) -> None:
- nonlocal last_line_logged
-
- cur_size = pack_buffer.tell()
- would_write = len(data)
- fetched = cur_size + would_write
- if fetched > size_limit:
- raise IOError(
- f"Pack file too big for repository {origin_url}, "
- f"limit is {size_limit} bytes, current size is {cur_size}, "
- f"would write {would_write}"
- )
-
- if time.monotonic() > last_line_logged + LOGGING_INTERVAL:
- fetch_pack_logger.info(
- "Fetched %s packfile bytes so far (%.2f%% of configured limit)",
- fetched,
- 100 * fetched / size_limit,
- )
- last_line_logged = time.monotonic()
-
- pack_buffer.write(data)
+ pack_writer = PackWriter(
+ pack_buffer=pack_buffer,
+ size_limit=self.pack_size_bytes,
+ origin_url=origin_url,
+ fetch_pack_logger=fetch_pack_logger,
+ )
pack_result = client.fetch_pack(
path,
base_repo.determine_wants,
base_repo.graph_walker(),
- do_pack,
+ pack_writer.write,
progress=do_activity,
)
)
if self.dumb:
self.dumb_fetcher = dumb.GitObjectsFetcher(
- self.origin.url,
- base_repo,
+ repo_url=self.origin.url,
+ base_repo=base_repo,
+ pack_size_limit=self.pack_size_bytes,
requests_extra_kwargs=self.requests_extra_kwargs,
)
self.dumb_fetcher.fetch_object_ids()
blob - 25c460ee36f0f78b3c27962a9d05df9edb660190
blob + 1858a71402a362f799ebe7a8390325d4dca7a6f2
--- swh/loader/git/tests/test_loader.py
+++ swh/loader/git/tests/test_loader.py
from dulwich.tests.utils import build_pack
import pytest
from requests import HTTPError
+import sentry_sdk
from swh.loader.git import converters, dumb
from swh.loader.git.loader import FetchPackReturn, GitLoader, split_lines_and_remainder
call(statsd_metric, "c", 1, {"type": "revision", "result": "found"}, 1),
]
+ def test_load_pack_size_limit(self, sentry_events):
+ # set max pack size to a really small value
+ self.loader.pack_size_bytes = 10
+ res = self.loader.load()
+ assert res == {"status": "failed"}
+ assert sentry_events
+ assert sentry_events[0]["level"] == "error"
+ assert sentry_events[0]["exception"]["values"][0]["value"].startswith(
+ "Pack file too big for repository"
+ )
+
class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
"""Mostly the same loading scenario but with a ``parent_origin`` different from the
assert dumb.check_protocol(self.repo_url)
sleep.assert_has_calls([mocker.call(1)])
+ def test_load_pack_size_limit(self, sentry_events):
+ # without this hack, the following error is raised when running the test:
+ # AttributeError: 'TestTransport' object has no attribute 'parsed_dsn'
+ sentry_sdk.Hub.current.client.integrations.pop("stdlib", None)
+ # set max pack size to a really small value
+ self.loader.pack_size_bytes = 10
+ res = self.loader.load()
+ assert res == {"status": "failed"}
+ assert sentry_events
+ assert sentry_events[0]["level"] == "error"
+ assert sentry_events[0]["exception"]["values"][0]["value"].startswith(
+ "Pack file too big for repository"
+ )
+
+
class TestDumbGitLoaderWithoutPack(DumbGitLoaderTestBase):
@classmethod
def setup_class(cls):
blob - 0c3e7983ba2677043e5afbd95aedf5795086da1c
blob + d6b77dd2a8d35f6f38a24835c1d64c40e0bc69f4
--- swh/loader/git/utils.py
+++ swh/loader/git/utils.py
-# Copyright (C) 2017-2023 The Software Heritage developers
+# Copyright (C) 2017-2024 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
import shutil
import tempfile
+import time
from typing import Dict, Mapping, NewType, Optional, Union
from dulwich.client import HTTPUnauthorized
raise NotFound(e)
# otherwise transmit the error
raise
+
+
+# How often to log messages for long-running operations, in seconds
+LOGGING_INTERVAL = 30
+
+
+class PackWriter:
+ """Helper class to abort git loading if pack file currently downloaded
+ has a size in bytes that exceeds a given threshold."""
+
+ def __init__(
+ self,
+ pack_buffer: tempfile.SpooledTemporaryFile,
+ size_limit: int,
+ origin_url: str,
+ fetch_pack_logger: logging.Logger,
+ ):
+ self.pack_buffer = pack_buffer
+ self.size_limit = size_limit
+ self.origin_url = origin_url
+ self.fetch_pack_logger = fetch_pack_logger
+ self.last_time_logged = time.monotonic()
+
+ def write(self, data: bytes):
+ cur_size = self.pack_buffer.tell()
+ would_write = len(data)
+ fetched = cur_size + would_write
+ if fetched > self.size_limit:
+ raise IOError(
+ f"Pack file too big for repository {self.origin_url}, "
+ f"limit is {self.size_limit} bytes, current size is {cur_size}, "
+ f"would write {would_write}"
+ )
+
+ if time.monotonic() > self.last_time_logged + LOGGING_INTERVAL:
+ self.fetch_pack_logger.info(
+ "Fetched %s packfile bytes so far (%.2f%% of configured limit)",
+ fetched,
+ 100 * fetched / self.size_limit,
+ )
+ self.last_time_logged = time.monotonic()
+
+ self.pack_buffer.write(data)
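
Aside (not part of the patch): a minimal usage sketch of the new PackWriter, wired the
same way loader.py hands pack_writer.write to client.fetch_pack() as the pack data sink;
the origin URL is made up here, and the 10-byte limit mirrors the value used in the tests above:

import logging
from tempfile import SpooledTemporaryFile

from swh.loader.git.utils import PackWriter

pack_buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
pack_writer = PackWriter(
    pack_buffer=pack_buffer,
    size_limit=10,  # deliberately tiny, like pack_size_bytes in the tests
    origin_url="https://example.org/repo.git",  # made-up origin
    fetch_pack_logger=logging.getLogger("fetch_pack"),
)

try:
    # In loader.py this bound method is the chunk callback; here it is fed
    # a chunk directly.
    pack_writer.write(b"PACK" + b"\x00" * 20)
except IOError as exc:
    # Exceeding size_limit aborts the load; the tests assert this message.
    print(exc)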