Commit Diff


commit - 64ac020485d0968eeeff96292515aa8bb2d858c7
commit + b7d16897c24c7d5a0e28d952b027401b37417f92
blob - 2a3180fb40a74e28797cd4bb48ca91fd79561f01
blob + 6e3006e035fa06720e40ef205e8e7ef151f1863e
--- swh/loader/git/dumb.py
+++ swh/loader/git/dumb.py
@@ -11,7 +11,17 @@ import logging
 import stat
 import struct
 from tempfile import SpooledTemporaryFile
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Set, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Protocol,
+    Set,
+    cast,
+)
 import urllib.parse
 
 from dulwich.errors import NotGitRepository
@@ -21,14 +31,20 @@ import requests
 from tenacity.before_sleep import before_sleep_log
 
 from swh.core.retry import http_retry
-from swh.loader.git.utils import HexBytes
+from swh.loader.git.utils import HexBytes, PackWriter
 
 if TYPE_CHECKING:
     from .loader import RepoRepresentation
 
 logger = logging.getLogger(__name__)
+fetch_pack_logger = logger.getChild("fetch_pack")
 
 
+class BytesWriter(Protocol):
+    def write(self, data: bytes):
+        ...
+
+
 def requests_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
     """Inject User-Agent header in the requests kwargs"""
     ret = copy.deepcopy(kwargs)
@@ -91,12 +107,14 @@ class GitObjectsFetcher:
         self,
         repo_url: str,
         base_repo: RepoRepresentation,
+        pack_size_limit: int,
         requests_extra_kwargs: Dict[str, Any] = {},
     ):
         self._session = requests.Session()
         self.requests_extra_kwargs = requests_extra_kwargs
         self.repo_url = repo_url
         self.base_repo = base_repo
+        self.pack_size_limit = pack_size_limit
         self.objects: Dict[bytes, Set[bytes]] = defaultdict(set)
         self.refs = self._get_refs()
         self.head = self._get_head() if self.refs else {}
@@ -153,8 +171,16 @@ class GitObjectsFetcher:
         )
         response.raise_for_status()
         buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024)
+        bytes_writer: BytesWriter = buffer
+        if path.startswith("objects/pack/") and path.endswith(".pack"):
+            bytes_writer = PackWriter(
+                pack_buffer=buffer,
+                size_limit=self.pack_size_limit,
+                origin_url=self.repo_url,
+                fetch_pack_logger=fetch_pack_logger,
+            )
         for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):
-            buffer.write(chunk)
+            bytes_writer.write(chunk)
         buffer.flush()
         buffer.seek(0)
         return buffer
blob - 25713898777ac2449410cc2a0203ab2175175064
blob + 1a5d8d1c87bccb651ce57639a1c1564bcf757451
--- swh/loader/git/loader.py
+++ swh/loader/git/loader.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2016-2022 The Software Heritage developers
+# Copyright (C) 2016-2024 The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -59,17 +59,14 @@ from swh.storage.interface import StorageInterface
 
 from . import converters, dumb, utils
 from .base import BaseGitLoader
-from .utils import HexBytes
+from .utils import LOGGING_INTERVAL, HexBytes, PackWriter
 
 logger = logging.getLogger(__name__)
 heads_logger = logger.getChild("refs")
 remote_logger = logger.getChild("remote")
 fetch_pack_logger = logger.getChild("fetch_pack")
 
-# How often to log messages for long-running operations, in seconds
-LOGGING_INTERVAL = 30
 
-
 def split_lines_and_remainder(buf: bytes) -> Tuple[List[bytes], bytes]:
     """Get newline-terminated (``b"\\r"`` or ``b"\\n"``) lines from `buf`,
     and the beginning of the last line if it isn't terminated."""
@@ -274,38 +271,18 @@ class GitLoader(BaseGitLoader):
 
         logger.debug("Client %s to fetch pack at %s", client, path)
 
-        size_limit = self.pack_size_bytes
-
-        last_line_logged = time.monotonic()
-
-        def do_pack(data: bytes) -> None:
-            nonlocal last_line_logged
-
-            cur_size = pack_buffer.tell()
-            would_write = len(data)
-            fetched = cur_size + would_write
-            if fetched > size_limit:
-                raise IOError(
-                    f"Pack file too big for repository {origin_url}, "
-                    f"limit is {size_limit} bytes, current size is {cur_size}, "
-                    f"would write {would_write}"
-                )
-
-            if time.monotonic() > last_line_logged + LOGGING_INTERVAL:
-                fetch_pack_logger.info(
-                    "Fetched %s packfile bytes so far (%.2f%% of configured limit)",
-                    fetched,
-                    100 * fetched / size_limit,
-                )
-                last_line_logged = time.monotonic()
-
-            pack_buffer.write(data)
+        pack_writer = PackWriter(
+            pack_buffer=pack_buffer,
+            size_limit=self.pack_size_bytes,
+            origin_url=origin_url,
+            fetch_pack_logger=fetch_pack_logger,
+        )
 
         pack_result = client.fetch_pack(
             path,
             base_repo.determine_wants,
             base_repo.graph_walker(),
-            do_pack,
+            pack_writer.write,
             progress=do_activity,
         )
 
@@ -490,8 +467,9 @@ class GitLoader(BaseGitLoader):
         )
         if self.dumb:
             self.dumb_fetcher = dumb.GitObjectsFetcher(
-                self.origin.url,
-                base_repo,
+                repo_url=self.origin.url,
+                base_repo=base_repo,
+                pack_size_limit=self.pack_size_bytes,
                 requests_extra_kwargs=self.requests_extra_kwargs,
             )
             self.dumb_fetcher.fetch_object_ids()
blob - 25c460ee36f0f78b3c27962a9d05df9edb660190
blob + 1858a71402a362f799ebe7a8390325d4dca7a6f2
--- swh/loader/git/tests/test_loader.py
+++ swh/loader/git/tests/test_loader.py
@@ -23,6 +23,7 @@ import dulwich.repo
 from dulwich.tests.utils import build_pack
 import pytest
 from requests import HTTPError
+import sentry_sdk
 
 from swh.loader.git import converters, dumb
 from swh.loader.git.loader import FetchPackReturn, GitLoader, split_lines_and_remainder
@@ -435,6 +436,17 @@ class TestGitLoader(FullGitLoaderTests, CommonGitLoaderNotFound):
                 call(statsd_metric, "c", 1, {"type": "revision", "result": "found"}, 1),
             ]
 
+    def test_load_pack_size_limit(self, sentry_events):
+        # set max pack size to a really small value
+        self.loader.pack_size_bytes = 10
+        res = self.loader.load()
+        assert res == {"status": "failed"}
+        assert sentry_events
+        assert sentry_events[0]["level"] == "error"
+        assert sentry_events[0]["exception"]["values"][0]["value"].startswith(
+            "Pack file too big for repository"
+        )
+
 
 class TestGitLoader2(FullGitLoaderTests, CommonGitLoaderNotFound):
     """Mostly the same loading scenario but with a ``parent_origin`` different from the
@@ -1014,7 +1026,22 @@ class TestDumbGitLoaderWithPack(DumbGitLoaderTestBase):
         assert dumb.check_protocol(self.repo_url)
         sleep.assert_has_calls([mocker.call(1)])
 
+    def test_load_pack_size_limit(self, sentry_events):
+        # without that hack, the following error is raised when running test
+        # AttributeError: 'TestTransport' object has no attribute 'parsed_dsn'
+        sentry_sdk.Hub.current.client.integrations.pop("stdlib", None)
 
+        # set max pack size to a really small value
+        self.loader.pack_size_bytes = 10
+        res = self.loader.load()
+        assert res == {"status": "failed"}
+        assert sentry_events
+        assert sentry_events[0]["level"] == "error"
+        assert sentry_events[0]["exception"]["values"][0]["value"].startswith(
+            "Pack file too big for repository"
+        )
+
+
 class TestDumbGitLoaderWithoutPack(DumbGitLoaderTestBase):
     @classmethod
     def setup_class(cls):
blob - 0c3e7983ba2677043e5afbd95aedf5795086da1c
blob + d6b77dd2a8d35f6f38a24835c1d64c40e0bc69f4
--- swh/loader/git/utils.py
+++ swh/loader/git/utils.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2017-2023  The Software Heritage developers
+# Copyright (C) 2017-2024  The Software Heritage developers
 # See the AUTHORS file at the top-level directory of this distribution
 # License: GNU General Public License version 3, or any later version
 # See top-level LICENSE file for more information
@@ -11,6 +11,7 @@ import logging
 import os
 import shutil
 import tempfile
+import time
 from typing import Dict, Mapping, NewType, Optional, Union
 
 from dulwich.client import HTTPUnauthorized
@@ -161,3 +162,46 @@ def raise_not_found_repository():
                 raise NotFound(e)
         # otherwise transmit the error
         raise
+
+
+# How often to log messages for long-running operations, in seconds
+LOGGING_INTERVAL = 30
+
+
+class PackWriter:
+    """Helper class to abort git loading if pack file currently downloaded
+    has a size in bytes that exceeds a given threshold."""
+
+    def __init__(
+        self,
+        pack_buffer: tempfile.SpooledTemporaryFile,
+        size_limit: int,
+        origin_url: str,
+        fetch_pack_logger: logging.Logger,
+    ):
+        self.pack_buffer = pack_buffer
+        self.size_limit = size_limit
+        self.origin_url = origin_url
+        self.fetch_pack_logger = fetch_pack_logger
+        self.last_time_logged = time.monotonic()
+
+    def write(self, data: bytes):
+        cur_size = self.pack_buffer.tell()
+        would_write = len(data)
+        fetched = cur_size + would_write
+        if fetched > self.size_limit:
+            raise IOError(
+                f"Pack file too big for repository {self.origin_url}, "
+                f"limit is {self.size_limit} bytes, current size is {cur_size}, "
+                f"would write {would_write}"
+            )
+
+        if time.monotonic() > self.last_time_logged + LOGGING_INTERVAL:
+            self.fetch_pack_logger.info(
+                "Fetched %s packfile bytes so far (%.2f%% of configured limit)",
+                fetched,
+                100 * fetched / self.size_limit,
+            )
+            self.last_time_logged = time.monotonic()
+
+        self.pack_buffer.write(data)