commit 64ac020485d0968eeeff96292515aa8bb2d858c7 from: Antoine Lambert date: Thu Feb 22 10:30:05 2024 UTC dumb: Fix streaming of HTTP responses When using the requests library to perform HTTP requests, if responses need to be streamed, the stream parameter must be set to True to ensure content is downloaded in chunks. Previously, a whole HTTP response was cached in memory, which could lead to OOM errors when dealing with a repository with large pack files. commit - 038c094d28759500e77961be2c30a9d0d5b3df84 commit + 64ac020485d0968eeeff96292515aa8bb2d858c7 blob - 36fb900810aa976c924adaaa9923f6a2f091acfb blob + 2a3180fb40a74e28797cd4bb48ca91fd79561f01 --- swh/loader/git/dumb.py +++ swh/loader/git/dumb.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021-2023 The Software Heritage developers +# Copyright (C) 2021-2024 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -148,7 +148,9 @@ class GitObjectsFetcher: def _http_get(self, path: str) -> SpooledTemporaryFile: url = urllib.parse.urljoin(self.repo_url.rstrip("/") + "/", path) logger.debug("Fetching %s", url) - response = self._session.get(url, **requests_kwargs(self.requests_extra_kwargs)) + response = self._session.get( + url, stream=True, **requests_kwargs(self.requests_extra_kwargs) + ) response.raise_for_status() buffer = SpooledTemporaryFile(max_size=100 * 1024 * 1024) for chunk in response.iter_content(chunk_size=10 * 1024 * 1024):