From 82c220152975c84c379f849b32ceac24b52c2c62 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 17 Feb 2026 15:18:51 +0000 Subject: [PATCH] Enhance large DL support for retries and custom UA --- CHANGELOG.md | 4 ++++ src/zimscraperlib/download.py | 37 ++++++++++++++++++++------------- tests/download/test_download.py | 22 ++++++++++++++++++++ 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5fac8f..dafb7c7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Add support for custom number of retries and user-agent in save_large_file (#278) + ### Fixed - Add proper typing @overload to `zimscraperlib.image.optimize_xxx` methods (#273) diff --git a/src/zimscraperlib/download.py b/src/zimscraperlib/download.py index 7b08e0e..f775b41 100644 --- a/src/zimscraperlib/download.py +++ b/src/zimscraperlib/download.py @@ -121,21 +121,30 @@ class BestMp4(YoutubeConfig): } -def save_large_file(url: str, fpath: pathlib.Path) -> None: - """download a binary file from its URL, using wget""" +def save_large_file( + url: str, fpath: pathlib.Path, retries: int = 5, user_agent: str | None = None +) -> None: + """download a binary file from its URL, using wget + + Arguments - + url: + """ + command = [ + "/usr/bin/env", + "wget", + "-t", + f"{retries}", + "--retry-connrefused", + "--random-wait", + "-O", + str(fpath), + "-c", + url, + ] + if user_agent: + command += ["-U", user_agent] subprocess.run( - [ - "/usr/bin/env", - "wget", - "-t", - "5", - "--retry-connrefused", - "--random-wait", - "-O", - str(fpath), - "-c", - url, - ], + command, check=True, ) diff --git a/tests/download/test_download.py b/tests/download/test_download.py index 3c47357..3a1bf8e 100644 --- a/tests/download/test_download.py +++ b/tests/download/test_download.py @@ -195,6 +195,28 @@ def test_large_download_https(tmp_path: pathlib.Path, valid_https_url: str): assert_downloaded_file(valid_https_url, dest_file) +@pytest.mark.slow +def test_large_download_https_custom_retry( + tmp_path: pathlib.Path, valid_https_url: str +): + dest_file = tmp_path / "favicon.ico" + save_large_file(valid_https_url, dest_file, 1) + assert_downloaded_file(valid_https_url, dest_file) + + +@pytest.mark.slow +def test_large_download_https_custom_ua(tmp_path: pathlib.Path, valid_https_url: str): + dest_file = tmp_path / "favicon.ico" + save_large_file( + valid_https_url, + dest_file, + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36", + ) + assert_downloaded_file(valid_https_url, dest_file) + + @pytest.mark.slow @pytest.mark.parametrize( "url,video_id",