From 347c6ee0475928ac10df9ef7b2abe6c0468ffbc1 Mon Sep 17 00:00:00 2001 From: Chandra Sirimala Date: Tue, 17 Mar 2026 14:23:01 +0000 Subject: [PATCH 1/3] feat(storage): support returning skipped items as UserWarning in download_many_to_path --- google/cloud/storage/transfer_manager.py | 33 ++++++--- tests/system/test_transfer_manager.py | 86 +++++++++++++++++++++++- tests/unit/test_transfer_manager.py | 84 ++++++++++++++++++++--- 3 files changed, 184 insertions(+), 19 deletions(-) diff --git a/google/cloud/storage/transfer_manager.py b/google/cloud/storage/transfer_manager.py index c655244b0..91119cc1d 100644 --- a/google/cloud/storage/transfer_manager.py +++ b/google/cloud/storage/transfer_manager.py @@ -805,43 +805,60 @@ def download_many_to_path( :raises: :exc:`concurrent.futures.TimeoutError` if deadline is exceeded. - :rtype: list + :rtype: List[None|Exception|UserWarning] :returns: A list of results corresponding to, in order, each item in the - input list. If an exception was received, it will be the result - for that operation. Otherwise, the return value from the successful - download method is used (which will be None). + input list. If an exception was received or a download was skipped + (e.g., due to existing file or path traversal), it will be the result + for that operation (as an Exception or UserWarning, respectively). + Otherwise, the result will be None for a successful download. """ + results = [None] * len(blob_names) blob_file_pairs = [] + indices_to_process = [] - for blob_name in blob_names: + for i, blob_name in enumerate(blob_names): full_blob_name = blob_name_prefix + blob_name resolved_path = _resolve_path(destination_directory, blob_name) if not resolved_path.parent.is_relative_to( Path(destination_directory).resolve() ): - warnings.warn( + msg = ( f"The blob {blob_name} will **NOT** be downloaded. " f"The resolved destination_directory - {resolved_path.parent} - is either invalid or " f"escapes user provided {Path(destination_directory).resolve()} . Please download this file separately using `download_to_filename`" ) + warnings.warn(msg) + results[i] = UserWarning(msg) continue resolved_path = str(resolved_path) + if skip_if_exists and os.path.isfile(resolved_path): + msg = f"The blob {blob_name} is skipped because destination file already exists" + results[i] = UserWarning(msg) + continue + if create_directories: directory, _ = os.path.split(resolved_path) os.makedirs(directory, exist_ok=True) blob_file_pairs.append((bucket.blob(full_blob_name), resolved_path)) + indices_to_process.append(i) - return download_many( + many_results = download_many( blob_file_pairs, download_kwargs=download_kwargs, deadline=deadline, raise_exception=raise_exception, worker_type=worker_type, max_workers=max_workers, - skip_if_exists=skip_if_exists, + skip_if_exists=False, ) + for meta_index, result in zip(indices_to_process, many_results): + results[meta_index] = result + + return results + + def download_chunks_concurrently( blob, diff --git a/tests/system/test_transfer_manager.py b/tests/system/test_transfer_manager.py index 844562c90..6bb0e03fd 100644 --- a/tests/system/test_transfer_manager.py +++ b/tests/system/test_transfer_manager.py @@ -187,8 +187,9 @@ def test_download_many_to_path_skips_download( [str(warning.message) for warning in w] ) - # 1 total - 1 skipped = 0 results - assert len(results) == 0 + # 1 total - 1 skipped = 1 result (containing Warning) + assert len(results) == 1 + assert isinstance(results[0], UserWarning) @pytest.mark.parametrize( @@ -266,6 +267,87 @@ def test_download_many_to_path_downloads_within_dest_dir( assert downloaded_contents == source_contents + +def test_download_many_to_path_mixed_results( + shared_bucket, file_data, blobs_to_delete +): + """ + Test download_many_to_path with successful downloads, skip_if_exists skips, and path traversal skips. + """ + PREFIX = "mixed_results/" + BLOBNAMES = [ + "success1.txt", + "success2.txt", + "exists.txt", + "../escape.txt" + ] + + FILE_BLOB_PAIRS = [ + ( + file_data["simple"]["path"], + shared_bucket.blob(PREFIX + name), + ) + for name in BLOBNAMES + ] + + results = transfer_manager.upload_many( + FILE_BLOB_PAIRS, + skip_if_exists=True, + deadline=DEADLINE, + ) + for result in results: + assert result is None + + blobs = list(shared_bucket.list_blobs(prefix=PREFIX)) + blobs_to_delete.extend(blobs) + assert len(blobs) == 4 + + # Actual Test + with tempfile.TemporaryDirectory() as tempdir: + existing_file_path = os.path.join(tempdir, "exists.txt") + with open(existing_file_path, "w") as f: + f.write("already here") + + import warnings + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + results = transfer_manager.download_many_to_path( + shared_bucket, + BLOBNAMES, + destination_directory=tempdir, + blob_name_prefix=PREFIX, + deadline=DEADLINE, + create_directories=True, + skip_if_exists=True, + ) + + assert len(results) == 4 + + path_traversal_warnings = [ + warning + for warning in w + if str(warning.message).startswith("The blob ") + and "will **NOT** be downloaded. The resolved destination_directory" + in str(warning.message) + ] + assert len(path_traversal_warnings) == 1, "---".join( + [str(warning.message) for warning in w] + ) + + assert results[0] is None + assert results[1] is None + assert isinstance(results[2], UserWarning) + assert "skipped because destination file already exists" in str(results[2]) + assert isinstance(results[3], UserWarning) + assert "will **NOT** be downloaded" in str(results[3]) + + assert os.path.exists(os.path.join(tempdir, "success1.txt")) + assert os.path.exists(os.path.join(tempdir, "success2.txt")) + + with open(existing_file_path, "r") as f: + assert f.read() == "already here" + + def test_download_many(listable_bucket): blobs = list(listable_bucket.list_blobs()) with tempfile.TemporaryDirectory() as tempdir: diff --git a/tests/unit/test_transfer_manager.py b/tests/unit/test_transfer_manager.py index 85ffd9eaa..799bfd314 100644 --- a/tests/unit/test_transfer_manager.py +++ b/tests/unit/test_transfer_manager.py @@ -530,9 +530,10 @@ def test_download_many_to_path(): ] with mock.patch( - "google.cloud.storage.transfer_manager.download_many" + "google.cloud.storage.transfer_manager.download_many", + return_value=[FAKE_RESULT] * len(BLOBNAMES), ) as mock_download_many: - transfer_manager.download_many_to_path( + results = transfer_manager.download_many_to_path( bucket, BLOBNAMES, destination_directory=PATH_ROOT, @@ -553,11 +554,71 @@ def test_download_many_to_path(): raise_exception=True, max_workers=MAX_WORKERS, worker_type=WORKER_TYPE, - skip_if_exists=True, + skip_if_exists=False, ) + assert results == [FAKE_RESULT] * len(BLOBNAMES) for blobname in BLOBNAMES: bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname) +def test_download_many_to_path_with_skip_if_exists(): + bucket = mock.Mock() + + BLOBNAMES = ["file_a.txt", "file_b.txt", "dir_a/file_c.txt"] + PATH_ROOT = "mypath/" + BLOB_NAME_PREFIX = "myprefix/" + DOWNLOAD_KWARGS = {"accept-encoding": "fake-gzip"} + MAX_WORKERS = 7 + DEADLINE = 10 + WORKER_TYPE = transfer_manager.THREAD + + from google.cloud.storage.transfer_manager import _resolve_path + + existing_file = str(_resolve_path(PATH_ROOT, "file_a.txt")) + + def isfile_side_effect(path): + return path == existing_file + + EXPECTED_BLOB_FILE_PAIRS = [ + (mock.ANY, str(_resolve_path(PATH_ROOT, "file_b.txt"))), + (mock.ANY, str(_resolve_path(PATH_ROOT, "dir_a/file_c.txt"))), + ] + + with mock.patch("os.path.isfile", side_effect=isfile_side_effect): + with mock.patch( + "google.cloud.storage.transfer_manager.download_many", + return_value=[FAKE_RESULT, FAKE_RESULT], + ) as mock_download_many: + results = transfer_manager.download_many_to_path( + bucket, + BLOBNAMES, + destination_directory=PATH_ROOT, + blob_name_prefix=BLOB_NAME_PREFIX, + download_kwargs=DOWNLOAD_KWARGS, + deadline=DEADLINE, + create_directories=False, + raise_exception=True, + max_workers=MAX_WORKERS, + worker_type=WORKER_TYPE, + skip_if_exists=True, + ) + + mock_download_many.assert_called_once_with( + EXPECTED_BLOB_FILE_PAIRS, + download_kwargs=DOWNLOAD_KWARGS, + deadline=DEADLINE, + raise_exception=True, + max_workers=MAX_WORKERS, + worker_type=WORKER_TYPE, + skip_if_exists=False, + ) + + assert len(results) == 3 + assert isinstance(results[0], UserWarning) + assert str(results[0]) == "The blob file_a.txt is skipped because destination file already exists" + assert results[1] == FAKE_RESULT + assert results[2] == FAKE_RESULT + + @pytest.mark.parametrize( "blobname", @@ -584,9 +645,10 @@ def test_download_many_to_path_skips_download(blobname): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") with mock.patch( - "google.cloud.storage.transfer_manager.download_many" + "google.cloud.storage.transfer_manager.download_many", + return_value=[], ) as mock_download_many: - transfer_manager.download_many_to_path( + results = transfer_manager.download_many_to_path( bucket, BLOBNAMES, destination_directory=PATH_ROOT, @@ -614,8 +676,10 @@ def test_download_many_to_path_skips_download(blobname): raise_exception=True, max_workers=MAX_WORKERS, worker_type=WORKER_TYPE, - skip_if_exists=True, + skip_if_exists=False, ) + assert len(results) == 1 + assert isinstance(results[0], UserWarning) @pytest.mark.parametrize( @@ -649,9 +713,10 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname): ] with mock.patch( - "google.cloud.storage.transfer_manager.download_many" + "google.cloud.storage.transfer_manager.download_many", + return_value=[FAKE_RESULT], ) as mock_download_many: - transfer_manager.download_many_to_path( + results = transfer_manager.download_many_to_path( bucket, BLOBNAMES, destination_directory=PATH_ROOT, @@ -672,8 +737,9 @@ def test_download_many_to_path_downloads_within_dest_dir(blobname): raise_exception=True, max_workers=MAX_WORKERS, worker_type=WORKER_TYPE, - skip_if_exists=True, + skip_if_exists=False, ) + assert results == [FAKE_RESULT] bucket.blob.assert_any_call(BLOB_NAME_PREFIX + blobname) From 948de5f9815d277052471d7de96a61c55750c9cf Mon Sep 17 00:00:00 2001 From: Chandra Sirimala Date: Tue, 17 Mar 2026 16:01:19 +0000 Subject: [PATCH 2/3] feat(storage): raise InvalidPathError for Windows full paths with ':' in download_many_to_path --- google/cloud/storage/exceptions.py | 6 +++ google/cloud/storage/transfer_manager.py | 13 ++++-- tests/unit/test_transfer_manager.py | 58 ++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 3 deletions(-) diff --git a/google/cloud/storage/exceptions.py b/google/cloud/storage/exceptions.py index 4eb05cef7..12f69071b 100644 --- a/google/cloud/storage/exceptions.py +++ b/google/cloud/storage/exceptions.py @@ -33,6 +33,12 @@ DataCorruptionDynamicParent = Exception +class InvalidPathError(Exception): + """Raised when the provided path string is malformed.""" + + pass + + class InvalidResponse(InvalidResponseDynamicParent): """Error class for responses which are not in the correct state. diff --git a/google/cloud/storage/transfer_manager.py b/google/cloud/storage/transfer_manager.py index 91119cc1d..69902bdf7 100644 --- a/google/cloud/storage/transfer_manager.py +++ b/google/cloud/storage/transfer_manager.py @@ -39,7 +39,7 @@ from google.cloud.storage._media.requests.upload import XMLMPUContainer from google.cloud.storage._media.requests.upload import XMLMPUPart -from google.cloud.storage.exceptions import DataCorruption +from google.cloud.storage.exceptions import DataCorruption, InvalidPathError TM_DEFAULT_CHUNK_SIZE = 32 * 1024 * 1024 DEFAULT_MAX_WORKERS = 8 @@ -263,6 +263,8 @@ def upload_many( def _resolve_path(target_dir, blob_path): + if os.name == "nt" and ":" in blob_path: + raise InvalidPathError(f"{blob_path} cannot be downloaded into {target_dir}") target_dir = Path(target_dir) blob_path = Path(blob_path) # blob_path.anchor will be '/' if `blob_path` is full path else it'll empty. @@ -818,7 +820,13 @@ def download_many_to_path( for i, blob_name in enumerate(blob_names): full_blob_name = blob_name_prefix + blob_name - resolved_path = _resolve_path(destination_directory, blob_name) + try: + resolved_path = _resolve_path(destination_directory, blob_name) + except InvalidPathError as e: + msg = f"The blob {blob_name} will **NOT** be downloaded. {e}" + warnings.warn(msg) + results[i] = UserWarning(msg) + continue if not resolved_path.parent.is_relative_to( Path(destination_directory).resolve() ): @@ -859,7 +867,6 @@ def download_many_to_path( return results - def download_chunks_concurrently( blob, filename, diff --git a/tests/unit/test_transfer_manager.py b/tests/unit/test_transfer_manager.py index 799bfd314..091716371 100644 --- a/tests/unit/test_transfer_manager.py +++ b/tests/unit/test_transfer_manager.py @@ -513,6 +513,64 @@ def test_upload_many_from_filenames_additional_properties(): assert getattr(blob, attrib) == value + +def test__resolve_path_raises_invalid_path_error_on_windows(): + from google.cloud.storage.transfer_manager import _resolve_path, InvalidPathError + + with mock.patch("os.name", "nt"): + with pytest.raises(InvalidPathError) as exc_info: + _resolve_path("C:\\target", "C:\\target\\file.txt") + assert "cannot be downloaded into" in str(exc_info.value) + + # Test that it DOES NOT raise on non-windows + with mock.patch("os.name", "posix"): + # Should not raise + _resolve_path("/target", "C:\\target\\file.txt") + + +def test_download_many_to_path_raises_invalid_path_error(): + bucket = mock.Mock() + + BLOBNAMES = ["C:\\target\\file.txt"] + PATH_ROOT = "mypath/" + BLOB_NAME_PREFIX = "myprefix/" + DOWNLOAD_KWARGS = {"accept-encoding": "fake-gzip"} + MAX_WORKERS = 7 + DEADLINE = 10 + WORKER_TYPE = transfer_manager.THREAD + + from google.cloud.storage.transfer_manager import InvalidPathError + + def resolve_path_side_effect(target_dir, blob_path): + raise InvalidPathError(f"{blob_path} cannot be downloaded into {target_dir}") + + with mock.patch( + "google.cloud.storage.transfer_manager._resolve_path", + side_effect=resolve_path_side_effect, + ): + import warnings + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + results = transfer_manager.download_many_to_path( + bucket, + BLOBNAMES, + destination_directory=PATH_ROOT, + blob_name_prefix=BLOB_NAME_PREFIX, + download_kwargs=DOWNLOAD_KWARGS, + deadline=DEADLINE, + create_directories=False, + raise_exception=True, + max_workers=MAX_WORKERS, + worker_type=WORKER_TYPE, + skip_if_exists=True, + ) + + assert len(w) == 1 + assert "will **NOT** be downloaded" in str(w[0].message) + assert len(results) == 1 + assert isinstance(results[0], UserWarning) + + def test_download_many_to_path(): bucket = mock.Mock() From 0db308d586cc703fde5ed9dfefe193bf7c37c961 Mon Sep 17 00:00:00 2001 From: Chandra Sirimala Date: Tue, 17 Mar 2026 16:10:31 +0000 Subject: [PATCH 3/3] test(storage): update InvalidPathError test to use native triggers --- tests/unit/test_transfer_manager.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_transfer_manager.py b/tests/unit/test_transfer_manager.py index 091716371..90c5c478a 100644 --- a/tests/unit/test_transfer_manager.py +++ b/tests/unit/test_transfer_manager.py @@ -539,16 +539,9 @@ def test_download_many_to_path_raises_invalid_path_error(): DEADLINE = 10 WORKER_TYPE = transfer_manager.THREAD - from google.cloud.storage.transfer_manager import InvalidPathError - - def resolve_path_side_effect(target_dir, blob_path): - raise InvalidPathError(f"{blob_path} cannot be downloaded into {target_dir}") - - with mock.patch( - "google.cloud.storage.transfer_manager._resolve_path", - side_effect=resolve_path_side_effect, - ): + with mock.patch("os.name", "nt"): import warnings + with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") results = transfer_manager.download_many_to_path(