From 8cee77684530a3e86fa555b57878e809bdfb97cd Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 4 Feb 2026 22:27:23 +0000
Subject: [PATCH 01/31] Supporting storage locations in SYNPY
---
.gitignore | 1 +
.../storage_location_architecture.md | 785 ++++++++
docs/js/mermaid-init.js | 12 +
docs/reference/experimental/async/folder.md | 6 +
docs/reference/experimental/async/project.md | 6 +
.../experimental/async/storage_location.md | 23 +
.../mixins/manifest_generatable.md | 69 +
.../mixins/storage_location_configurable.md | 54 +
docs/reference/experimental/sync/folder.md | 6 +
docs/reference/experimental/sync/project.md | 6 +
.../experimental/sync/storage_location.md | 24 +
docs/tutorials/python/manifest_operations.md | 328 ++++
docs/tutorials/python/storage_location.md | 135 ++
.../tutorial_scripts/storage_location.py | 86 +
mkdocs.yml | 11 +
synapseclient/api/__init__.py | 15 +
.../api/storage_location_services.py | 169 ++
synapseclient/client.py | 51 +-
.../core/constants/concrete_types.py | 26 +-
synapseclient/models/__init__.py | 19 +
synapseclient/models/folder.py | 6 +
synapseclient/models/mixins/__init__.py | 12 +
.../models/mixins/asynchronous_job.py | 2 +
synapseclient/models/mixins/manifest.py | 950 ++++++++++
.../models/mixins/storable_container.py | 28 +-
.../models/mixins/storage_location_mixin.py | 450 +++++
synapseclient/models/project.py | 6 +
.../protocols/download_list_protocol.py | 97 +
.../models/protocols/manifest_protocol.py | 240 +++
.../protocols/storable_container_protocol.py | 13 +-
.../storage_location_mixin_protocol.py | 279 +++
.../protocols/storage_location_protocol.py | 159 ++
synapseclient/models/services/__init__.py | 29 +-
synapseclient/models/services/migration.py | 1650 +++++++++++++++++
.../models/services/migration_types.py | 371 ++++
synapseclient/models/storage_location.py | 600 ++++++
.../unit_test_storage_location_services.py | 215 +++
.../models/unit_test_manifest.py | 499 +++++
.../models/unit_test_storage_location.py | 355 ++++
39 files changed, 7777 insertions(+), 16 deletions(-)
create mode 100644 docs/explanations/storage_location_architecture.md
create mode 100644 docs/js/mermaid-init.js
create mode 100644 docs/reference/experimental/async/storage_location.md
create mode 100644 docs/reference/experimental/mixins/manifest_generatable.md
create mode 100644 docs/reference/experimental/mixins/storage_location_configurable.md
create mode 100644 docs/reference/experimental/sync/storage_location.md
create mode 100644 docs/tutorials/python/manifest_operations.md
create mode 100644 docs/tutorials/python/storage_location.md
create mode 100644 docs/tutorials/python/tutorial_scripts/storage_location.py
create mode 100644 synapseclient/api/storage_location_services.py
create mode 100644 synapseclient/models/mixins/manifest.py
create mode 100644 synapseclient/models/mixins/storage_location_mixin.py
create mode 100644 synapseclient/models/protocols/download_list_protocol.py
create mode 100644 synapseclient/models/protocols/manifest_protocol.py
create mode 100644 synapseclient/models/protocols/storage_location_mixin_protocol.py
create mode 100644 synapseclient/models/protocols/storage_location_protocol.py
create mode 100644 synapseclient/models/services/migration.py
create mode 100644 synapseclient/models/services/migration_types.py
create mode 100644 synapseclient/models/storage_location.py
create mode 100644 tests/unit/synapseclient/api/unit_test_storage_location_services.py
create mode 100644 tests/unit/synapseclient/models/unit_test_manifest.py
create mode 100644 tests/unit/synapseclient/models/unit_test_storage_location.py
diff --git a/.gitignore b/.gitignore
index fa4e7f520..19eb11079 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ jenkins/
.idea/*
docs/build/doctrees/*
docs/build/html/_sources/*
+docs_site/*
build/*
/venv
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
new file mode 100644
index 000000000..612ef7d21
--- /dev/null
+++ b/docs/explanations/storage_location_architecture.md
@@ -0,0 +1,785 @@
+# Storage Location Architecture
+
+This document provides an in-depth architectural overview of the StorageLocation
+system in the Synapse Python Client. It explains the design decisions, class
+relationships, and data flows that enable flexible storage configuration.
+
+---
+
+## On This Page
+
+
+
+- **[Domain Model](#domain-model)**
+
+ Core classes, enums, and their relationships
+
+- **[Storage Types](#storage-type-mapping)**
+
+ How storage types map to REST API types and choosing the right one
+
+- **[Entity Inheritance](#entity-inheritance-hierarchy)**
+
+ How Projects and Folders gain storage capabilities
+
+- **[Operation Flows](#operation-flows)**
+
+ Sequence diagrams for store, setup, and STS operations
+
+- **[Settings & API](#project-setting-lifecycle)**
+
+ Project settings lifecycle and REST API architecture
+
+- **[Migration](#migration-flow)**
+
+ Two-phase file migration process
+
+
+
+---
+
+## Overview
+
+The StorageLocation system enables Synapse users to configure where uploaded files
+are stored. By default, Synapse stores files in its internal S3 storage, but
+users can configure projects and folders to use external storage backends such as
+AWS S3 buckets, Google Cloud Storage, SFTP servers, or proxy servers.
+
+!!! info "Key Concepts"
+ - **StorageLocation**: A configuration describing where files are stored
+ - **Project Setting**: Links a storage location to a Project or Folder
+ - **STS Credentials**: Temporary AWS credentials for direct S3 access
+ - **Storage Migration**: Moving files between storage locations
+
+---
+
+
+
+# Part 1: Data Model
+
+This section covers the core classes, enumerations, and type mappings.
+
+
+
+## Domain Model
+
+The following class diagram shows the core classes and their relationships in the
+StorageLocation system.
+
+```mermaid
+classDiagram
+ direction TB
+
+ class StorageLocation {
+ +int storage_location_id
+ +StorageLocationType storage_type
+ +UploadType upload_type
+ +str bucket
+ +str base_key
+ +bool sts_enabled
+ +str banner
+ +str description
+ +str etag
+ +str created_on
+ +int created_by
+ +str url
+ +bool supports_subfolders
+ +str endpoint_url
+ +str proxy_url
+ +str secret_key
+ +str benefactor_id
+ +store() StorageLocation
+ +get() StorageLocation
+ +setup_s3() Tuple~Folder, StorageLocation~
+ +fill_from_dict(dict) StorageLocation
+ }
+
+ class StorageLocationType {
+        <<enumeration>>
+ SYNAPSE_S3
+ EXTERNAL_S3
+ EXTERNAL_GOOGLE_CLOUD
+ EXTERNAL_SFTP
+ EXTERNAL_OBJECT_STORE
+ PROXY
+ }
+
+ class UploadType {
+        <<enumeration>>
+ S3
+ GOOGLE_CLOUD_STORAGE
+ SFTP
+ HTTPS
+ NONE
+ }
+
+ class StorageLocationConfigurable {
+        <<mixin>>
+ +set_storage_location(storage_location_id)
+ +get_project_setting(setting_type)
+ +delete_project_setting(setting_id)
+ +get_sts_storage_token(permission, output_format)
+ +index_files_for_migration(dest_storage_location_id, db_path)
+ +migrate_indexed_files(db_path)
+ }
+
+ class Project {
+ +str id
+ +str name
+ +str description
+ }
+
+ class Folder {
+ +str id
+ +str name
+ +str parent_id
+ }
+
+ StorageLocation --> StorageLocationType : storage_type
+ StorageLocation --> UploadType : upload_type
+ StorageLocationConfigurable <|-- Project : implements
+ StorageLocationConfigurable <|-- Folder : implements
+```
+
+
+
+### Key Components
+
+| Component | Purpose |
+|-----------|---------|
+| [StorageLocation][synapseclient.models.StorageLocation] | Data model representing a storage location setting in Synapse |
+| [StorageLocationType][synapseclient.models.StorageLocationType] | Enumeration defining the supported storage backend types |
+| [UploadType][synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type |
+| [StorageLocationConfigurable][synapseclient.models.mixins.StorageLocationConfigurable] | Mixin providing storage management methods to entities |
+
+---
+
+
+
+## Storage Type Mapping
+
+Each `StorageLocationType` maps to a specific REST API `concreteType` and has a
+default `UploadType`. This mapping is bidirectional, allowing the system to parse
+responses from the API and construct requests.
+
+```mermaid
+flowchart LR
+ subgraph StorageLocationType
+ SYNAPSE_S3["SYNAPSE_S3"]
+ EXTERNAL_S3["EXTERNAL_S3"]
+ EXTERNAL_GOOGLE_CLOUD["EXTERNAL_GOOGLE_CLOUD"]
+ EXTERNAL_SFTP["EXTERNAL_SFTP"]
+ EXTERNAL_OBJECT_STORE["EXTERNAL_OBJECT_STORE"]
+ PROXY["PROXY"]
+ end
+
+ subgraph concreteType
+ S3SLS["S3StorageLocationSetting"]
+ ExtS3SLS["ExternalS3StorageLocationSetting"]
+ ExtGCSSLS["ExternalGoogleCloudStorageLocationSetting"]
+ ExtSLS["ExternalStorageLocationSetting"]
+ ExtObjSLS["ExternalObjectStorageLocationSetting"]
+ ProxySLS["ProxyStorageLocationSettings"]
+ end
+
+ subgraph UploadType
+ S3["S3"]
+ GCS["GOOGLECLOUDSTORAGE"]
+ SFTP["SFTP"]
+ HTTPS["HTTPS"]
+ end
+
+ SYNAPSE_S3 --> S3SLS --> S3
+ EXTERNAL_S3 --> ExtS3SLS --> S3
+ EXTERNAL_GOOGLE_CLOUD --> ExtGCSSLS --> GCS
+ EXTERNAL_SFTP --> ExtSLS --> SFTP
+ EXTERNAL_OBJECT_STORE --> ExtObjSLS --> S3
+ PROXY --> ProxySLS --> HTTPS
+```
+
+
+
+### Type-Specific Attributes
+
+Different storage types support different configuration attributes:
+
+| Attribute | SYNAPSE | EXT_S3 | EXT_GCS | EXT_SFTP | EXT_OBJ | PROXY |
+|-----------|:-------:|:------:|:-------:|:--------:|:-------:|:-----:|
+| `bucket` | ✓ | ✓ | ✓ | | ✓ | |
+| `base_key` | ✓ | ✓ | ✓ | | | |
+| `sts_enabled` | ✓ | ✓ | | | | |
+| `endpoint_url` | | ✓ | | | ✓ | |
+| `url` | | | | ✓ | | |
+| `supports_subfolders` | | | | ✓ | | |
+| `proxy_url` | | | | | | ✓ |
+| `secret_key` | | | | | | ✓ |
+| `benefactor_id` | | | | | | ✓ |
+
+**Legend:** SYNAPSE = SYNAPSE_S3, EXT_S3 = EXTERNAL_S3, EXT_GCS = EXTERNAL_GOOGLE_CLOUD, EXT_SFTP = EXTERNAL_SFTP, EXT_OBJ = EXTERNAL_OBJECT_STORE
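+
+As a concrete illustration, the table and diagram above can be restated as a
+lookup. This is a minimal sketch; the real constants live in the client's
+`synapseclient.core.constants.concrete_types` module, so the literal strings
+here are illustrative rather than authoritative:
+
+```python
+from synapseclient.models import StorageLocationType, UploadType
+
+# Illustrative mapping: storage type -> (REST concreteType suffix, default UploadType)
+TYPE_MAP = {
+    StorageLocationType.SYNAPSE_S3: ("S3StorageLocationSetting", UploadType.S3),
+    StorageLocationType.EXTERNAL_S3: ("ExternalS3StorageLocationSetting", UploadType.S3),
+    StorageLocationType.EXTERNAL_GOOGLE_CLOUD: (
+        "ExternalGoogleCloudStorageLocationSetting",
+        UploadType.GOOGLE_CLOUD_STORAGE,
+    ),
+    StorageLocationType.EXTERNAL_SFTP: ("ExternalStorageLocationSetting", UploadType.SFTP),
+    StorageLocationType.EXTERNAL_OBJECT_STORE: (
+        "ExternalObjectStorageLocationSetting",
+        UploadType.S3,
+    ),
+    StorageLocationType.PROXY: ("ProxyStorageLocationSettings", UploadType.HTTPS),
+}
+```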
+
+
+
+### Choosing a Storage Type
+
+Use this decision tree to select the appropriate storage type for your use case:
+
+```mermaid
+flowchart TB
+    Start([Need custom storage?]) --> Q1{Want Synapse to<br/>manage storage?}
+
+    Q1 -->|Yes| SYNAPSE_S3[Use SYNAPSE_S3]
+    Q1 -->|No| Q2{What storage<br/>backend?}
+
+    Q2 -->|AWS S3| Q3{Synapse accesses<br/>bucket directly?}
+    Q2 -->|Google Cloud| EXTERNAL_GOOGLE_CLOUD[Use EXTERNAL_GOOGLE_CLOUD]
+    Q2 -->|SFTP Server| EXTERNAL_SFTP[Use EXTERNAL_SFTP]
+    Q2 -->|Proxy Server| PROXY[Use PROXY]
+    Q2 -->|S3-compatible<br/>non-AWS| EXTERNAL_OBJECT_STORE[Use EXTERNAL_OBJECT_STORE]
+
+    Q3 -->|Yes| Q4{Need STS<br/>credentials?}
+    Q3 -->|No| EXTERNAL_OBJECT_STORE
+
+    Q4 -->|Yes| EXTERNAL_S3_STS[Use EXTERNAL_S3<br/>with sts_enabled=True]
+    Q4 -->|No| EXTERNAL_S3[Use EXTERNAL_S3]
+
+    SYNAPSE_S3 --> Benefits1[Benefits:<br/>- Zero configuration<br/>- Managed by Synapse<br/>- STS available]
+    EXTERNAL_S3 --> Benefits2["Benefits:<br/>- Use your own bucket<br/>- Control access & costs<br/>- Optional STS"]
+    EXTERNAL_S3_STS --> Benefits2
+    EXTERNAL_GOOGLE_CLOUD --> Benefits3[Benefits:<br/>- GCP native<br/>- Use existing GCS buckets]
+    EXTERNAL_SFTP --> Benefits4[Benefits:<br/>- Legacy systems<br/>- Synapse never touches data]
+    EXTERNAL_OBJECT_STORE --> Benefits5[Benefits:<br/>- OpenStack, MinIO, etc<br/>- Synapse never touches data]
+    PROXY --> Benefits6[Benefits:<br/>- Custom access control<br/>- Data transformation]
+```
+
+---
+
+
+
+## Entity Inheritance Hierarchy
+
+Projects and Folders inherit storage configuration capabilities through the
+`StorageLocationConfigurable` mixin. This pattern allows consistent storage
+management across container entities.
+
+```mermaid
+classDiagram
+ direction TB
+
+ class AccessControllable {
+        <<mixin>>
+ +get_permissions()
+ +set_permissions()
+ +delete_permissions()
+ }
+
+ class StorableContainer {
+        <<mixin>>
+ +sync()
+ +get_children()
+ }
+
+ class StorageLocationConfigurable {
+        <<mixin>>
+ +set_storage_location()
+ +get_project_setting()
+ +delete_project_setting()
+ +get_sts_storage_token()
+ +index_files_for_migration()
+ +migrate_indexed_files()
+ }
+
+ class Project {
+ +str id
+ +str name
+ +str description
+ +str etag
+ }
+
+ class Folder {
+ +str id
+ +str name
+ +str parent_id
+ +str etag
+ }
+
+ AccessControllable <|-- Project
+ AccessControllable <|-- Folder
+ StorableContainer <|-- Project
+ StorableContainer <|-- Folder
+ StorageLocationConfigurable <|-- Project
+ StorageLocationConfigurable <|-- Folder
+```
+
+!!! tip "Mixin Pattern"
+ The mixin pattern allows `Project` and `Folder` to share storage location
+ functionality without code duplication. Both classes inherit the same
+ methods from `StorageLocationConfigurable`.
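+
+In practice, the shared interface means storage code is container-agnostic. A
+minimal sketch (the Synapse IDs are placeholders):
+
+```python
+from synapseclient.models import Folder, Project
+
+# Both containers inherit the same methods from StorageLocationConfigurable,
+# so the same code works for either entity type.
+for entity in (Project(id="syn123").get(), Folder(id="syn456").get()):
+    setting = entity.get_project_setting(setting_type="upload")
+    print(entity.id, setting)
+```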
+
+---
+
+
+
+
+# Part 2: Operation Flows
+
+This section contains sequence diagrams for key operations.
+
+
+
+## Operation Flows
+
+### Store Operation
+
+The `store()` method creates a new storage location in Synapse.
+
+```mermaid
+sequenceDiagram
+ participant User
+ participant StorageLocation
+ participant _to_synapse_request as _to_synapse_request()
+ participant API as storage_location_services
+ participant Synapse as Synapse REST API
+
+ User->>StorageLocation: store()
+ activate StorageLocation
+
+ StorageLocation->>_to_synapse_request: Build request body
+ activate _to_synapse_request
+
+ Note over _to_synapse_request: Validate storage_type is set
+ Note over _to_synapse_request: Build concreteType from storage_type
+ Note over _to_synapse_request: Determine uploadType
+ Note over _to_synapse_request: Add type-specific fields
+
+ _to_synapse_request-->>StorageLocation: Request body dict
+ deactivate _to_synapse_request
+
+ StorageLocation->>API: create_storage_location_setting(body)
+ activate API
+
+ API->>Synapse: POST /storageLocation
+ activate Synapse
+
+ Synapse-->>API: Response with storageLocationId
+ deactivate Synapse
+
+ API-->>StorageLocation: Response dict
+ deactivate API
+
+ StorageLocation->>StorageLocation: fill_from_dict(response)
+ Note over StorageLocation: Parse storageLocationId
+ Note over StorageLocation: Parse concreteType → storage_type
+ Note over StorageLocation: Parse uploadType → upload_type
+ Note over StorageLocation: Extract type-specific fields
+
+ StorageLocation-->>User: StorageLocation (populated)
+ deactivate StorageLocation
+```
+
+!!! note "Idempotent Behavior"
+ Storage locations are immutable once created. If you call `store()` with
+ identical parameters, Synapse returns the existing storage location rather
+ than creating a duplicate.
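+
+In code, the flow above reduces to constructing a `StorageLocation` and calling
+`store()`. A minimal sketch (the bucket name is a placeholder):
+
+```python
+from synapseclient.models import StorageLocation, StorageLocationType
+
+# _to_synapse_request() derives concreteType and uploadType from storage_type
+storage = StorageLocation(
+    storage_type=StorageLocationType.EXTERNAL_S3,
+    bucket="my-bucket",  # placeholder bucket name
+).store()
+
+# fill_from_dict() populated the server-assigned fields
+print(storage.storage_location_id)
+```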
+
+
+
+### Setup S3 Convenience Flow
+
+The `setup_s3()` class method creates a folder with S3 storage in a single call.
+
+??? example "Click to expand sequence diagram"
+ ```mermaid
+ sequenceDiagram
+ participant User
+ participant setup_s3 as StorageLocation.setup_s3()
+ participant StorageLocation
+ participant Folder
+ participant Mixin as StorageLocationConfigurable
+ participant API as storage_location_services
+ participant Synapse as Synapse REST API
+
+ User->>setup_s3: setup_s3(parent, folder_name, bucket_name)
+ activate setup_s3
+
+ Note over setup_s3: Validate: folder_name XOR folder
+
+ alt folder_name provided
+ setup_s3->>Folder: Folder(name, parent_id).store()
+ activate Folder
+ Folder->>Synapse: POST /entity
+ Synapse-->>Folder: Folder response
+ Folder-->>setup_s3: New Folder
+ deactivate Folder
+ else folder ID provided
+ setup_s3->>Folder: Folder(id).get()
+ activate Folder
+ Folder->>Synapse: GET /entity/{id}
+ Synapse-->>Folder: Folder response
+ Folder-->>setup_s3: Existing Folder
+ deactivate Folder
+ end
+
+ alt bucket_name provided
+ Note over setup_s3: storage_type = EXTERNAL_S3
+ else bucket_name is None
+ Note over setup_s3: storage_type = SYNAPSE_S3
+ end
+
+ setup_s3->>StorageLocation: StorageLocation(...).store()
+ activate StorageLocation
+ StorageLocation->>Synapse: POST /storageLocation
+ Synapse-->>StorageLocation: StorageLocation response
+ StorageLocation-->>setup_s3: StorageLocation
+ deactivate StorageLocation
+
+ setup_s3->>Mixin: folder.set_storage_location(storage_location_id)
+ activate Mixin
+
+ Mixin->>API: get_project_setting(project_id, "upload")
+ API->>Synapse: GET /projectSettings/{id}/type/upload
+ Synapse-->>API: Setting or empty
+
+ alt Setting exists
+ API-->>Mixin: Existing setting
+ Mixin->>API: update_project_setting(body)
+ API->>Synapse: PUT /projectSettings
+ else No setting
+ Mixin->>API: create_project_setting(body)
+ API->>Synapse: POST /projectSettings
+ end
+
+ Synapse-->>API: Project setting response
+ API-->>Mixin: Updated setting
+ deactivate Mixin
+
+ setup_s3-->>User: (Folder, StorageLocation)
+ deactivate setup_s3
+ ```
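+
+In code, the entire flow collapses to a single call. A minimal sketch (the
+folder name, parent ID, and bucket name are placeholders):
+
+```python
+from synapseclient.models import StorageLocation
+
+# Creates the folder, creates the storage location, and wires up the project
+# setting in one call; returns both objects.
+folder, storage = StorageLocation.setup_s3(
+    folder_name="my-external-storage-folder",
+    parent="syn123",
+    bucket_name="my-bucket",  # omit to use SYNAPSE_S3 instead
+)
+```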
+
+
+
+### STS Token Retrieval
+
+STS (AWS Security Token Service) enables direct S3 access using temporary credentials.
+
+```mermaid
+sequenceDiagram
+ participant User
+ participant Entity as Folder/Project
+ participant Mixin as StorageLocationConfigurable
+ participant STS as sts_transfer module
+ participant Client as Synapse Client
+ participant Synapse as Synapse REST API
+
+ User->>Entity: get_sts_storage_token(permission, output_format)
+ activate Entity
+
+ Entity->>Mixin: get_sts_storage_token_async()
+ activate Mixin
+
+ Mixin->>Client: Synapse.get_client()
+ Client-->>Mixin: Synapse client instance
+
+ Mixin->>STS: sts_transfer.get_sts_credentials()
+ activate STS
+
+ STS->>Synapse: GET /entity/{id}/sts?permission={permission}
+ activate Synapse
+
+ Synapse-->>STS: STS credentials response
+ deactivate Synapse
+
+ Note over STS: Parse credentials
+
+ alt output_format == "boto"
+ Note over STS: Format for boto3 client kwargs
+ STS-->>Mixin: {aws_access_key_id, aws_secret_access_key, aws_session_token}
+ else output_format == "json"
+ Note over STS: Return JSON string
+ STS-->>Mixin: JSON credentials string
+ else output_format == "shell" / "bash"
+ Note over STS: Format as export commands
+ STS-->>Mixin: Shell export commands
+ else output_format == "dictionary"
+ Note over STS: Return raw dict
+ STS-->>Mixin: Dictionary
+ end
+ deactivate STS
+
+ Mixin-->>Entity: Formatted credentials
+ deactivate Mixin
+
+ Entity-->>User: Credentials
+ deactivate Entity
+```
+
+
+
+#### Credential Output Formats
+
+| Format | Description | Use Case |
+|--------|-------------|----------|
+| `boto` | Dict with `aws_access_key_id`, `aws_secret_access_key`, `aws_session_token` | Pass directly to `boto3.client('s3', **creds)` |
+| `json` | JSON string | Store or pass to external tools |
+| `shell` / `bash` | `export AWS_ACCESS_KEY_ID=...` format | Execute in shell |
+| `cmd` | Windows SET commands | Windows command prompt |
+| `powershell` | PowerShell variable assignments | PowerShell scripts |
+| `dictionary` | Raw Python dict | Custom processing |
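+
+For example, the `boto` format can be passed straight to a boto3 client. A
+minimal sketch (the folder ID and bucket name are placeholders):
+
+```python
+import boto3
+
+from synapseclient.models import Folder
+
+folder = Folder(id="syn123").get()  # placeholder: an STS-enabled folder
+
+# "boto" returns kwargs accepted directly by boto3.client()
+credentials = folder.get_sts_storage_token(
+    permission="read_write",
+    output_format="boto",
+)
+s3_client = boto3.client("s3", **credentials)
+print(s3_client.list_objects_v2(Bucket="my-bucket", MaxKeys=10))
+```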
+
+---
+
+
+
+
+# Part 3: Settings & Infrastructure
+
+This section covers project settings, API architecture, and the async/sync pattern.
+
+
+
+## Project Setting Lifecycle
+
+Project settings control which storage location(s) are used for uploads to an
+entity. The following state diagram shows the lifecycle of a project setting.
+
+```mermaid
+stateDiagram-v2
+ [*] --> NoSetting: Entity created
+
+ NoSetting --> Created: set_storage_location()
+    note right of NoSetting: Inherits from parent\nor uses Synapse default
+
+ Created --> Updated: set_storage_location()\nwith different locations
+ Updated --> Updated: set_storage_location()\nwith different locations
+
+ Created --> Deleted: delete_project_setting()
+ Updated --> Deleted: delete_project_setting()
+
+ Deleted --> NoSetting: Returns to default
+
+ state Created {
+ [*] --> Active
+ Active: locations = [storage_location_id]
+ Active: settingsType = "upload"
+ }
+
+ state Updated {
+ [*] --> Modified
+ Modified: locations = [new_id, ...]
+ Modified: settingsType = "upload"
+ }
+```
+
+
+
+### Setting Types
+
+| Type | Purpose |
+|------|---------|
+| `upload` | Configures upload destination storage location(s) |
+| `external_sync` | Configures external sync settings |
+| `requester_pays` | Configures requester-pays bucket access |
+
+---
+
+
+
+## API Layer Architecture
+
+The storage location services module provides async functions that wrap the
+Synapse REST API endpoints. This layer handles serialization and error handling.
+
+```mermaid
+flowchart TB
+ subgraph "Model Layer"
+ SL[StorageLocation]
+ SLCM[StorageLocationConfigurable Mixin]
+ end
+
+ subgraph "API Layer (storage_location_services.py)"
+ create_sls[create_storage_location_setting]
+ get_sls[get_storage_location_setting]
+ get_ps[get_project_setting]
+ create_ps[create_project_setting]
+ update_ps[update_project_setting]
+ delete_ps[delete_project_setting]
+ end
+
+ subgraph "REST Endpoints"
+ POST_SL["POST /storageLocation"]
+ GET_SL["GET /storageLocation/{id}"]
+ GET_PS["GET /projectSettings/{id}/type/{type}"]
+ POST_PS["POST /projectSettings"]
+ PUT_PS["PUT /projectSettings"]
+ DELETE_PS["DELETE /projectSettings/{id}"]
+ end
+
+ SL --> create_sls --> POST_SL
+ SL --> get_sls --> GET_SL
+
+ SLCM --> get_ps --> GET_PS
+ SLCM --> create_ps --> POST_PS
+ SLCM --> update_ps --> PUT_PS
+ SLCM --> delete_ps --> DELETE_PS
+```
+
+
+
+### REST API Reference
+
+| Method | Endpoint | Description |
+|--------|----------|-------------|
+| POST | `/storageLocation` | Create a new storage location setting |
+| GET | `/storageLocation/{id}` | Retrieve a storage location by ID |
+| GET | `/projectSettings/{projectId}/type/{type}` | Get project settings for an entity |
+| POST | `/projectSettings` | Create a new project setting |
+| PUT | `/projectSettings` | Update an existing project setting |
+| DELETE | `/projectSettings/{id}` | Delete a project setting |
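+
+The service functions can also be called directly from async code. A minimal
+sketch; the fully qualified `concreteType` string is shown for illustration:
+
+```python
+import asyncio
+
+from synapseclient.api import (
+    create_storage_location_setting,
+    get_storage_location_setting,
+)
+
+
+async def main():
+    # Body mirrors POST /storageLocation
+    created = await create_storage_location_setting(
+        body={
+            "concreteType": "org.sagebionetworks.repo.model.project.S3StorageLocationSetting"
+        }
+    )
+    # Read the setting back by its server-assigned ID
+    fetched = await get_storage_location_setting(created["storageLocationId"])
+    print(fetched)
+
+
+asyncio.run(main())
+```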
+
+---
+
+
+
+## Async/Sync Pattern
+
+The StorageLocation system follows the Python client's `@async_to_sync` pattern,
+providing both async and sync versions of all methods.
+
+```mermaid
+flowchart LR
+ subgraph "User Code"
+ SyncCall["folder.set_storage_location()"]
+ AsyncCall["await folder.set_storage_location_async()"]
+ end
+
+ subgraph "@async_to_sync Decorator"
+ Wrapper["Sync wrapper"]
+ AsyncMethod["Async implementation"]
+ end
+
+ subgraph "Event Loop"
+ RunSync["wrap_async_to_sync()"]
+ AsyncIO["asyncio"]
+ end
+
+ SyncCall --> Wrapper
+ Wrapper --> RunSync
+ RunSync --> AsyncIO
+ AsyncIO --> AsyncMethod
+
+ AsyncCall --> AsyncMethod
+```
+
+
+
+### Method Pairs
+
+| Sync Method | Async Method |
+|-------------|--------------|
+| `StorageLocation.store()` | `StorageLocation.store_async()` |
+| `StorageLocation.get()` | `StorageLocation.get_async()` |
+| `StorageLocation.setup_s3()` | `StorageLocation.setup_s3_async()` |
+| `folder.set_storage_location()` | `folder.set_storage_location_async()` |
+| `folder.get_project_setting()` | `folder.get_project_setting_async()` |
+| `folder.delete_project_setting()` | `folder.delete_project_setting_async()` |
+| `folder.get_sts_storage_token()` | `folder.get_sts_storage_token_async()` |
+| `folder.index_files_for_migration()` | `folder.index_files_for_migration_async()` |
+| `folder.migrate_indexed_files()` | `folder.migrate_indexed_files_async()` |
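+
+Both forms run the same underlying implementation. A minimal sketch of the two
+call styles (IDs are placeholders):
+
+```python
+import asyncio
+
+from synapseclient.models import Folder
+
+folder = Folder(id="syn123").get()
+
+# Sync: the @async_to_sync wrapper drives the event loop for you
+folder.set_storage_location(storage_location_id=12345)
+
+
+# Async: call the *_async variant from within your own event loop
+async def main():
+    await folder.set_storage_location_async(storage_location_id=12345)
+
+
+asyncio.run(main())
+```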
+
+---
+
+
+
+
+# Part 4: Migration
+
+This section covers the file migration system.
+
+
+
+## Migration Flow
+
+File migration is a two-phase process that moves files from one storage location
+to another while preserving Synapse metadata.
+
+```mermaid
+sequenceDiagram
+ participant User
+ participant Entity as Project/Folder
+ participant IndexFn as index_files_for_migration
+ participant DB as SQLite Database
+ participant MigrateFn as migrate_indexed_files
+ participant Synapse as Synapse REST API
+
+ rect rgb(240, 248, 255)
+ Note over User,Synapse: Phase 1: Index Files
+ User->>Entity: index_files_for_migration(dest_id, db_path)
+ activate Entity
+
+ Entity->>IndexFn: Start indexing
+ activate IndexFn
+
+ IndexFn->>Synapse: Query entity tree
+ Synapse-->>IndexFn: File list
+
+ loop For each file
+ IndexFn->>Synapse: Get file metadata
+ Synapse-->>IndexFn: File info
+ IndexFn->>DB: Record file for migration
+ end
+
+ IndexFn-->>Entity: MigrationResult (indexed counts)
+ deactivate IndexFn
+
+ Entity-->>User: MigrationResult
+ deactivate Entity
+ end
+
+ rect rgb(255, 248, 240)
+ Note over User,Synapse: Phase 2: Migrate Files
+ User->>Entity: migrate_indexed_files(db_path)
+ activate Entity
+
+ Entity->>MigrateFn: Start migration
+ activate MigrateFn
+
+ MigrateFn->>DB: Read indexed files
+
+ loop For each indexed file
+ MigrateFn->>Synapse: Copy file to new storage
+ Synapse-->>MigrateFn: Success/Failure
+ MigrateFn->>DB: Update status
+ end
+
+ MigrateFn-->>Entity: MigrationResult (migrated counts)
+ deactivate MigrateFn
+
+ Entity-->>User: MigrationResult
+ deactivate Entity
+ end
+```
+
+
+
+### Migration Strategies
+
+| Strategy | Description |
+|----------|-------------|
+| `new` | Create new file versions in destination (default) |
+| `all` | Migrate all versions of each file |
+| `latest` | Only migrate the latest version |
+| `skip` | Skip if file already exists in destination |
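+
+Putting the two phases together looks like the following. A minimal sketch
+(the Synapse ID, storage location ID, and database path are placeholders):
+
+```python
+from synapseclient.models import Project
+
+project = Project(id="syn123").get()
+
+# Phase 1: walk the entity tree and record candidate files in a SQLite database
+index_result = project.index_files_for_migration(
+    dest_storage_location_id=12345,
+    db_path="/tmp/migration.db",
+)
+
+# Phase 2: copy the indexed files into the new storage location
+migrate_result = project.migrate_indexed_files(db_path="/tmp/migration.db")
+```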
+
+---
+
+
+
+
+# Learn More
+
+| Resource | Description |
+|----------|-------------|
+| [Storage Location Tutorial](../tutorials/python/storage_location.md) | Step-by-step guide to using storage locations |
+| [StorageLocation API Reference][synapseclient.models.StorageLocation] | Complete API documentation |
+| [StorageLocationConfigurable Mixin][synapseclient.models.mixins.StorageLocationConfigurable] | Mixin methods for Projects and Folders |
+| [Custom Storage Locations (Synapse Docs)](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html) | Official Synapse documentation |
diff --git a/docs/js/mermaid-init.js b/docs/js/mermaid-init.js
new file mode 100644
index 000000000..823cbce57
--- /dev/null
+++ b/docs/js/mermaid-init.js
@@ -0,0 +1,12 @@
+// Initialize Mermaid diagrams
+document.addEventListener("DOMContentLoaded", function() {
+ mermaid.initialize({
+ startOnLoad: true,
+ theme: "default",
+ securityLevel: "loose",
+ flowchart: {
+ useMaxWidth: true,
+ htmlLabels: true
+ }
+ });
+});
diff --git a/docs/reference/experimental/async/folder.md b/docs/reference/experimental/async/folder.md
index 7b29f84ea..fd74e65dd 100644
--- a/docs/reference/experimental/async/folder.md
+++ b/docs/reference/experimental/async/folder.md
@@ -30,3 +30,9 @@ at your own risk.
- get_schema_derived_keys_async
- get_schema_validation_statistics_async
- get_invalid_validation_async
+ - set_storage_location_async
+ - get_project_setting_async
+ - delete_project_setting_async
+ - get_sts_storage_token_async
+ - index_files_for_migration_async
+ - migrate_indexed_files_async
diff --git a/docs/reference/experimental/async/project.md b/docs/reference/experimental/async/project.md
index e3adfa9fc..42803e871 100644
--- a/docs/reference/experimental/async/project.md
+++ b/docs/reference/experimental/async/project.md
@@ -29,3 +29,9 @@ at your own risk.
- get_schema_derived_keys_async
- get_schema_validation_statistics_async
- get_invalid_validation_async
+ - set_storage_location_async
+ - get_project_setting_async
+ - delete_project_setting_async
+ - get_sts_storage_token_async
+ - index_files_for_migration_async
+ - migrate_indexed_files_async
diff --git a/docs/reference/experimental/async/storage_location.md b/docs/reference/experimental/async/storage_location.md
new file mode 100644
index 000000000..00e03fc47
--- /dev/null
+++ b/docs/reference/experimental/async/storage_location.md
@@ -0,0 +1,23 @@
+# StorageLocation
+
+Contained within this file are experimental interfaces for working with the Synapse Python
+Client. Unless otherwise noted these interfaces are subject to change at any time. Use
+at your own risk.
+
+## API Reference
+
+::: synapseclient.models.StorageLocation
+ options:
+ inherited_members: true
+ members:
+ - store_async
+ - get_async
+ - setup_s3_async
+
+---
+
+::: synapseclient.models.StorageLocationType
+
+---
+
+::: synapseclient.models.UploadType
diff --git a/docs/reference/experimental/mixins/manifest_generatable.md b/docs/reference/experimental/mixins/manifest_generatable.md
new file mode 100644
index 000000000..47aac2a4c
--- /dev/null
+++ b/docs/reference/experimental/mixins/manifest_generatable.md
@@ -0,0 +1,69 @@
+# ManifestGeneratable Mixin
+
+The `ManifestGeneratable` mixin provides manifest TSV file generation and reading capabilities for container entities (Projects and Folders).
+
+## Overview
+
+This mixin enables:
+
+- Generating manifest TSV files after syncing from Synapse
+- Uploading files from manifest TSV files
+- Validating manifest files before upload
+
+## Usage
+
+The mixin is automatically available on `Project` and `Folder` classes:
+
+```python
+from synapseclient.models import Project, Folder
+
+# Project and Folder both have manifest capabilities
+project = Project(id="syn123")
+folder = Folder(id="syn456")
+```
+
+## API Reference
+
+::: synapseclient.models.mixins.manifest.ManifestGeneratable
+ options:
+ show_root_heading: true
+ show_source: false
+ members:
+ - generate_manifest
+ - generate_manifest_async
+ - from_manifest
+ - from_manifest_async
+ - validate_manifest
+ - validate_manifest_async
+ - get_manifest_data
+ - get_manifest_data_async
+
+## Constants
+
+### MANIFEST_FILENAME
+
+The default filename for generated manifests: `SYNAPSE_METADATA_MANIFEST.tsv`
+
+```python
+from synapseclient.models import MANIFEST_FILENAME
+
+print(MANIFEST_FILENAME) # "SYNAPSE_METADATA_MANIFEST.tsv"
+```
+
+### DEFAULT_GENERATED_MANIFEST_KEYS
+
+The default columns included in generated manifest files:
+
+```python
+from synapseclient.models import DEFAULT_GENERATED_MANIFEST_KEYS
+
+print(DEFAULT_GENERATED_MANIFEST_KEYS)
+# ['path', 'parent', 'name', 'id', 'synapseStore', 'contentType',
+# 'used', 'executed', 'activityName', 'activityDescription']
+```
+
+## See Also
+
+- [Manifest Operations Tutorial](../../../tutorials/python/manifest_operations.md)
+- [StorableContainer Mixin](storable_container.md)
+- [Manifest TSV Format](../../../explanations/manifest_tsv.md)
diff --git a/docs/reference/experimental/mixins/storage_location_configurable.md b/docs/reference/experimental/mixins/storage_location_configurable.md
new file mode 100644
index 000000000..3cf29d81a
--- /dev/null
+++ b/docs/reference/experimental/mixins/storage_location_configurable.md
@@ -0,0 +1,54 @@
+# StorageLocationConfigurable
+
+The `StorageLocationConfigurable` mixin provides methods for managing storage locations
+on entities (Projects and Folders).
+
+For architecture diagrams and design documentation, see
+[Storage Location Architecture](../../../explanations/storage_location_architecture.md).
+
+This mixin includes:
+
+- Setting upload storage locations
+- Getting and deleting project settings
+- Obtaining STS credentials for direct S3 access
+- Migrating files to new storage locations
+
+## Methods Overview
+
+| Method | Description |
+|--------|-------------|
+| `set_storage_location` | Set the upload storage location for this entity |
+| `get_project_setting` | Get project settings (upload, external_sync, etc.) |
+| `delete_project_setting` | Delete a project setting |
+| `get_sts_storage_token` | Get STS credentials for direct S3 access |
+| `index_files_for_migration` | Index files for migration to a new storage location |
+| `migrate_indexed_files` | Migrate previously indexed files |
+
+## Usage Example
+
+```python
+from synapseclient.models import Folder, StorageLocation, StorageLocationType
+
+# Create a storage location
+storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket="my-bucket",
+ sts_enabled=True,
+).store()
+
+# Set storage location on a folder
+folder = Folder(id="syn123").get()
+folder.set_storage_location(storage_location_id=storage.storage_location_id)
+
+# Get STS credentials
+credentials = folder.get_sts_storage_token(
+ permission="read_write",
+ output_format="boto",
+)
+```
+
+::: synapseclient.models.mixins.StorageLocationConfigurable
+
+---
+
+::: synapseclient.models.protocols.storage_location_mixin_protocol.StorageLocationConfigurableSynchronousProtocol
diff --git a/docs/reference/experimental/sync/folder.md b/docs/reference/experimental/sync/folder.md
index 43272ea30..c866a727e 100644
--- a/docs/reference/experimental/sync/folder.md
+++ b/docs/reference/experimental/sync/folder.md
@@ -41,3 +41,9 @@ at your own risk.
- get_schema_derived_keys
- get_schema_validation_statistics
- get_invalid_validation
+ - set_storage_location
+ - get_project_setting
+ - delete_project_setting
+ - get_sts_storage_token
+ - index_files_for_migration
+ - migrate_indexed_files
diff --git a/docs/reference/experimental/sync/project.md b/docs/reference/experimental/sync/project.md
index 4e2f35a26..1bb859795 100644
--- a/docs/reference/experimental/sync/project.md
+++ b/docs/reference/experimental/sync/project.md
@@ -40,3 +40,9 @@ at your own risk.
- get_schema_derived_keys
- get_schema_validation_statistics
- get_invalid_validation
+ - set_storage_location
+ - get_project_setting
+ - delete_project_setting
+ - get_sts_storage_token
+ - index_files_for_migration
+ - migrate_indexed_files
diff --git a/docs/reference/experimental/sync/storage_location.md b/docs/reference/experimental/sync/storage_location.md
new file mode 100644
index 000000000..a764c9d7d
--- /dev/null
+++ b/docs/reference/experimental/sync/storage_location.md
@@ -0,0 +1,24 @@
+[](){ #storage-location-reference-sync }
+# StorageLocation
+
+Contained within this file are experimental interfaces for working with the Synapse Python
+Client. Unless otherwise noted these interfaces are subject to change at any time. Use
+at your own risk.
+
+## API Reference
+
+::: synapseclient.models.StorageLocation
+ options:
+ inherited_members: true
+ members:
+ - store
+ - get
+ - setup_s3
+
+---
+
+::: synapseclient.models.StorageLocationType
+
+---
+
+::: synapseclient.models.UploadType
diff --git a/docs/tutorials/python/manifest_operations.md b/docs/tutorials/python/manifest_operations.md
new file mode 100644
index 000000000..25362a347
--- /dev/null
+++ b/docs/tutorials/python/manifest_operations.md
@@ -0,0 +1,328 @@
+# Manifest Operations
+
+This tutorial covers how to work with manifest TSV files for bulk file operations in Synapse. Manifest files provide a way to track file metadata, download files with their annotations, and upload files with provenance information.
+
+## Overview
+
+A manifest file is a tab-separated values (TSV) file that contains metadata about files in Synapse. The manifest includes:
+
+- File paths and Synapse IDs
+- Parent container IDs
+- Annotations
+- Provenance information (used/executed references)
+
+## Generating Manifests During Download
+
+When syncing files from Synapse, you can automatically generate a manifest file that captures all file metadata.
+
+### Using sync_from_synapse with Manifest Generation
+
+```python
+from synapseclient.models import Project
+import synapseclient
+
+synapseclient.login()
+
+# Download a project with manifest generation at each directory level
+project = Project(id="syn123456").sync_from_synapse(
+ path="/path/to/download",
+ generate_manifest="all"
+)
+
+# Or generate a single manifest at the root level only
+project = Project(id="syn123456").sync_from_synapse(
+ path="/path/to/download",
+ generate_manifest="root"
+)
+```
+
+### Manifest Generation Options
+
+The `generate_manifest` parameter accepts three values:
+
+| Value | Description |
+|-------|-------------|
+| `"suppress"` | (Default) Do not create any manifest files |
+| `"root"` | Create a single manifest at the root download path |
+| `"all"` | Create a manifest in each directory level |
+
+### Generating Manifest Separately
+
+You can also generate a manifest after syncing:
+
+```python
+from synapseclient.models import Project
+import synapseclient
+
+synapseclient.login()
+
+# First sync without manifest
+project = Project(id="syn123456").sync_from_synapse(
+ path="/path/to/download"
+)
+
+# Then generate manifest separately
+manifest_path = project.generate_manifest(
+ path="/path/to/download",
+ manifest_scope="root"
+)
+print(f"Manifest created at: {manifest_path}")
+```
+
+## Manifest File Format
+
+The generated manifest file (`SYNAPSE_METADATA_MANIFEST.tsv`) contains the following columns:
+
+| Column | Description |
+|--------|-------------|
+| `path` | Local file path |
+| `parent` | Synapse ID of the parent container |
+| `name` | File name in Synapse |
+| `id` | Synapse file ID |
+| `synapseStore` | Whether the file is stored in Synapse |
+| `contentType` | MIME type of the file |
+| `used` | Provenance - entities used to create this file |
+| `executed` | Provenance - code/scripts executed |
+| `activityName` | Name of the provenance activity |
+| `activityDescription` | Description of the provenance activity |
+| *custom columns* | Any annotations on the files |
+
+### Example Manifest
+
+```tsv
+path parent name id synapseStore contentType used executed activityName activityDescription study dataType
+/data/file1.csv syn123 file1.csv syn456 True text/csv Data Processing Study1 RNA-seq
+/data/file2.csv syn123 file2.csv syn789 True text/csv syn456 Analysis Processed from file1 Study1 RNA-seq
+```
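+
+Because the manifest is plain TSV, it is easy to inspect or post-process with
+standard tooling. A minimal sketch using only the standard library:
+
+```python
+import csv
+
+# Print each file's Synapse ID and local path from a generated manifest
+with open("SYNAPSE_METADATA_MANIFEST.tsv", newline="") as f:
+    for row in csv.DictReader(f, delimiter="\t"):
+        print(row["id"], row["path"])
+```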
+
+## Uploading Files from a Manifest
+
+You can upload files to Synapse using a manifest file:
+
+```python
+from synapseclient.models import Project
+import synapseclient
+
+synapseclient.login()
+
+# Upload files from a manifest
+files = Project.from_manifest(
+ manifest_path="/path/to/manifest.tsv",
+ parent_id="syn123456"
+)
+
+for file in files:
+ print(f"Uploaded: {file.name} ({file.id})")
+```
+
+### Dry Run Validation
+
+Before uploading, you can validate the manifest:
+
+```python
+from synapseclient.models import Project
+
+# Validate without uploading
+is_valid, errors = Project.validate_manifest(
+ manifest_path="/path/to/manifest.tsv"
+)
+
+if is_valid:
+ print("Manifest is valid, ready for upload")
+else:
+ for error in errors:
+ print(f"Error: {error}")
+```
+
+Or use the `dry_run` option to validate the manifest and see what would be uploaded without making changes:
+
+```python
+# Dry run - validates and returns what would be uploaded, but doesn't upload
+files = Project.from_manifest(
+ manifest_path="/path/to/manifest.tsv",
+ parent_id="syn123456",
+ dry_run=True # Validate only, no actual upload
+)
+print(f"Would upload {len(files)} files")
+```
+
+The `dry_run` parameter is useful for:
+
+- Validating manifest format before committing to an upload
+- Testing your manifest configuration
+- Previewing which files will be affected
+
+## Working with Annotations
+
+Annotations in the manifest are automatically handled:
+
+### On Download
+
+When generating a manifest, all file annotations are included as additional columns:
+
+```python
+project = Project(id="syn123456").sync_from_synapse(
+ path="/path/to/download",
+ generate_manifest="root"
+)
+# Annotations appear as columns in the manifest
+```
+
+### On Upload
+
+Any columns in the manifest that aren't standard fields become annotations:
+
+```tsv
+path parent study dataType specimenType
+/data/file1.csv syn123 Study1 RNA-seq tissue
+```
+
+```python
+files = Project.from_manifest(
+ manifest_path="/path/to/manifest.tsv",
+ parent_id="syn123456",
+ merge_existing_annotations=True # Merge with existing annotations
+)
+```
+
+## Working with Provenance
+
+### On Download
+
+Provenance information is captured in the `used`, `executed`, `activityName`, and `activityDescription` columns:
+
+```python
+project = Project(id="syn123456").sync_from_synapse(
+ path="/path/to/download",
+ include_activity=True, # Include provenance
+ generate_manifest="root"
+)
+```
+
+### On Upload
+
+You can specify provenance in the manifest:
+
+```tsv
+path parent used executed activityName activityDescription
+/data/output.csv syn123 syn456;syn789 https://github.com/repo/script.py Analysis Generated from input files
+```
+
+- Multiple references are separated by semicolons (`;`)
+- References can be Synapse IDs, URLs, or local file paths
+
+## Synapse Download List Integration
+
+The manifest functionality integrates with Synapse's Download List feature. You can generate a manifest directly from your Synapse download list, which is useful for exporting metadata about files you've queued for download in the Synapse web interface.
+
+### Generating Manifest from Download List
+
+```python
+from synapseclient.models import Project
+import synapseclient
+
+synapseclient.login()
+
+# Generate a manifest from your Synapse download list
+manifest_path = Project.generate_download_list_manifest(
+ download_path="/path/to/save/manifest"
+)
+print(f"Manifest downloaded to: {manifest_path}")
+```
+
+### Custom CSV Formatting
+
+You can customize the manifest format:
+
+```python
+from synapseclient.models import Project
+import synapseclient
+
+synapseclient.login()
+
+# Generate a tab-separated manifest
+manifest_path = Project.generate_download_list_manifest(
+ download_path="/path/to/save/manifest",
+ csv_separator="\t", # Tab-separated
+ include_header=True
+)
+```
+
+### Using DownloadListManifestRequest Directly
+
+For more control over the manifest generation process, use the `DownloadListManifestRequest` class directly:
+
+```python
+from synapseclient.models import DownloadListManifestRequest, CsvTableDescriptor
+import synapseclient
+
+synapseclient.login()
+
+# Create a request with custom CSV formatting
+request = DownloadListManifestRequest(
+ csv_table_descriptor=CsvTableDescriptor(
+ separator="\t",
+ quote_character='"',
+ is_first_line_header=True
+ )
+)
+
+# Send the job and wait for completion
+request.send_job_and_wait()
+
+# Download the generated manifest
+manifest_path = request.download_manifest(download_path="/path/to/download")
+print(f"Manifest file handle: {request.result_file_handle_id}")
+```
+
+## Best Practices
+
+1. **Use `generate_manifest="root"` for simple cases** - Creates a single manifest at the root level, easier to manage.
+
+2. **Use `generate_manifest="all"` for complex hierarchies** - Creates manifests at each directory level, useful for large projects with many subdirectories.
+
+3. **Validate manifests before upload** - Use `validate_manifest()` or `dry_run=True` to catch errors early.
+
+4. **Include provenance information** - Set `include_activity=True` when syncing to capture provenance in the manifest.
+
+5. **Backup your manifest** - The manifest is a valuable record of your data and its metadata.
+
+## Async API
+
+All manifest operations are available as async methods:
+
+```python
+import asyncio
+from synapseclient.models import Project
+import synapseclient
+
+async def main():
+ synapseclient.login()
+
+ # Async sync with manifest
+ project = Project(id="syn123456")
+ await project.sync_from_synapse_async(
+ path="/path/to/download",
+ generate_manifest="root"
+ )
+
+ # Async manifest generation
+ manifest_path = await project.generate_manifest_async(
+ path="/path/to/download",
+ manifest_scope="root"
+ )
+
+ # Async upload from manifest
+ files = await Project.from_manifest_async(
+ manifest_path="/path/to/manifest.tsv",
+ parent_id="syn123456"
+ )
+
+asyncio.run(main())
+```
+
+## See Also
+
+- [Download Data in Bulk](download_data_in_bulk.md)
+- [Upload Data in Bulk](upload_data_in_bulk.md)
+- [Manifest TSV Format](../../explanations/manifest_tsv.md)
diff --git a/docs/tutorials/python/storage_location.md b/docs/tutorials/python/storage_location.md
new file mode 100644
index 000000000..41dd5036c
--- /dev/null
+++ b/docs/tutorials/python/storage_location.md
@@ -0,0 +1,135 @@
+# Storage Locations in Synapse
+
+Storage locations allow you to configure where files uploaded to Synapse are
+stored. By default, files are stored in Synapse's internal S3 storage, but you
+can configure projects or folders to use your own AWS S3 buckets, Google Cloud
+Storage buckets, or other external storage.
+
+This tutorial demonstrates how to use the Python client to manage storage
+locations using the new object-oriented models.
+
+[Read more about Custom Storage Locations](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html)
+
+## Tutorial Purpose
+In this tutorial you will:
+
+1. Create an external S3 storage location
+2. Set up a folder backed by external S3 storage
+3. Create an STS-enabled storage location for direct S3 access
+4. Use STS credentials with boto3
+5. Retrieve and inspect storage location settings
+
+## Prerequisites
+
+* Make sure that you have completed the [Installation](../installation.md) and
+ [Authentication](../authentication.md) setup.
+* You must have a [Project](./project.md) created. Replace the project name
+  used in this tutorial with your own.
+* An AWS S3 bucket properly configured for use with Synapse, including an
+ `owner.txt` file. See
+ [Custom Storage Locations](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html).
+* (Optional) `boto3` installed for STS credential examples.
+
+## Understanding Storage Location Types
+
+Synapse supports several types of storage locations:
+
+- **SYNAPSE_S3**: Synapse-managed S3 storage (default)
+- **EXTERNAL_S3**: User-owned Amazon S3 bucket accessed by Synapse
+- **EXTERNAL_GOOGLE_CLOUD**: User-owned Google Cloud Storage bucket
+- **EXTERNAL_SFTP**: External SFTP server not accessed by Synapse
+- **EXTERNAL_OBJECT_STORE**: S3-like bucket (e.g., OpenStack) not accessed by Synapse
+- **PROXY**: A proxy server that controls access to storage
+
+## STS-Enabled Storage
+
+STS (AWS Security Token Service) enabled storage locations allow users to get
+temporary AWS credentials for direct S3 access. This is useful for:
+
+- Uploading large files directly to S3
+- Using AWS tools like the AWS CLI or boto3
+- Performing bulk operations on files
+
+## 1. Set up and get project
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=5-12}
+```
+
+## 2. Create an external S3 storage location
+
+Create a storage location backed by your own S3 bucket. The bucket must be
+properly configured with an `owner.txt` file.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=14-27}
+```
+
+<details class="quote">
+  <summary>You'll notice the output looks like:</summary>
+
+```
+Created storage location: 12345
+Type: StorageLocationType.EXTERNAL_S3
+Bucket: my-synapse-bucket
+```
+
+</details>
+## 3. Set up a folder with external S3 storage
+
+The `setup_s3` convenience method handles creating the folder, storage location,
+and project settings in a single call.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=29-38}
+```
+
+## 4. Create an STS-enabled storage location
+
+STS-enabled storage locations allow you to get temporary AWS credentials for
+direct S3 access.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=40-50}
+```
+
+## 5. Use STS credentials with boto3
+
+Once you have an STS-enabled folder, you can get temporary credentials to
+access the underlying S3 bucket directly.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=52-72}
+```
+
+## 6. Retrieve and inspect storage location settings
+
+You can retrieve your storage location settings and inspect their configuration.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=74-86}
+```
+
+## Source code for this tutorial
+
+<details class="quote">
+  <summary>Click to show me</summary>
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!}
+```
+
+</details>
+
+## References used in this tutorial
+
+- [StorageLocation][synapseclient.models.StorageLocation]
+- [StorageLocationType][synapseclient.models.StorageLocationType]
+- [Folder][synapseclient.models.Folder]
+- [Project][synapseclient.models.Project]
+- [syn.login][synapseclient.Synapse.login]
+- [Custom Storage Locations Documentation](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html)
+
+## See also
+
+- [Storage Location Architecture](../../explanations/storage_location_architecture.md) -
+ In-depth architecture diagrams and design documentation
diff --git a/docs/tutorials/python/tutorial_scripts/storage_location.py b/docs/tutorials/python/tutorial_scripts/storage_location.py
new file mode 100644
index 000000000..9fe81ff6e
--- /dev/null
+++ b/docs/tutorials/python/tutorial_scripts/storage_location.py
@@ -0,0 +1,86 @@
+"""
+Here is where you'll find the code for the Storage Location tutorial.
+"""
+
+# Step 1: Set up and get the project
+import synapseclient
+from synapseclient.models import Project, StorageLocation, StorageLocationType
+
+syn = synapseclient.login()
+
+# Retrieve the project
+my_project = Project(name="My uniquely named project about Alzheimer's Disease").get()
+
+# Step 2: Create an External S3 Storage Location
+# Replace with your S3 bucket name (must have owner.txt configured)
+MY_BUCKET_NAME = "my-synapse-bucket"
+MY_BASE_KEY = "synapse-data"
+
+storage_location = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket=MY_BUCKET_NAME,
+ base_key=MY_BASE_KEY,
+).store()
+
+print(f"Created storage location: {storage_location.storage_location_id}")
+print(f"Type: {storage_location.storage_type}")
+print(f"Bucket: {storage_location.bucket}")
+
+# Step 3: Set up a folder with external S3 storage
+folder, storage = StorageLocation.setup_s3(
+ folder_name="my-external-storage-folder",
+ parent=my_project.id,
+ bucket_name=MY_BUCKET_NAME,
+ base_key="folder-specific-prefix",
+)
+
+print(f"Created folder: {folder.id}")
+print(f"Storage location ID: {storage.storage_location_id}")
+
+# Step 4: Create an STS-enabled storage location
+sts_folder, sts_storage = StorageLocation.setup_s3(
+ folder_name="my-sts-enabled-folder",
+ parent=my_project.id,
+ bucket_name=MY_BUCKET_NAME,
+ base_key="sts-data",
+ sts_enabled=True,
+)
+
+print(f"Created STS-enabled folder: {sts_folder.id}")
+print(f"STS enabled: {sts_storage.sts_enabled}")
+
+# Step 5: Use STS credentials with boto3
+credentials = sts_folder.get_sts_storage_token(
+ permission="read_write",
+ output_format="boto",
+)
+
+print(f"AWS Access Key ID: {credentials['aws_access_key_id'][:10]}...")
+print("Credentials expire: check 'expiration' in json format")
+
+try:
+ import boto3
+
+ s3_client = boto3.client("s3", **credentials)
+ response = s3_client.list_objects_v2(
+ Bucket=MY_BUCKET_NAME,
+ Prefix="sts-data/",
+ MaxKeys=10,
+ )
+ print(f"Found {response.get('KeyCount', 0)} objects")
+except ImportError:
+ print("boto3 not installed, skipping S3 client example")
+
+# Step 6: Retrieve and inspect storage location settings
+retrieved_storage = StorageLocation(
+ storage_location_id=storage_location.storage_location_id
+).get()
+
+print("Retrieved storage location:")
+print(f" ID: {retrieved_storage.storage_location_id}")
+print(f" Type: {retrieved_storage.storage_type}")
+print(f" Bucket: {retrieved_storage.bucket}")
+print(f" Base Key: {retrieved_storage.base_key}")
+print(f" STS Enabled: {retrieved_storage.sts_enabled}")
+print(f" Created By: {retrieved_storage.created_by}")
+print(f" Created On: {retrieved_storage.created_on}")
diff --git a/mkdocs.yml b/mkdocs.yml
index 85a237d0c..7461f91f3 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -45,8 +45,10 @@ nav:
# - Team: tutorials/python/team.md
- Upload data in bulk: tutorials/python/upload_data_in_bulk.md
- Download data in bulk: tutorials/python/download_data_in_bulk.md
+ - Manifest Operations: tutorials/python/manifest_operations.md
- Creating JSON Schema: tutorials/python/schema_operations.md
- Working with JSON Schema: tutorials/python/json_schema.md
+ - Storage Location: tutorials/python/storage_location.md
# - Move Files and Folders: tutorials/python/move_files_and_folders.md
# - Migrate data to other storage locations: tutorials/python/migrate_data_to_other_storage_locations.md
- Working with the Command Line Client: tutorials/command_line_client.md
@@ -111,6 +113,7 @@ nav:
- JSONSchema: reference/experimental/sync/json_schema.md
- Wiki: reference/experimental/sync/wiki.md
- FormGroup and Form: reference/experimental/sync/form.md
+ - StorageLocation: reference/experimental/sync/storage_location.md
- Extensions:
- Curator: reference/extensions/curator.md
- Asynchronous:
@@ -139,15 +142,18 @@ nav:
- JSONSchema: reference/experimental/async/json_schema.md
- Wiki: reference/experimental/async/wiki.md
- FormGroup and Form: reference/experimental/async/form.md
+ - StorageLocation: reference/experimental/async/storage_location.md
- Mixins:
- AccessControllable: reference/experimental/mixins/access_controllable.md
- StorableContainer: reference/experimental/mixins/storable_container.md
+ - ManifestGeneratable: reference/experimental/mixins/manifest_generatable.md
- AsynchronousCommunicator: reference/experimental/mixins/asynchronous_communicator.md
- FailureStrategy: reference/experimental/mixins/failure_strategy.md
- BaseJSONSchema: reference/experimental/mixins/base_json_schema.md
- ContainerEntityJSONSchema: reference/experimental/mixins/container_json_schema.md
- FormData: reference/experimental/mixins/form_data.md
- FormGroup: reference/experimental/mixins/form_group.md
+ - StorageLocationConfigurable: reference/experimental/mixins/storage_location_configurable.md
- Further Reading:
- Home: explanations/home.md
@@ -159,6 +165,7 @@ nav:
- Structuring Your Project: explanations/structuring_your_project.md
- Asyncio Changes in Python 3.14: explanations/asyncio_in_python_3_14.md
- Curator Data model: explanations/curator_data_model.md
+ - Storage Location Architecture: explanations/storage_location_architecture.md
- News:
- news.md
- Contact Us: https://sagebionetworks.jira.com/servicedesk/customer/portal/9/group/16/create/206
@@ -201,6 +208,10 @@ theme:
extra_css:
- css/custom.css
+extra_javascript:
+ - https://unpkg.com/mermaid@10/dist/mermaid.min.js
+ - js/mermaid-init.js
+
plugins:
- search
- mkdocstrings:
diff --git a/synapseclient/api/__init__.py b/synapseclient/api/__init__.py
index 6b0961677..13e97c701 100644
--- a/synapseclient/api/__init__.py
+++ b/synapseclient/api/__init__.py
@@ -130,6 +130,14 @@
update_organization_acl,
validate_entity_with_json_schema,
)
+from .storage_location_services import (
+ create_project_setting,
+ create_storage_location_setting,
+ delete_project_setting,
+ get_project_setting,
+ get_storage_location_setting,
+ update_project_setting,
+)
from .table_services import (
ViewEntityType,
ViewTypeMask,
@@ -357,4 +365,11 @@
"create_form_data",
"list_form_data",
"list_form_data_sync",
+ # storage_location_services
+ "create_storage_location_setting",
+ "get_storage_location_setting",
+ "get_project_setting",
+ "create_project_setting",
+ "update_project_setting",
+ "delete_project_setting",
]
diff --git a/synapseclient/api/storage_location_services.py b/synapseclient/api/storage_location_services.py
new file mode 100644
index 000000000..c73c7e8cc
--- /dev/null
+++ b/synapseclient/api/storage_location_services.py
@@ -0,0 +1,169 @@
+"""Services for interacting with storage location settings and project settings in Synapse.
+
+This module provides async REST wrappers for creating, retrieving, and managing
+storage location settings and their associated project settings.
+"""
+
+import json
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+if TYPE_CHECKING:
+ from synapseclient import Synapse
+
+
+async def create_storage_location_setting(
+ body: Dict[str, Any],
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Dict[str, Any]:
+ """Create a new storage location setting in Synapse.
+
+ Storage location creation is idempotent per user - if the same user creates
+ a storage location with identical properties, the existing one is returned.
+
+ Arguments:
+ body: The storage location setting request body containing concreteType
+ and other type-specific fields.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The created or existing storage location setting as a dictionary.
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ return await client.rest_post_async(
+ uri="/storageLocation",
+ body=json.dumps(body),
+ )
+
+
+async def get_storage_location_setting(
+ storage_location_id: int,
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Dict[str, Any]:
+ """Retrieve a storage location setting by its ID.
+
+ Only the creator of a StorageLocationSetting can retrieve it by its ID.
+
+ Arguments:
+ storage_location_id: The ID of the storage location setting to retrieve.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The storage location setting as a dictionary.
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ return await client.rest_get_async(
+ uri=f"/storageLocation/{storage_location_id}",
+ )
+
+
+async def get_project_setting(
+ project_id: str,
+ setting_type: str,
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Optional[Dict[str, Any]]:
+ """Get the project setting for an entity.
+
+ Arguments:
+ project_id: The Synapse ID of the project or folder.
+ setting_type: The type of setting to retrieve. One of:
+ 'upload', 'external_sync', 'requester_pays'.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting as a dictionary, or None if no setting exists.
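+
+    Example: Checking the upload setting for a project
+        An illustrative call; the project ID is a placeholder:
+
+            import asyncio
+            from synapseclient import Synapse
+            from synapseclient.api import get_project_setting
+
+            syn = Synapse()
+            syn.login()
+
+            async def main():
+                setting = await get_project_setting(
+                    project_id="syn123", setting_type="upload"
+                )
+                if setting:
+                    print(setting.get("locations"))
+
+            asyncio.run(main())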
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ response = await client.rest_get_async(
+ uri=f"/projectSettings/{project_id}/type/{setting_type}",
+ )
+ # If no project setting, an empty string is returned as the response
+ return response if response else None
+
+
+async def create_project_setting(
+ body: Dict[str, Any],
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Dict[str, Any]:
+ """Create a new project setting.
+
+ Arguments:
+ body: The project setting request body.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The created project setting as a dictionary.
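+
+    Example: Creating an upload destination setting
+        A minimal sketch mirroring the body built by
+        `StorageLocationConfigurable.set_storage_location_async`; the IDs
+        are placeholders:
+
+            import asyncio
+            from synapseclient import Synapse
+            from synapseclient.api import create_project_setting
+
+            syn = Synapse()
+            syn.login()
+
+            async def main():
+                await create_project_setting(
+                    body={
+                        "concreteType": "org.sagebionetworks.repo.model."
+                        "project.UploadDestinationListSetting",
+                        "settingsType": "upload",
+                        "locations": [12345],
+                        "projectId": "syn123",
+                    }
+                )
+
+            asyncio.run(main())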
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ return await client.rest_post_async(
+ uri="/projectSettings",
+ body=json.dumps(body),
+ )
+
+
+async def update_project_setting(
+ body: Dict[str, Any],
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Dict[str, Any]:
+ """Update an existing project setting.
+
+ Arguments:
+ body: The project setting request body including the id field.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The updated project setting as a dictionary.
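+
+    Example: Updating the locations on an existing setting
+        A minimal sketch; fetch the setting first, then modify and update it.
+        The IDs are placeholders:
+
+            import asyncio
+            from synapseclient import Synapse
+            from synapseclient.api import (
+                get_project_setting,
+                update_project_setting,
+            )
+
+            syn = Synapse()
+            syn.login()
+
+            async def main():
+                setting = await get_project_setting(
+                    project_id="syn123", setting_type="upload"
+                )
+                setting["locations"] = [12345]
+                await update_project_setting(body=setting)
+
+            asyncio.run(main())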
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ return await client.rest_put_async(
+ uri="/projectSettings",
+ body=json.dumps(body),
+ )
+
+
+async def delete_project_setting(
+ setting_id: str,
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> None:
+ """Delete a project setting.
+
+ Arguments:
+ setting_id: The ID of the project setting to delete.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ None
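+
+    Example: Deleting a project setting
+        An illustrative call; the setting ID is a placeholder:
+
+            import asyncio
+            from synapseclient import Synapse
+            from synapseclient.api import delete_project_setting
+
+            syn = Synapse()
+            syn.login()
+
+            async def main():
+                await delete_project_setting(setting_id="123")
+
+            asyncio.run(main())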
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ await client.rest_delete_async(
+ uri=f"/projectSettings/{setting_id}",
+ )
diff --git a/synapseclient/client.py b/synapseclient/client.py
index 2e9c543cb..35d521a27 100644
--- a/synapseclient/client.py
+++ b/synapseclient/client.py
@@ -5512,6 +5512,11 @@ def _createExternalObjectStoreFileHandle(
"/externalFileHandle", json.dumps(file_handle), self.fileHandleEndpoint
)
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `synapseclient.api.post_external_s3_file_handle()` instead.",
+ )
def create_external_s3_file_handle(
self,
bucket_name,
@@ -5650,7 +5655,11 @@ def _getUserCredentials(
# Project/Folder storage location settings #
############################################
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `StorageLocation(...).store()` from synapseclient.models instead.",
+ )
def createStorageLocationSetting(self, storage_type, **kwargs):
"""
Creates an IMMUTABLE storage location based on the specified type.
@@ -5707,7 +5716,12 @@ def createStorageLocationSetting(self, storage_type, **kwargs):
return self.restPOST("/storageLocation", body=json.dumps(kwargs))
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `StorageLocation(storage_location_id=id).get()` from "
+ "synapseclient.models instead.",
+ )
def getMyStorageLocationSetting(self, storage_location_id):
"""
Get a StorageLocationSetting by its id.
@@ -5721,7 +5735,12 @@ def getMyStorageLocationSetting(self, storage_location_id):
"""
return self.restGET("/storageLocation/%s" % storage_location_id)
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `Folder(id=...).set_storage_location(...)` or "
+ "`Project(id=...).set_storage_location(...)` from synapseclient.models instead.",
+ )
def setStorageLocation(self, entity, storage_location_id):
"""
Sets the storage location for a Project or Folder
@@ -5759,7 +5778,12 @@ def setStorageLocation(self, entity, storage_location_id):
"/projectSettings", body=json.dumps(project_destination)
)
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `Folder(id=...).get_project_setting(...)` or "
+ "`Project(id=...).get_project_setting(...)` from synapseclient.models instead.",
+ )
def getProjectSetting(self, project, setting_type):
"""
Gets the ProjectSetting for a project.
@@ -5787,7 +5811,12 @@ def getProjectSetting(self, project, setting_type):
response if response else None
) # if no project setting, a empty string is returned as the response
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `Folder(id=...).get_sts_storage_token(...)` or "
+ "`Project(id=...).get_sts_storage_token(...)` from synapseclient.models instead.",
+ )
def get_sts_storage_token(
self, entity, permission, *, output_format="json", min_remaining_life=None
):
@@ -5820,7 +5849,11 @@ def get_sts_storage_token(
min_remaining_life=min_remaining_life,
)
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `StorageLocation.setup_s3(...)` from synapseclient.models instead.",
+ )
def create_s3_storage_location(
self,
*,
@@ -5862,7 +5895,11 @@ def create_s3_storage_location(
)
)
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `StorageLocation.setup_s3_async(...)` from synapseclient.models instead.",
+ )
async def create_s3_storage_location_async(
self,
*,
diff --git a/synapseclient/core/constants/concrete_types.py b/synapseclient/core/constants/concrete_types.py
index fba11dbdb..f34fc3887 100644
--- a/synapseclient/core/constants/concrete_types.py
+++ b/synapseclient/core/constants/concrete_types.py
@@ -9,7 +9,23 @@
EXTERNAL_S3_STORAGE_LOCATION_SETTING = (
"org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting"
)
-# EXTERNAL_GCP_STORAGE_LOCATION_SETTING = 'org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting' # noqa: E501
+EXTERNAL_GCP_STORAGE_LOCATION_SETTING = (
+ "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting"
+)
+EXTERNAL_STORAGE_LOCATION_SETTING = (
+ "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting"
+)
+EXTERNAL_OBJECT_STORAGE_LOCATION_SETTING = (
+ "org.sagebionetworks.repo.model.project.ExternalObjectStorageLocationSetting"
+)
+PROXY_STORAGE_LOCATION_SETTINGS = (
+ "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings"
+)
+
+# Concrete types for ProjectSettings
+UPLOAD_DESTINATION_LIST_SETTING = (
+ "org.sagebionetworks.repo.model.project.UploadDestinationListSetting"
+)
# Concrete types for UploadDestinations
SYNAPSE_S3_UPLOAD_DESTINATION = (
@@ -117,6 +133,14 @@
"org.sagebionetworks.repo.model.curation.metadata.RecordBasedMetadataTaskProperties"
)
+# Download List Types
+DOWNLOAD_LIST_MANIFEST_REQUEST = (
+ "org.sagebionetworks.repo.model.download.DownloadListManifestRequest"
+)
+DOWNLOAD_LIST_MANIFEST_RESPONSE = (
+ "org.sagebionetworks.repo.model.download.DownloadListManifestResponse"
+)
+
# Grid Session Types
CREATE_GRID_REQUEST = "org.sagebionetworks.repo.model.grid.CreateGridRequest"
GRID_RECORD_SET_EXPORT_REQUEST = (
diff --git a/synapseclient/models/__init__.py b/synapseclient/models/__init__.py
index 554de0bc2..9d5bc90b0 100644
--- a/synapseclient/models/__init__.py
+++ b/synapseclient/models/__init__.py
@@ -14,6 +14,7 @@
RecordBasedMetadataTaskProperties,
)
from synapseclient.models.dataset import Dataset, DatasetCollection, EntityRef
+from synapseclient.models.download_list import DownloadListManifestRequest
from synapseclient.models.entityview import EntityView, ViewTypeMask
from synapseclient.models.evaluation import Evaluation
from synapseclient.models.file import File, FileHandle
@@ -21,11 +22,20 @@
from synapseclient.models.form import FormData, FormGroup
from synapseclient.models.link import Link
from synapseclient.models.materializedview import MaterializedView
+from synapseclient.models.mixins.manifest import (
+ DEFAULT_GENERATED_MANIFEST_KEYS,
+ MANIFEST_FILENAME,
+)
from synapseclient.models.mixins.table_components import QueryMixin
from synapseclient.models.project import Project
from synapseclient.models.recordset import RecordSet
from synapseclient.models.schema_organization import JSONSchema, SchemaOrganization
from synapseclient.models.services import FailureStrategy
+from synapseclient.models.storage_location import (
+ StorageLocation,
+ StorageLocationType,
+ UploadType,
+)
from synapseclient.models.submission import Submission
from synapseclient.models.submission_bundle import SubmissionBundle
from synapseclient.models.submission_status import SubmissionStatus
@@ -153,6 +163,15 @@
# Form models
"FormGroup",
"FormData",
+ # Storage Location models
+ "StorageLocation",
+ "StorageLocationType",
+ "UploadType",
+ # Manifest constants
+ "MANIFEST_FILENAME",
+ "DEFAULT_GENERATED_MANIFEST_KEYS",
+ # Download List models
+ "DownloadListManifestRequest",
]
# Static methods to expose as functions
diff --git a/synapseclient/models/folder.py b/synapseclient/models/folder.py
index a0658f521..c4d4e0718 100644
--- a/synapseclient/models/folder.py
+++ b/synapseclient/models/folder.py
@@ -18,6 +18,10 @@
ContainerEntityJSONSchema,
StorableContainer,
)
+from synapseclient.models.mixins.manifest import ManifestGeneratable
+from synapseclient.models.mixins.storage_location_mixin import (
+ StorageLocationConfigurable,
+)
from synapseclient.models.protocols.folder_protocol import FolderSynchronousProtocol
from synapseclient.models.services.search import get_id
from synapseclient.models.services.storable_entity import store_entity
@@ -47,6 +51,8 @@ class Folder(
AccessControllable,
StorableContainer,
ContainerEntityJSONSchema,
+ StorageLocationConfigurable,
+ ManifestGeneratable,
):
"""Folder is a hierarchical container for organizing data in Synapse.
diff --git a/synapseclient/models/mixins/__init__.py b/synapseclient/models/mixins/__init__.py
index 62ddcf017..491ea9616 100644
--- a/synapseclient/models/mixins/__init__.py
+++ b/synapseclient/models/mixins/__init__.py
@@ -20,11 +20,20 @@
JSONSchemaValidationStatistics,
ValidationException,
)
+from synapseclient.models.mixins.manifest import (
+ DEFAULT_GENERATED_MANIFEST_KEYS,
+ MANIFEST_FILENAME,
+ ManifestGeneratable,
+)
from synapseclient.models.mixins.storable_container import StorableContainer
+from synapseclient.models.mixins.storage_location_mixin import (
+ StorageLocationConfigurable,
+)
__all__ = [
"AccessControllable",
"StorableContainer",
+ "StorageLocationConfigurable",
"AsynchronousCommunicator",
"BaseJSONSchema",
"ContainerEntityJSONSchema",
@@ -40,4 +49,7 @@
"FormChangeRequest",
"FormSubmissionStatus",
"StateEnum",
+ "ManifestGeneratable",
+ "MANIFEST_FILENAME",
+ "DEFAULT_GENERATED_MANIFEST_KEYS",
]
diff --git a/synapseclient/models/mixins/asynchronous_job.py b/synapseclient/models/mixins/asynchronous_job.py
index fd3649bc1..407babe92 100644
--- a/synapseclient/models/mixins/asynchronous_job.py
+++ b/synapseclient/models/mixins/asynchronous_job.py
@@ -14,6 +14,7 @@
AGENT_CHAT_REQUEST,
CREATE_GRID_REQUEST,
CREATE_SCHEMA_REQUEST,
+ DOWNLOAD_LIST_MANIFEST_REQUEST,
GET_VALIDATION_SCHEMA_REQUEST,
GRID_RECORD_SET_EXPORT_REQUEST,
QUERY_BUNDLE_REQUEST,
@@ -29,6 +30,7 @@
ASYNC_JOB_URIS = {
AGENT_CHAT_REQUEST: "/agent/chat/async",
CREATE_GRID_REQUEST: "/grid/session/async",
+ DOWNLOAD_LIST_MANIFEST_REQUEST: "/download/list/manifest/async",
GRID_RECORD_SET_EXPORT_REQUEST: "/grid/export/recordset/async",
TABLE_UPDATE_TRANSACTION_REQUEST: "/entity/{entityId}/table/transaction/async",
GET_VALIDATION_SCHEMA_REQUEST: "/schema/type/validation/async",
diff --git a/synapseclient/models/mixins/manifest.py b/synapseclient/models/mixins/manifest.py
new file mode 100644
index 000000000..785a9c7b9
--- /dev/null
+++ b/synapseclient/models/mixins/manifest.py
@@ -0,0 +1,950 @@
+"""Mixin for objects that can generate and read manifest TSV files."""
+
+import csv
+import datetime
+import io
+import os
+import re
+import sys
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+from synapseclient import Synapse
+from synapseclient.core import utils
+from synapseclient.core.async_utils import async_to_sync, otel_trace_method
+from synapseclient.core.utils import is_synapse_id_str, is_url, topolgical_sort
+from synapseclient.models.protocols.manifest_protocol import (
+ ManifestGeneratableSynchronousProtocol,
+)
+
+if TYPE_CHECKING:
+ from synapseclient.models import File
+
+# When new fields are added to the manifest they will also need to be added to
+# file.py#_determine_fields_to_ignore_in_merge
+REQUIRED_FIELDS = ["path", "parent"]
+FILE_CONSTRUCTOR_FIELDS = ["name", "id", "synapseStore", "contentType"]
+STORE_FUNCTION_FIELDS = ["activityName", "activityDescription", "forceVersion"]
+PROVENANCE_FIELDS = ["used", "executed"]
+MANIFEST_FILENAME = "SYNAPSE_METADATA_MANIFEST.tsv"
+DEFAULT_GENERATED_MANIFEST_KEYS = [
+ "path",
+ "parent",
+ "name",
+ "id",
+ "synapseStore",
+ "contentType",
+ "used",
+ "executed",
+ "activityName",
+ "activityDescription",
+]
+ARRAY_BRACKET_PATTERN = re.compile(r"^\[.*\]$")
+SINGLE_OPEN_BRACKET_PATTERN = re.compile(r"^\[")
+SINGLE_CLOSING_BRACKET_PATTERN = re.compile(r"\]$")
+# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes
+COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")
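+
+# Illustrative layout of a manifest file (tab-separated); the paths, parent
+# IDs, and annotation column below are placeholders:
+#
+#   path               parent   name    my_annotation
+#   /data/file1.csv    syn123   file1   [a,b,c]
+#   /data/file2.csv    syn123   file2   true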
+
+
+def _manifest_filename(path: str) -> str:
+ """Get the full path to the manifest file.
+
+ Arguments:
+ path: The directory where the manifest file will be created.
+
+ Returns:
+ The full path to the manifest file.
+ """
+ return os.path.join(path, MANIFEST_FILENAME)
+
+
+def _convert_manifest_data_items_to_string_list(
+ items: List[Union[str, datetime.datetime, bool, int, float]],
+) -> str:
+ """
+ Handle converting an individual key that contains a possible list of data into a
+ list of strings or objects that can be written to the manifest file.
+
+ This has specific logic around how to handle datetime fields.
+
+ When working with datetime fields we are printing the ISO 8601 UTC representation of
+ the datetime.
+
+ When working with non strings we are printing the non-quoted version of the object.
+
+    Example: Converting lists of values
+        Several examples of how this function behaves:
+
+        >>> _convert_manifest_data_items_to_string_list(["a", "b", "c"])
+        '[a,b,c]'
+        >>> _convert_manifest_data_items_to_string_list(["string,with,commas", "string without commas"])
+        '["string,with,commas",string without commas]'
+        >>> _convert_manifest_data_items_to_string_list(["string,with,commas"])
+        'string,with,commas'
+        >>> _convert_manifest_data_items_to_string_list(
+        ...     [datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)])
+        '2020-01-01T00:00:00Z'
+        >>> _convert_manifest_data_items_to_string_list([True])
+        'True'
+        >>> _convert_manifest_data_items_to_string_list([1])
+        '1'
+        >>> _convert_manifest_data_items_to_string_list([1.0])
+        '1.0'
+        >>> _convert_manifest_data_items_to_string_list(
+        ...     [datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc),
+        ...      datetime.datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)])
+        '[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]'
+
+
+ Args:
+ items: The list of items to convert.
+
+ Returns:
+ The list of items converted to strings.
+ """
+ items_to_write = []
+ for item in items:
+ if isinstance(item, datetime.datetime):
+ items_to_write.append(
+ utils.datetime_to_iso(dt=item, include_milliseconds_if_zero=False)
+ )
+ else:
+            # If a string-based annotation has a comma in it, this will wrap
+            # the string in quotes so it won't be parsed as multiple values.
+            # For example, this is an annotation with 2 values:
+            # [my first annotation, "my, second, annotation"]
+            # This is an annotation with 4 values:
+            # [my first annotation, my, second, annotation]
+ if isinstance(item, str):
+ if len(items) > 1 and "," in item:
+ items_to_write.append(f'"{item}"')
+ else:
+ items_to_write.append(item)
+ else:
+ items_to_write.append(repr(item))
+
+ if len(items_to_write) > 1:
+ return f'[{",".join(items_to_write)}]'
+ elif len(items_to_write) == 1:
+ return items_to_write[0]
+ else:
+ return ""
+
+
+def _convert_manifest_data_row_to_dict(row: dict, keys: List[str]) -> dict:
+ """
+ Convert a row of data to a dict that can be written to a manifest file.
+
+ Args:
+ row: The row of data to convert.
+ keys: The keys of the manifest. Used to select the rows of data.
+
+ Returns:
+ The dict representation of the row.
+ """
+ data_to_write = {}
+ for key in keys:
+ data_for_key = row.get(key, "")
+ if isinstance(data_for_key, list):
+ items_to_write = _convert_manifest_data_items_to_string_list(data_for_key)
+ data_to_write[key] = items_to_write
+ else:
+ data_to_write[key] = data_for_key
+ return data_to_write
+
+
+def _write_manifest_data(filename: str, keys: List[str], data: List[dict]) -> None:
+ """
+    Write a set of keys and a list of data rows to a manifest file as
+    tab-separated values.
+
+    The content written to the TSV file is not quoted with any characters
+    because the syncToSynapse function does not require strings to be quoted.
+    When quote characters were included, extra double quotes were added to
+    strings as they were written to the manifest file. This caused no errors,
+    but it changed the content of the manifest file when no changes were
+    required.
+
+ Args:
+ filename: The name of the file to write to.
+ keys: The keys of the manifest.
+ data: The data to write to the manifest. This should be a list of dicts where
+ each dict represents a row of data.
+ """
+ with io.open(filename, "w", encoding="utf8") if filename else sys.stdout as fp:
+ csv_writer = csv.DictWriter(
+ fp,
+ keys,
+ restval="",
+ extrasaction="ignore",
+ delimiter="\t",
+ quotechar=None,
+ quoting=csv.QUOTE_NONE,
+ )
+ csv_writer.writeheader()
+ for row in data:
+ csv_writer.writerow(rowdict=_convert_manifest_data_row_to_dict(row, keys))
+
+
+def _extract_entity_metadata_for_file(
+ all_files: List["File"],
+) -> Tuple[List[str], List[Dict[str, str]]]:
+ """
+ Extracts metadata from the list of File Entities and returns them in a form
+ usable by csv.DictWriter
+
+ Arguments:
+ all_files: an iterable that provides File entities
+
+ Returns:
+        keys: a list of column headers
+ data: a list of dicts containing data from each row
+ """
+ keys = list(DEFAULT_GENERATED_MANIFEST_KEYS)
+ annotation_keys = set()
+ data = []
+ for entity in all_files:
+ row = {
+ "parent": entity.parent_id,
+ "path": entity.path,
+ "name": entity.name,
+ "id": entity.id,
+ "synapseStore": entity.synapse_store,
+ "contentType": entity.content_type,
+ }
+
+ if entity.annotations:
+ annotation_keys.update(set(entity.annotations.keys()))
+ row.update(
+ {
+ key: (val if len(val) > 0 else "")
+ for key, val in entity.annotations.items()
+ }
+ )
+
+ row_provenance = _get_entity_provenance_dict_for_file(entity=entity)
+ row.update(row_provenance)
+
+ data.append(row)
+    # Sort so the manifest column order is deterministic across runs
+    keys.extend(sorted(annotation_keys))
+ return keys, data
+
+
+def _get_entity_provenance_dict_for_file(entity: "File") -> Dict[str, str]:
+ """
+ Arguments:
+ entity: File entity object
+
+ Returns:
+ dict: a dict with a subset of the provenance metadata for the entity.
+ An empty dict is returned if the metadata does not have a provenance record.
+ """
+ if not entity.activity:
+ return {}
+
+ used_activities = []
+ for used_activity in entity.activity.used:
+ used_activities.append(used_activity.format_for_manifest())
+
+ executed_activities = []
+ for executed_activity in entity.activity.executed:
+ executed_activities.append(executed_activity.format_for_manifest())
+
+ return {
+ "used": ";".join(used_activities),
+ "executed": ";".join(executed_activities),
+ "activityName": entity.activity.name or "",
+ "activityDescription": entity.activity.description or "",
+ }
+
+
+def _validate_manifest_required_fields(
+ manifest_path: str,
+) -> Tuple[bool, List[str]]:
+ """
+ Validate that a manifest file exists and has the required fields.
+
+ Args:
+ manifest_path: Path to the manifest file.
+
+ Returns:
+ Tuple of (is_valid, list_of_error_messages).
+ """
+ errors = []
+
+ if not os.path.isfile(manifest_path):
+ errors.append(f"Manifest file not found: {manifest_path}")
+ return (False, errors)
+
+ try:
+ with io.open(manifest_path, "r", encoding="utf8") as fp:
+ reader = csv.DictReader(fp, delimiter="\t")
+ headers = reader.fieldnames or []
+
+ # Check for required fields
+ for field in REQUIRED_FIELDS:
+ if field not in headers:
+ errors.append(f"Missing required field: {field}")
+
+ # Validate each row
+ row_num = 1
+ for row in reader:
+ row_num += 1
+ path = row.get("path", "")
+ parent = row.get("parent", "")
+
+ if not path:
+ errors.append(f"Row {row_num}: 'path' is empty")
+
+ if not parent:
+ errors.append(f"Row {row_num}: 'parent' is empty")
+ elif not is_synapse_id_str(parent) and not is_url(parent):
+ errors.append(
+ f"Row {row_num}: 'parent' is not a valid Synapse ID: {parent}"
+ )
+
+ # Check if path exists (skip URLs)
+ if path and not is_url(path):
+ expanded_path = os.path.abspath(
+ os.path.expandvars(os.path.expanduser(path))
+ )
+ if not os.path.isfile(expanded_path):
+ errors.append(f"Row {row_num}: File not found: {path}")
+
+ except Exception as e:
+ errors.append(f"Error reading manifest file: {str(e)}")
+
+ return (len(errors) == 0, errors)
+
+
+@async_to_sync
+class ManifestGeneratable(ManifestGeneratableSynchronousProtocol):
+ """
+ Mixin for objects that can generate and read manifest TSV files.
+
+ In order to use this mixin, the class must have the following attributes:
+
+ - `id`
+ - `name`
+ - `_synced_from_synapse`
+
+ The class must also inherit from `StorableContainer` mixin which provides:
+
+ - `flatten_file_list()`
+ - `map_directory_to_all_contained_files()`
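+
+    Example: Typical usage
+        An illustrative flow using the synchronous counterparts; the ID and
+        paths are placeholders:
+
+            from synapseclient.models import Project
+
+            project = Project(id="syn123").sync_from_synapse(path="/tmp/data")
+            manifest_path = project.generate_manifest(
+                path="/tmp/data", manifest_scope="root"
+            )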
+ """
+
+ id: Optional[str] = None
+ name: Optional[str] = None
+ _synced_from_synapse: bool = False
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"{self.__class__.__name__}_generate_manifest: {self.id}"
+ )
+ async def generate_manifest_async(
+ self,
+ path: str,
+ manifest_scope: str = "all",
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional[str]:
+ """
+ Generate a manifest TSV file for all files in this container.
+
+ This method should be called after `sync_from_synapse()` to generate
+ a manifest of all downloaded files with their metadata.
+
+ Arguments:
+ path: The directory where the manifest file(s) will be written.
+ manifest_scope: Controls manifest file generation:
+
+ - "all": Create a manifest in each directory level
+ - "root": Create a single manifest at the root path only
+ - "suppress": Do not create any manifest files
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The path to the root manifest file if created, or None if suppressed.
+
+ Raises:
+ ValueError: If the container has not been synced from Synapse.
+ ValueError: If manifest_scope is not one of 'all', 'root', 'suppress'.
+
+ Example: Generate manifest after sync
+ Generate a manifest file after syncing from Synapse:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ project = Project(id="syn123").sync_from_synapse(
+ path="/path/to/download"
+ )
+ manifest_path = project.generate_manifest(
+ path="/path/to/download",
+ manifest_scope="root"
+ )
+ print(f"Manifest created at: {manifest_path}")
+ """
+ if manifest_scope not in ("all", "root", "suppress"):
+ raise ValueError(
+ 'Value of manifest_scope should be one of ("all", "root", "suppress")'
+ )
+
+ if manifest_scope == "suppress":
+ return None
+
+ if not self._synced_from_synapse:
+ raise ValueError(
+ "Container has not been synced from Synapse. "
+ "Call sync_from_synapse() before generating a manifest."
+ )
+
+ syn = Synapse.get_client(synapse_client=synapse_client)
+
+ # Expand the path
+ path = os.path.expanduser(path) if path else None
+ if not path:
+ raise ValueError("A path must be provided to generate a manifest.")
+
+ # Get all files from this container
+ all_files = self.flatten_file_list()
+
+ if not all_files:
+ syn.logger.info(
+ f"[{self.id}:{self.name}]: No files found in container, "
+ "skipping manifest generation."
+ )
+ return None
+
+ root_manifest_path = None
+
+ if manifest_scope == "root":
+ # Generate a single manifest at the root
+ keys, data = _extract_entity_metadata_for_file(all_files=all_files)
+ manifest_path = _manifest_filename(path)
+ _write_manifest_data(manifest_path, keys, data)
+ root_manifest_path = manifest_path
+ syn.logger.info(
+ f"[{self.id}:{self.name}]: Created manifest at {manifest_path}"
+ )
+ elif manifest_scope == "all":
+ # Generate a manifest at each directory level
+ directory_map = self.map_directory_to_all_contained_files(root_path=path)
+
+ for directory_path, files_in_directory in directory_map.items():
+ if files_in_directory:
+ keys, data = _extract_entity_metadata_for_file(
+ all_files=files_in_directory
+ )
+ manifest_path = _manifest_filename(directory_path)
+ _write_manifest_data(manifest_path, keys, data)
+
+ # Track the root manifest path
+ if directory_path == path:
+ root_manifest_path = manifest_path
+
+ syn.logger.info(
+ f"[{self.id}:{self.name}]: Created manifest at {manifest_path}"
+ )
+
+ return root_manifest_path
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"{self.__class__.__name__}_get_manifest_data: {self.id}"
+ )
+ async def get_manifest_data_async(
+ self,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Tuple[List[str], List[Dict[str, str]]]:
+ """
+ Get manifest data for all files in this container.
+
+ This method extracts metadata from all files that have been synced
+ to this container. The data can be used to generate a manifest file
+ or for other purposes.
+
+ Arguments:
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ Tuple of (keys, data) where keys is a list of column headers
+ and data is a list of dictionaries, one per file, containing
+ the file metadata.
+
+ Raises:
+ ValueError: If the container has not been synced from Synapse.
+
+ Example: Get manifest data
+ Get manifest data for all files in a project:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ project = Project(id="syn123").sync_from_synapse(
+ path="/path/to/download"
+ )
+ keys, data = project.get_manifest_data()
+ for row in data:
+ print(f"File: {row['name']} at {row['path']}")
+ """
+ if not self._synced_from_synapse:
+ raise ValueError(
+ "Container has not been synced from Synapse. "
+ "Call sync_from_synapse() before getting manifest data."
+ )
+
+ all_files = self.flatten_file_list()
+ return _extract_entity_metadata_for_file(all_files=all_files)
+
+ @classmethod
+ @otel_trace_method(
+ method_to_trace_name=lambda cls, **kwargs: f"{cls.__name__}_from_manifest"
+ )
+ async def from_manifest_async(
+ cls,
+ manifest_path: str,
+ parent_id: str,
+ dry_run: bool = False,
+ merge_existing_annotations: bool = True,
+ associate_activity_to_new_version: bool = False,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> List["File"]:
+ """
+ Upload files to Synapse from a manifest TSV file.
+
+ This method reads a manifest TSV file and uploads all files defined in it
+ to Synapse. The manifest file must contain at minimum the 'path' and 'parent'
+ columns.
+
+ Arguments:
+ manifest_path: Path to the manifest TSV file.
+ parent_id: The Synapse ID of the parent container (Project or Folder)
+ where files will be uploaded if not specified in the manifest.
+ dry_run: If True, validate the manifest but do not upload.
+ merge_existing_annotations: If True, merge annotations with existing
+ annotations on the file. If False, replace existing annotations.
+ associate_activity_to_new_version: If True, copy the activity
+ (provenance) from the previous version to the new version.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ List of File objects that were uploaded.
+
+ Raises:
+ ValueError: If the manifest file does not exist.
+ ValueError: If the manifest file is missing required fields.
+ IOError: If a file path in the manifest does not exist.
+
+ Example: Upload files from a manifest
+ Upload files from a manifest TSV file:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ files = Project.from_manifest(
+ manifest_path="/path/to/manifest.tsv",
+ parent_id="syn123"
+ )
+ for file in files:
+ print(f"Uploaded: {file.name} ({file.id})")
+
+ Example: Dry run validation
+ Validate a manifest without uploading:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ files = Project.from_manifest(
+ manifest_path="/path/to/manifest.tsv",
+ parent_id="syn123",
+ dry_run=True
+ )
+ print("Manifest is valid, ready for upload")
+ """
+ from synapseclient.models import Activity, File
+
+ syn = Synapse.get_client(synapse_client=synapse_client)
+
+ # Validate the manifest
+ is_valid, errors = _validate_manifest_required_fields(manifest_path)
+ if not is_valid:
+ raise ValueError(
+ "Invalid manifest file:\n" + "\n".join(f" - {e}" for e in errors)
+ )
+
+ # Read the manifest
+ rows = []
+ with io.open(manifest_path, "r", encoding="utf8") as fp:
+ reader = csv.DictReader(fp, delimiter="\t")
+ for row in reader:
+ rows.append(row)
+
+ if dry_run:
+ syn.logger.info(
+ f"Dry run: {len(rows)} files would be uploaded from manifest"
+ )
+ return []
+
+ # Build dependency graph for provenance ordering
+ path_to_row = {}
+ upload_order = {}
+
+ for row in rows:
+ path = row.get("path", "")
+ if path and not is_url(path):
+ path = os.path.abspath(os.path.expandvars(os.path.expanduser(path)))
+ path_to_row[path] = row
+
+            # Collect provenance references to local files so that uploads
+            # can be ordered with dependencies stored first
+            all_refs = []
+            for provenance_field in ("used", "executed"):
+                field_value = row.get(provenance_field, "")
+                if not field_value or not field_value.strip():
+                    continue
+                for item in field_value.split(";"):
+                    item = item.strip()
+                    if not item:
+                        continue
+                    expanded_item = os.path.abspath(
+                        os.path.expandvars(os.path.expanduser(item))
+                    )
+                    if os.path.isfile(expanded_item):
+                        all_refs.append(expanded_item)
+
+ upload_order[path] = all_refs
+
+ # Topologically sort based on provenance dependencies
+ sorted_paths = topolgical_sort(upload_order)
+ sorted_paths = [p[0] for p in sorted_paths]
+
+ # Track uploaded files for provenance resolution
+ path_to_synapse_id: Dict[str, str] = {}
+ uploaded_files: List["File"] = []
+
+ for path in sorted_paths:
+ row = path_to_row[path]
+
+ # Get parent - use manifest value or fall back to provided parent_id
+ file_parent = row.get("parent", "").strip() or parent_id
+
+ # Build the File object
+ file = File(
+ path=path,
+ parent_id=file_parent,
+ name=row.get("name", "").strip() or None,
+ id=row.get("id", "").strip() or None,
+ synapse_store=(
+ row.get("synapseStore", "").strip().lower() != "false"
+ if row.get("synapseStore", "").strip()
+ else True
+ ),
+ content_type=row.get("contentType", "").strip() or None,
+ merge_existing_annotations=merge_existing_annotations,
+ associate_activity_to_new_version=associate_activity_to_new_version,
+ )
+
+ # Build annotations from extra columns
+ annotations = {}
+ skip_keys = set(
+ REQUIRED_FIELDS
+ + FILE_CONSTRUCTOR_FIELDS
+ + STORE_FUNCTION_FIELDS
+ + PROVENANCE_FIELDS
+ )
+ for key, value in row.items():
+ if key not in skip_keys and value and value.strip():
+ annotations[key] = _parse_manifest_value(value.strip())
+ if annotations:
+ file.annotations = annotations
+
+ # Build provenance/activity
+ used_items = []
+ executed_items = []
+
+ used_str = row.get("used", "")
+ if used_str and used_str.strip():
+ for item in used_str.split(";"):
+ item = item.strip()
+ if item:
+ used_items.append(
+ _resolve_provenance_item(item, path_to_synapse_id)
+ )
+
+ executed_str = row.get("executed", "")
+ if executed_str and executed_str.strip():
+ for item in executed_str.split(";"):
+ item = item.strip()
+ if item:
+ executed_items.append(
+ _resolve_provenance_item(item, path_to_synapse_id)
+ )
+
+ if used_items or executed_items:
+ activity = Activity(
+ name=row.get("activityName", "").strip() or None,
+ description=row.get("activityDescription", "").strip() or None,
+ used=used_items,
+ executed=executed_items,
+ )
+ file.activity = activity
+
+ # Upload the file
+ file = await file.store_async(synapse_client=syn)
+
+ # Track for provenance resolution
+ path_to_synapse_id[path] = file.id
+ uploaded_files.append(file)
+
+ syn.logger.info(f"Uploaded: {file.name} ({file.id})")
+
+ return uploaded_files
+
+ @staticmethod
+ @otel_trace_method(method_to_trace_name=lambda **kwargs: "validate_manifest")
+ async def validate_manifest_async(
+ manifest_path: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Tuple[bool, List[str]]:
+ """
+ Validate a manifest TSV file without uploading.
+
+ This method validates a manifest file to ensure it is properly formatted
+ and all paths exist.
+
+ Arguments:
+ manifest_path: Path to the manifest TSV file.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ Tuple of (is_valid, list_of_error_messages). If the manifest is valid,
+ is_valid will be True and the list will be empty.
+
+ Example: Validate a manifest file
+ Validate a manifest file before uploading:
+
+ from synapseclient.models import Project
+
+ is_valid, errors = Project.validate_manifest(
+ manifest_path="/path/to/manifest.tsv"
+ )
+ if is_valid:
+ print("Manifest is valid")
+ else:
+ for error in errors:
+ print(f"Error: {error}")
+ """
+ return _validate_manifest_required_fields(manifest_path)
+
+ @staticmethod
+ async def generate_download_list_manifest_async(
+ download_path: str,
+ csv_separator: str = ",",
+ include_header: bool = True,
+ timeout: int = 120,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> str:
+ """
+ Generate a manifest file from the current user's download list using the
+ Synapse REST API.
+
+ This method creates a CSV manifest containing metadata about all files in
+ the user's download list. The manifest is generated server-side by Synapse
+ and then downloaded to the specified path.
+
+ This is interoperable with the Synapse download list feature and provides
+ a way to export the download list as a manifest file that can be used for
+ bulk operations.
+
+ Arguments:
+ download_path: The local directory path where the manifest will be saved.
+ csv_separator: The delimiter character for the CSV file.
+ Defaults to "," for comma-separated values. Use "\t" for tab-separated.
+ include_header: Whether to include column headers in the first row.
+ Defaults to True.
+ timeout: The number of seconds to wait for the job to complete.
+ Defaults to 120 seconds.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The full path to the downloaded manifest file.
+
+ Example: Generate manifest from download list
+ Generate a manifest from your Synapse download list:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ # Generate manifest from download list
+ manifest_path = Project.generate_download_list_manifest(
+ download_path="/path/to/download"
+ )
+ print(f"Manifest downloaded to: {manifest_path}")
+
+ Example: Generate tab-separated manifest
+ Generate a TSV manifest from your download list:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ manifest_path = Project.generate_download_list_manifest(
+ download_path="/path/to/download",
+ csv_separator="\t"
+ )
+
+ See Also:
+ - `DownloadListManifestRequest`: The underlying request class for more
+ fine-grained control over the manifest generation process.
+ """
+ from synapseclient.models.download_list import DownloadListManifestRequest
+ from synapseclient.models.table_components import CsvTableDescriptor
+
+ # Create the request with CSV formatting options
+ request = DownloadListManifestRequest(
+ csv_table_descriptor=CsvTableDescriptor(
+ separator=csv_separator,
+ is_first_line_header=include_header,
+ )
+ )
+
+ # Send the job and wait for completion
+ await request.send_job_and_wait_async(
+ timeout=timeout,
+ synapse_client=synapse_client,
+ )
+
+ # Download the manifest
+ manifest_file_path = await request.download_manifest_async(
+ download_path=download_path,
+ synapse_client=synapse_client,
+ )
+
+ return manifest_file_path
+
+
+def _resolve_provenance_item(
+ item: str,
+ path_to_synapse_id: Dict[str, str],
+) -> Any:
+ """
+ Resolve a provenance item to a UsedEntity or UsedURL.
+
+ Args:
+ item: The provenance item string (could be a path, Synapse ID, or URL).
+ path_to_synapse_id: Mapping of local file paths to their Synapse IDs.
+
+ Returns:
+ UsedEntity or UsedURL object.
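+
+    Example: Resolution order
+        Illustrative outcomes (POSIX paths), assuming "/data/a.csv" was
+        uploaded earlier in the same manifest run:
+
+            _resolve_provenance_item("/data/a.csv", {"/data/a.csv": "syn1"})
+            # -> UsedEntity(target_id="syn1")
+            _resolve_provenance_item("https://example.org/run", {})
+            # -> UsedURL(url="https://example.org/run")
+            _resolve_provenance_item("syn999", {})
+            # -> UsedEntity(target_id="syn999")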
+ """
+ from synapseclient.models import UsedEntity, UsedURL
+
+ # Check if it's a local file path that was uploaded
+ expanded_path = os.path.abspath(os.path.expandvars(os.path.expanduser(item)))
+ if expanded_path in path_to_synapse_id:
+ return UsedEntity(target_id=path_to_synapse_id[expanded_path])
+
+ # Check if it's a URL
+ if is_url(item):
+ return UsedURL(url=item)
+
+ # Check if it's a Synapse ID
+ if is_synapse_id_str(item):
+ return UsedEntity(target_id=item)
+
+    # Fall back to treating the value as a Synapse entity reference
+ return UsedEntity(target_id=item)
+
+
+def _parse_manifest_value(value: str) -> Any:
+ """
+ Parse a manifest cell value into an appropriate Python type.
+
+ Handles:
+ - List syntax: [a,b,c] -> ['a', 'b', 'c']
+ - Boolean strings: 'true', 'false' -> True, False
+ - Numeric strings: '123' -> 123, '1.5' -> 1.5
+ - Everything else: returned as string
+
+ Args:
+ value: The string value from the manifest.
+
+ Returns:
+ The parsed value.
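+
+    Example: Examples of parsed values
+        Illustrative conversions:
+
+        >>> _parse_manifest_value("[a,b,c]")
+        ['a', 'b', 'c']
+        >>> _parse_manifest_value("true")
+        True
+        >>> _parse_manifest_value("42")
+        42
+        >>> _parse_manifest_value("1.5")
+        1.5
+        >>> _parse_manifest_value("hello")
+        'hello'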
+ """
+ # Check for list syntax
+ if ARRAY_BRACKET_PATTERN.match(value):
+ # Remove brackets
+ inner = value[1:-1]
+ # Split on commas outside quotes
+ items = COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN.split(inner)
+ result = []
+ for item in items:
+ item = item.strip()
+ # Remove surrounding quotes if present
+ if item.startswith('"') and item.endswith('"'):
+ item = item[1:-1]
+ result.append(item)
+ return result
+
+ # Check for boolean
+ if value.lower() == "true":
+ return True
+ if value.lower() == "false":
+ return False
+
+ # Check for integer
+ try:
+ return int(value)
+ except ValueError:
+ pass
+
+ # Check for float
+ try:
+ return float(value)
+ except ValueError:
+ pass
+
+ # Return as string
+ return value
diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py
index 25432a6b9..1a2d557f2 100644
--- a/synapseclient/models/mixins/storable_container.py
+++ b/synapseclient/models/mixins/storable_container.py
@@ -159,6 +159,7 @@ async def sync_from_synapse_async(
link_hops: int = 1,
queue: asyncio.Queue = None,
include_types: Optional[List[str]] = None,
+ generate_manifest: str = "suppress",
*,
synapse_client: Optional[Synapse] = None,
) -> Self:
@@ -170,9 +171,8 @@ async def sync_from_synapse_async(
If you only want to retrieve the full tree of metadata about your
container specify `download_file` as False.
- This works similar to [synapseutils.syncFromSynapse][], however, this does not
- currently support the writing of data to a manifest TSV file. This will be a
- future enhancement.
+    This works similarly to [synapseutils.syncFromSynapse][] and supports
+ generating a manifest TSV file with file metadata.
Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets,
DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The
@@ -208,6 +208,13 @@ async def sync_from_synapse_async(
`["folder", "file", "table", "entityview", "dockerrepo",
"submissionview", "dataset", "datasetcollection", "materializedview",
"virtualtable"]`.
+ generate_manifest: Controls manifest file generation. Options:
+
+ - "all": Create a manifest in each directory level
+ - "root": Create a single manifest at the root path only
+ - "suppress": (Default) Do not create any manifest files
+
+ A path must be specified for manifest generation.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
@@ -386,7 +393,7 @@ async def my_function():
file_size=1, synapse_client=syn, custom_message=custom_message
):
self._synced_from_synapse = True
- return await self._sync_from_synapse_async(
+ await self._sync_from_synapse_async(
path=path,
recursive=recursive,
download_file=download_file,
@@ -400,6 +407,19 @@ async def my_function():
synapse_client=syn,
)
+ # Generate manifest if requested and path is provided
+ if generate_manifest != "suppress" and path:
+            # Manifest generation is handled by the ManifestGeneratable mixin,
+            # which provides the generate_manifest_async method
+ if hasattr(self, "generate_manifest_async"):
+ await self.generate_manifest_async(
+ path=path,
+ manifest_scope=generate_manifest,
+ synapse_client=syn,
+ )
+
+ return self
+
async def _sync_from_synapse_async(
self: Self,
path: Optional[str] = None,
diff --git a/synapseclient/models/mixins/storage_location_mixin.py b/synapseclient/models/mixins/storage_location_mixin.py
new file mode 100644
index 000000000..db3c509a8
--- /dev/null
+++ b/synapseclient/models/mixins/storage_location_mixin.py
@@ -0,0 +1,450 @@
+"""Mixin for entities that can have their storage location configured."""
+
+import asyncio
+from typing import Any, Dict, List, Optional, Union
+
+from synapseclient import Synapse
+from synapseclient.api.storage_location_services import (
+ create_project_setting,
+ delete_project_setting,
+ get_project_setting,
+ update_project_setting,
+)
+from synapseclient.core.async_utils import async_to_sync, otel_trace_method
+from synapseclient.core.constants import concrete_types
+from synapseclient.models.protocols.storage_location_mixin_protocol import (
+ StorageLocationConfigurableSynchronousProtocol,
+)
+from synapseclient.models.services.migration import (
+ index_files_for_migration_async as _index_files_for_migration_async,
+)
+from synapseclient.models.services.migration import (
+ migrate_indexed_files_async as _migrate_indexed_files_async,
+)
+from synapseclient.models.services.migration_types import MigrationResult
+
+# Default storage location ID used by Synapse
+DEFAULT_STORAGE_LOCATION_ID = 1
+
+
+@async_to_sync
+class StorageLocationConfigurable(StorageLocationConfigurableSynchronousProtocol):
+ """Mixin for objects that can have their storage location configured.
+
+ In order to use this mixin, the class must have an `id` attribute.
+
+ This mixin provides methods for:
+ - Setting and getting the upload storage location for an entity
+ - Getting STS (AWS Security Token Service) credentials for direct S3 access
+ - Migrating files to a new storage location
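+
+    Example: Typical workflow
+        An illustrative flow using the synchronous counterparts; the IDs are
+        placeholders:
+
+            from synapseclient.models import Folder
+
+            folder = Folder(id="syn123").get()
+            folder.set_storage_location(storage_location_id=12345)
+            setting = folder.get_project_setting(setting_type="upload")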
+ """
+
+ id: Optional[str] = None
+ """The unique immutable ID for this entity."""
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_SetStorageLocation: {self.id}"
+ )
+ async def set_storage_location_async(
+ self,
+ storage_location_id: Optional[Union[int, List[int]]] = None,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Dict[str, Any]:
+ """Set the upload storage location for this entity. This configures where
+ files uploaded to this entity will be stored.
+
+ Arguments:
+ storage_location_id: The storage location ID(s) to set. Can be a single
+ ID, a list of IDs (first is default, max 10), or None to use
+ Synapse default storage.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting dict returned from Synapse.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using this function
+ Set storage location on a folder:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Folder
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder = await Folder(id="syn123").get_async()
+ setting = await folder.set_storage_location_async(
+ storage_location_id=12345
+ )
+ print(setting)
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ if storage_location_id is None:
+ storage_location_id = DEFAULT_STORAGE_LOCATION_ID
+
+ locations = (
+ storage_location_id
+ if isinstance(storage_location_id, list)
+ else [storage_location_id]
+ )
+
+ existing_setting = await get_project_setting(
+ project_id=self.id,
+ setting_type="upload",
+ synapse_client=synapse_client,
+ )
+
+ if existing_setting is not None:
+ existing_setting["locations"] = locations
+ await update_project_setting(
+ body=existing_setting,
+ synapse_client=synapse_client,
+ )
+ return await get_project_setting(
+ project_id=self.id,
+ setting_type="upload",
+ synapse_client=synapse_client,
+ )
+ else:
+ project_destination = {
+ "concreteType": concrete_types.UPLOAD_DESTINATION_LIST_SETTING,
+ "settingsType": "upload",
+ "locations": locations,
+ "projectId": self.id,
+ }
+ return await create_project_setting(
+ body=project_destination,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_GetProjectSetting: {self.id}"
+ )
+ async def get_project_setting_async(
+ self,
+ setting_type: str = "upload",
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional[Dict[str, Any]]:
+ """Get the project setting for this entity.
+
+ Arguments:
+ setting_type: The type of setting to retrieve. One of:
+ 'upload', 'external_sync', 'requester_pays'. Default: 'upload'.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting as a dictionary, or None if no setting exists.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using this function
+ Get the upload settings for a folder:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Folder
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder = await Folder(id="syn123").get_async()
+ setting = await folder.get_project_setting_async(setting_type="upload")
+ if setting:
+ print(f"Storage locations: {setting.get('locations')}")
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ if setting_type not in {"upload", "external_sync", "requester_pays"}:
+ raise ValueError(f"Invalid setting_type: {setting_type}")
+
+ return await get_project_setting(
+ project_id=self.id,
+ setting_type=setting_type,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_DeleteProjectSetting: {self.id}"
+ )
+ async def delete_project_setting_async(
+ self,
+ setting_id: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> None:
+ """Delete a project setting by its setting ID.
+
+ Arguments:
+ setting_id: The ID of the project setting to delete.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ None
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using this function
+ Delete the upload settings for a folder:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Folder
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder = await Folder(id="syn123").get_async()
+ setting = await folder.get_project_setting_async(setting_type="upload")
+ if setting:
+ await folder.delete_project_setting_async(setting_id=setting['id'])
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ await delete_project_setting(
+ setting_id=setting_id,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_GetStsStorageToken: {self.id}"
+ )
+ async def get_sts_storage_token_async(
+ self,
+ permission: str,
+ *,
+ output_format: str = "json",
+ min_remaining_life: Optional[int] = None,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Any:
+ """Get STS (AWS Security Token Service) credentials for direct access to
+ the storage location backing this entity. These credentials can be used
+ with AWS tools like awscli and boto3.
+
+ Arguments:
+ permission: The permission level for the token. Must be 'read_only'
+ or 'read_write'.
+ output_format: The output format for the credentials. Options:
+ 'json' (default), 'boto', 'shell', 'bash', 'cmd', 'powershell'.
+ min_remaining_life: The minimum remaining life (in seconds) for a
+ cached token before a new one is fetched.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The STS credentials in the requested format.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using credentials with boto3
+ Get STS credentials for an STS-enabled folder and use with boto3:
+
+ import asyncio
+ import boto3
+ from synapseclient import Synapse
+ from synapseclient.models import Folder
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder = await Folder(id="syn123").get_async()
+ credentials = await folder.get_sts_storage_token_async(
+ permission="read_write",
+ output_format="boto",
+ )
+ s3_client = boto3.client('s3', **credentials)
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ from synapseclient.core import sts_transfer
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+
+ return await asyncio.to_thread(
+ sts_transfer.get_sts_credentials,
+ client,
+ self.id,
+ permission,
+ output_format=output_format,
+ min_remaining_life=min_remaining_life,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_IndexFilesForMigration: {self.id}"
+ )
+ async def index_files_for_migration_async(
+ self,
+ dest_storage_location_id: int,
+ db_path: Optional[str] = None,
+ *,
+ source_storage_location_ids: Optional[List[int]] = None,
+ file_version_strategy: str = "new",
+ include_table_files: bool = False,
+ continue_on_error: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> MigrationResult:
+ """Index files in this entity for migration to a new storage location.
+
+ This is the first step in migrating files to a new storage location.
+ After indexing, use `migrate_indexed_files` to perform the actual migration.
+
+ Arguments:
+ dest_storage_location_id: The destination storage location ID.
+ db_path: Path to the SQLite database file for tracking migration state.
+ If not provided, a temporary directory will be used. The path
+ can be retrieved from the returned MigrationResult.db_path.
+ source_storage_location_ids: Optional list of source storage location IDs
+ to filter which files to migrate. If None, all files are indexed.
+            file_version_strategy: Strategy for handling file versions. Options:
+                'new' (default) - migrate by creating a new version,
+                'all' - migrate all versions, 'latest' - migrate only the
+                latest version, 'skip' - do not migrate file versions.
+ include_table_files: Whether to include files attached to tables.
+ continue_on_error: Whether to continue indexing if an error occurs.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A MigrationResult object containing indexing statistics and the database
+ path (accessible via result.db_path).
+
+ Example: Indexing files for migration
+ Index files in a project for migration:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Project
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ project = await Project(id="syn123").get_async()
+ result = await project.index_files_for_migration_async(
+ dest_storage_location_id=12345,
+ )
+ print(f"Database path: {result.db_path}")
+ print(f"Indexed {result.counts_by_status}")
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ return await _index_files_for_migration_async(
+ entity_id=self.id,
+ dest_storage_location_id=str(dest_storage_location_id),
+ db_path=db_path,
+ source_storage_location_ids=(
+ [str(s) for s in source_storage_location_ids]
+ if source_storage_location_ids
+ else None
+ ),
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ continue_on_error=continue_on_error,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_MigrateIndexedFiles: {self.id}"
+ )
+ async def migrate_indexed_files_async(
+ self,
+ db_path: str,
+ *,
+ create_table_snapshots: bool = True,
+ continue_on_error: bool = False,
+ force: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional[MigrationResult]:
+ """Migrate files that have been indexed with `index_files_for_migration`.
+
+ This is the second step in migrating files to a new storage location.
+ Files must first be indexed using `index_files_for_migration`.
+
+ Arguments:
+ db_path: Path to the SQLite database file created by
+ `index_files_for_migration`. You can get this from the
+ MigrationResult.db_path returned by index_files_for_migration.
+ create_table_snapshots: Whether to create table snapshots before
+ migrating table files.
+ continue_on_error: Whether to continue migration if an error occurs.
+ force: Whether to force migration of files that have already been
+ migrated. Also bypasses interactive confirmation.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A MigrationResult object containing migration statistics, or None
+ if the user declined the confirmation prompt.
+
+ Example: Migrating indexed files
+ Migrate previously indexed files:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Project
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ project = await Project(id="syn123").get_async()
+
+ # Index first
+ index_result = await project.index_files_for_migration_async(
+ dest_storage_location_id=12345,
+ )
+
+ # Then migrate using the db_path from index result
+ result = await project.migrate_indexed_files_async(
+ db_path=index_result.db_path,
+ force=True, # Skip interactive confirmation
+ )
+ print(f"Migrated {result.counts_by_status}")
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ return await _migrate_indexed_files_async(
+ db_path=db_path,
+ create_table_snapshots=create_table_snapshots,
+ continue_on_error=continue_on_error,
+ force=force,
+ synapse_client=synapse_client,
+ )
diff --git a/synapseclient/models/project.py b/synapseclient/models/project.py
index a1a6a1c21..6686c8ac5 100644
--- a/synapseclient/models/project.py
+++ b/synapseclient/models/project.py
@@ -18,6 +18,10 @@
ContainerEntityJSONSchema,
StorableContainer,
)
+from synapseclient.models.mixins.manifest import ManifestGeneratable
+from synapseclient.models.mixins.storage_location_mixin import (
+ StorageLocationConfigurable,
+)
from synapseclient.models.protocols.project_protocol import ProjectSynchronousProtocol
from synapseclient.models.services.search import get_id
from synapseclient.models.services.storable_entity import store_entity
@@ -46,6 +50,8 @@ class Project(
AccessControllable,
StorableContainer,
ContainerEntityJSONSchema,
+ StorageLocationConfigurable,
+ ManifestGeneratable,
):
"""A Project is a top-level container for organizing data in Synapse.
diff --git a/synapseclient/models/protocols/download_list_protocol.py b/synapseclient/models/protocols/download_list_protocol.py
new file mode 100644
index 000000000..7152d4bf1
--- /dev/null
+++ b/synapseclient/models/protocols/download_list_protocol.py
@@ -0,0 +1,97 @@
+"""Protocol for the specific methods of download list classes that have synchronous counterparts
+generated at runtime."""
+
+from typing import Any, Dict, Optional, Protocol
+
+from typing_extensions import Self
+
+from synapseclient import Synapse
+
+
+class DownloadListManifestRequestSynchronousProtocol(Protocol):
+ """
+ The protocol for methods that are asynchronous but also
+ have a synchronous counterpart that may also be called.
+ """
+
+ def send_job_and_wait(
+ self,
+ post_exchange_args: Optional[Dict[str, Any]] = None,
+ timeout: int = 120,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Self:
+ """Send the job to the Asynchronous Job service and wait for it to complete.
+
+ This method sends the manifest generation request to Synapse and waits
+ for the job to complete. After completion, the `result_file_handle_id`
+ attribute will be populated.
+
+ Arguments:
+ post_exchange_args: Additional arguments to pass to the request.
+ timeout: The number of seconds to wait for the job to complete or progress
+ before raising a SynapseTimeoutError. Defaults to 120.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ This instance with `result_file_handle_id` populated.
+
+ Raises:
+ SynapseTimeoutError: If the job does not complete within the timeout.
+ SynapseError: If the job fails.
+
+ Example: Generate a manifest
+ Generate a manifest from the download list:
+
+ from synapseclient.models import DownloadListManifestRequest
+ import synapseclient
+
+ synapseclient.login()
+
+ request = DownloadListManifestRequest()
+ request.send_job_and_wait()
+ print(f"Manifest file handle: {request.result_file_handle_id}")
+ """
+ return self
+
+ def download_manifest(
+ self,
+ download_path: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> str:
+ """
+ Download the generated manifest file to a local path.
+
+ This method should be called after `send_job_and_wait()` has completed
+ successfully and `result_file_handle_id` is populated.
+
+ Arguments:
+ download_path: The local directory path where the manifest will be saved.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The full path to the downloaded manifest file.
+
+ Raises:
+ ValueError: If the manifest has not been generated yet (no result_file_handle_id).
+
+ Example: Download the manifest after generation
+ Generate and download a manifest:
+
+ from synapseclient.models import DownloadListManifestRequest
+ import synapseclient
+
+ synapseclient.login()
+
+ request = DownloadListManifestRequest()
+ request.send_job_and_wait()
+
+ manifest_path = request.download_manifest(download_path="/path/to/download")
+ print(f"Manifest downloaded to: {manifest_path}")
+ """
+ return ""
diff --git a/synapseclient/models/protocols/manifest_protocol.py b/synapseclient/models/protocols/manifest_protocol.py
new file mode 100644
index 000000000..1da447da0
--- /dev/null
+++ b/synapseclient/models/protocols/manifest_protocol.py
@@ -0,0 +1,240 @@
+"""Protocol for the specific methods of ManifestGeneratable mixin that have
+synchronous counterparts generated at runtime."""
+
+from typing import Dict, List, Optional, Protocol, Tuple
+
+from synapseclient import Synapse
+
+
+class ManifestGeneratableSynchronousProtocol(Protocol):
+ """
+ The protocol for methods that are asynchronous but also
+ have a synchronous counterpart that may also be called.
+ """
+
+ def generate_manifest(
+ self,
+ path: str,
+ manifest_scope: str = "all",
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional[str]:
+ """Generate a manifest TSV file for all files in this container.
+
+ This method should be called after `sync_from_synapse()` to generate
+ a manifest of all downloaded files with their metadata.
+
+ Arguments:
+ path: The directory where the manifest file(s) will be written.
+ manifest_scope: Controls manifest file generation:
+
+ - "all": Create a manifest in each directory level
+ - "root": Create a single manifest at the root path only
+ - "suppress": Do not create any manifest files
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The path to the root manifest file if created, or None if suppressed.
+
+ Raises:
+ ValueError: If the container has not been synced from Synapse.
+ ValueError: If manifest_scope is not one of 'all', 'root', 'suppress'.
+
+ Example: Generate manifest after sync
+ Generate a manifest file after syncing from Synapse:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ project = Project(id="syn123").sync_from_synapse(
+ path="/path/to/download"
+ )
+ manifest_path = project.generate_manifest(
+ path="/path/to/download",
+ manifest_scope="root"
+ )
+ print(f"Manifest created at: {manifest_path}")
+ """
+ return None
+
+ @classmethod
+ def from_manifest(
+ cls,
+ manifest_path: str,
+ parent_id: str,
+ dry_run: bool = False,
+ merge_existing_annotations: bool = True,
+ associate_activity_to_new_version: bool = False,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> List:
+ """Upload files to Synapse from a manifest TSV file.
+
+ This method reads a manifest TSV file and uploads all files defined in it
+ to Synapse. The manifest file must contain at minimum the 'path' and 'parent'
+ columns.
+
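+        A minimal manifest (tab-separated; the file path is illustrative)
+        might look like:
+
+            path    parent
+            /data/file1.csv    syn123
+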
+ Arguments:
+ manifest_path: Path to the manifest TSV file.
+ parent_id: The Synapse ID of the parent container (Project or Folder)
+ where files will be uploaded if not specified in the manifest.
+ dry_run: If True, validate the manifest but do not upload.
+ merge_existing_annotations: If True, merge annotations with existing
+ annotations on the file. If False, replace existing annotations.
+ associate_activity_to_new_version: If True, copy the activity
+ (provenance) from the previous version to the new version.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ List of File objects that were uploaded.
+
+ Example: Upload files from a manifest
+ Upload files from a manifest TSV file:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ files = Project.from_manifest(
+ manifest_path="/path/to/manifest.tsv",
+ parent_id="syn123"
+ )
+ for file in files:
+ print(f"Uploaded: {file.name} ({file.id})")
+ """
+ return []
+
+ @staticmethod
+ def validate_manifest(
+ manifest_path: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Tuple[bool, List[str]]:
+ """Validate a manifest TSV file without uploading.
+
+ This method validates a manifest file to ensure it is properly formatted
+ and all paths exist.
+
+ Arguments:
+ manifest_path: Path to the manifest TSV file.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ Tuple of (is_valid, list_of_error_messages). If the manifest is valid,
+ is_valid will be True and the list will be empty.
+
+ Example: Validate a manifest file
+ Validate a manifest file before uploading:
+
+ from synapseclient.models import Project
+
+ is_valid, errors = Project.validate_manifest(
+ manifest_path="/path/to/manifest.tsv"
+ )
+ if is_valid:
+ print("Manifest is valid")
+ else:
+ for error in errors:
+ print(f"Error: {error}")
+ """
+ return (True, [])
+
+ def get_manifest_data(
+ self,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Tuple[List[str], List[Dict[str, str]]]:
+ """Get manifest data for all files in this container.
+
+ This method extracts metadata from all files that have been synced
+ to this container. The data can be used to generate a manifest file
+ or for other purposes.
+
+ Arguments:
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ Tuple of (keys, data) where keys is a list of column headers
+ and data is a list of dictionaries, one per file, containing
+ the file metadata.
+
+ Raises:
+ ValueError: If the container has not been synced from Synapse.
+
+ Example: Get manifest data
+ Get manifest data for all files in a project:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ project = Project(id="syn123").sync_from_synapse(
+ path="/path/to/download"
+ )
+ keys, data = project.get_manifest_data()
+ for row in data:
+ print(f"File: {row['name']} at {row['path']}")
+ """
+ return ([], [])
+
+ @staticmethod
+ def generate_download_list_manifest(
+ download_path: str,
+ csv_separator: str = ",",
+ include_header: bool = True,
+ timeout: int = 120,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> str:
+ """Generate a manifest file from the current user's download list.
+
+ This method creates a CSV manifest containing metadata about all files in
+ the user's download list. The manifest is generated server-side by Synapse
+ and then downloaded to the specified path.
+
+ This is interoperable with the Synapse download list feature and provides
+ a way to export the download list as a manifest file that can be used for
+ bulk operations.
+
+ Arguments:
+ download_path: The local directory path where the manifest will be saved.
+ csv_separator: The delimiter character for the CSV file.
+ Defaults to "," for comma-separated values. Use "\t" for tab-separated.
+ include_header: Whether to include column headers in the first row.
+ Defaults to True.
+ timeout: The number of seconds to wait for the job to complete.
+ Defaults to 120 seconds.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The full path to the downloaded manifest file.
+
+ Example: Generate manifest from download list
+ Generate a manifest from your Synapse download list:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ # Generate manifest from download list
+ manifest_path = Project.generate_download_list_manifest(
+ download_path="/path/to/download"
+ )
+ print(f"Manifest downloaded to: {manifest_path}")
+ """
+ return ""
diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py
index 0352132d1..245836adf 100644
--- a/synapseclient/models/protocols/storable_container_protocol.py
+++ b/synapseclient/models/protocols/storable_container_protocol.py
@@ -29,6 +29,7 @@ def sync_from_synapse(
link_hops: int = 1,
queue: asyncio.Queue = None,
include_types: Optional[List[str]] = None,
+ generate_manifest: str = "suppress",
*,
synapse_client: Optional[Synapse] = None,
) -> Self:
@@ -40,9 +41,8 @@ def sync_from_synapse(
If you only want to retrieve the full tree of metadata about your
container specify `download_file` as False.
- This works similar to [synapseutils.syncFromSynapse][], however, this does not
- currently support the writing of data to a manifest TSV file. This will be a
- future enhancement.
+    This works similarly to [synapseutils.syncFromSynapse][] and supports
+    generating a manifest TSV file with file metadata.
Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets,
DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The
@@ -74,6 +74,13 @@ def sync_from_synapse(
include_types: Must be a list of entity types (ie. ["folder","file"]) which
can be found
[here](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/EntityType.html)
+ generate_manifest: Controls manifest file generation. Options:
+
+ - "all": Create a manifest in each directory level
+ - "root": Create a single manifest at the root path only
+ - "suppress": (Default) Do not create any manifest files
+
+                The `path` argument must be provided for manifest generation.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
diff --git a/synapseclient/models/protocols/storage_location_mixin_protocol.py b/synapseclient/models/protocols/storage_location_mixin_protocol.py
new file mode 100644
index 000000000..7403972a6
--- /dev/null
+++ b/synapseclient/models/protocols/storage_location_mixin_protocol.py
@@ -0,0 +1,279 @@
+"""Protocol for the specific methods of StorageLocationConfigurable mixin that have
+synchronous counterparts generated at runtime."""
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Protocol, Union
+
+from synapseclient import Synapse
+
+if TYPE_CHECKING:
+ from synapseclient.models.services.migration_types import MigrationResult
+
+
+class StorageLocationConfigurableSynchronousProtocol(Protocol):
+ """
+ The protocol for methods that are asynchronous but also
+ have a synchronous counterpart that may also be called.
+ """
+
+ def set_storage_location(
+ self,
+ storage_location_id: Optional[Union[int, List[int]]] = None,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Dict[str, Any]:
+ """Set the upload storage location for this entity. This configures where
+ files uploaded to this entity will be stored.
+
+ Arguments:
+ storage_location_id: The storage location ID(s) to set. Can be a single
+ ID, a list of IDs (first is default, max 10), or None to use
+ Synapse default storage.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting dict returned from Synapse.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Setting storage location on a folder
+ Set storage location on a folder:
+
+ from synapseclient.models import Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ folder = Folder(id="syn123").get()
+ setting = folder.set_storage_location(storage_location_id=12345)
+ print(setting)
+ """
+ return {}
+
+ def get_project_setting(
+ self,
+ setting_type: str = "upload",
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional[Dict[str, Any]]:
+ """Get the project setting for this entity.
+
+ Arguments:
+ setting_type: The type of setting to retrieve. One of:
+ 'upload', 'external_sync', 'requester_pays'. Default: 'upload'.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting as a dictionary, or None if no setting exists.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Getting project settings
+ Get the upload settings for a folder:
+
+ from synapseclient.models import Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ folder = Folder(id="syn123").get()
+ setting = folder.get_project_setting(setting_type="upload")
+ if setting:
+ print(f"Storage locations: {setting.get('locations')}")
+ """
+ return {}
+
+ def delete_project_setting(
+ self,
+ setting_id: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> None:
+ """Delete a project setting by its setting ID.
+
+ Arguments:
+ setting_id: The ID of the project setting to delete.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ None
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Deleting a project setting
+ Delete the upload settings for a folder:
+
+ from synapseclient.models import Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ folder = Folder(id="syn123").get()
+ setting = folder.get_project_setting(setting_type="upload")
+ if setting:
+ folder.delete_project_setting(setting_id=setting['id'])
+ """
+ return None
+
+ def get_sts_storage_token(
+ self,
+ permission: str,
+ *,
+ output_format: str = "json",
+ min_remaining_life: Optional[int] = None,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Any:
+ """Get STS (AWS Security Token Service) credentials for direct access to
+ the storage location backing this entity. These credentials can be used
+ with AWS tools like awscli and boto3.
+
+ Arguments:
+ permission: The permission level for the token. Must be 'read_only'
+ or 'read_write'.
+ output_format: The output format for the credentials. Options:
+ 'json' (default), 'boto', 'shell', 'bash', 'cmd', 'powershell'.
+ min_remaining_life: The minimum remaining life (in seconds) for a
+ cached token before a new one is fetched.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The STS credentials in the requested format.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using credentials with boto3
+ Get STS credentials for an STS-enabled folder and use with boto3:
+
+ import boto3
+ from synapseclient.models import Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ folder = Folder(id="syn123").get()
+ credentials = folder.get_sts_storage_token(
+ permission="read_write",
+ output_format="boto",
+ )
+ s3_client = boto3.client('s3', **credentials)
+ """
+ return {}
+
+ def index_files_for_migration(
+ self,
+ dest_storage_location_id: int,
+ db_path: Optional[str] = None,
+ *,
+ source_storage_location_ids: Optional[List[int]] = None,
+ file_version_strategy: str = "new",
+ include_table_files: bool = False,
+ continue_on_error: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> "MigrationResult":
+ """Index files in this entity for migration to a new storage location.
+
+ This is the first step in migrating files to a new storage location.
+ After indexing, use `migrate_indexed_files` to perform the actual migration.
+
+ Arguments:
+ dest_storage_location_id: The destination storage location ID.
+ db_path: Path to the SQLite database file for tracking migration state.
+ If not provided, a temporary directory will be used. The path
+ can be retrieved from the returned MigrationResult.db_path.
+ source_storage_location_ids: Optional list of source storage location IDs
+ to filter which files to migrate. If None, all files are indexed.
+            file_version_strategy: Strategy for handling file versions. Options:
+                'new' (default) - migrate the latest version and store the
+                result as a new version, 'all' - migrate all versions in place,
+                'latest' - migrate only the latest version in place,
+                'skip' - do not migrate file entities (useful when only
+                table-attached files should be migrated).
+ include_table_files: Whether to include files attached to tables.
+ continue_on_error: Whether to continue indexing if an error occurs.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A MigrationResult object containing indexing statistics and the database
+ path (accessible via result.db_path).
+
+ Example: Indexing files for migration
+ Index files in a project for migration:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ project = Project(id="syn123").get()
+ result = project.index_files_for_migration(
+ dest_storage_location_id=12345,
+ )
+ print(f"Database path: {result.db_path}")
+ print(f"Indexed {result.counts_by_status}")
+ """
+ return None
+
+ def migrate_indexed_files(
+ self,
+ db_path: str,
+ *,
+ create_table_snapshots: bool = True,
+ continue_on_error: bool = False,
+ force: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional["MigrationResult"]:
+ """Migrate files that have been indexed with `index_files_for_migration`.
+
+ This is the second step in migrating files to a new storage location.
+ Files must first be indexed using `index_files_for_migration`.
+
+ Arguments:
+ db_path: Path to the SQLite database file created by
+ `index_files_for_migration`. You can get this from the
+ MigrationResult.db_path returned by index_files_for_migration.
+ create_table_snapshots: Whether to create table snapshots before
+ migrating table files.
+ continue_on_error: Whether to continue migration if an error occurs.
+ force: Whether to force migration of files that have already been
+ migrated. Also bypasses interactive confirmation.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A MigrationResult object containing migration statistics, or None
+ if the user declined the confirmation prompt.
+
+ Example: Migrating indexed files
+ Migrate previously indexed files:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ project = Project(id="syn123").get()
+
+ # Index first
+ index_result = project.index_files_for_migration(
+ dest_storage_location_id=12345,
+ )
+
+ # Then migrate using the db_path from index result
+ result = project.migrate_indexed_files(
+ db_path=index_result.db_path,
+ force=True, # Skip interactive confirmation
+ )
+ print(f"Migrated {result.counts_by_status}")
+ """
+ return None
diff --git a/synapseclient/models/protocols/storage_location_protocol.py b/synapseclient/models/protocols/storage_location_protocol.py
new file mode 100644
index 000000000..e602daaa6
--- /dev/null
+++ b/synapseclient/models/protocols/storage_location_protocol.py
@@ -0,0 +1,159 @@
+"""Protocol for the specific methods of StorageLocation that have synchronous counterparts
+generated at runtime."""
+
+from typing import TYPE_CHECKING, Optional, Protocol, Tuple
+
+from synapseclient import Synapse
+
+if TYPE_CHECKING:
+ from synapseclient.models import Folder
+ from synapseclient.models.storage_location import StorageLocation
+
+
+class StorageLocationSynchronousProtocol(Protocol):
+ """
+ The protocol for methods that are asynchronous but also
+ have a synchronous counterpart that may also be called.
+ """
+
+ def store(
+ self,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> "StorageLocation":
+ """Create this storage location in Synapse. Storage locations are immutable;
+ this always creates a new one. If a storage location with identical properties
+ already exists for this user, the existing one is returned (idempotent).
+
+ Arguments:
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The StorageLocation object with server-assigned fields populated.
+
+ Raises:
+ ValueError: If `storage_type` is not set.
+
+ Example: Creating an external S3 storage location
+ Create a storage location backed by your own S3 bucket:
+
+ from synapseclient.models import StorageLocation, StorageLocationType
+
+ import synapseclient
+ synapseclient.login()
+
+ storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket="my-external-synapse-bucket",
+ base_key="path/within/bucket",
+ ).store()
+
+ print(f"Storage location ID: {storage.storage_location_id}")
+ """
+ return self
+
+ def get(
+ self,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> "StorageLocation":
+ """Retrieve this storage location from Synapse by its ID. Only the creator of
+ a StorageLocationSetting can retrieve it by its id.
+
+ Arguments:
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The StorageLocation object populated with data from Synapse.
+
+ Raises:
+ ValueError: If `storage_location_id` is not set.
+
+ Example: Retrieving a storage location
+ Retrieve a storage location by ID:
+
+ from synapseclient.models import StorageLocation
+
+ import synapseclient
+ synapseclient.login()
+
+ storage = StorageLocation(storage_location_id=12345).get()
+ print(f"Type: {storage.storage_type}, Bucket: {storage.bucket}")
+ """
+ return self
+
+ @classmethod
+ def setup_s3(
+ cls,
+ *,
+ parent: str,
+ folder_name: Optional[str] = None,
+ folder: Optional["Folder"] = None,
+ bucket_name: Optional[str] = None,
+ base_key: Optional[str] = None,
+ sts_enabled: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Tuple["Folder", "StorageLocation"]:
+ """Convenience method to create a folder backed by S3 storage. This will:
+
+ 1. Create or retrieve the folder
+ 2. Create the storage location setting
+ 3. Apply the storage location to the folder via project settings
+
+ Arguments:
+ parent: The parent project or folder ID (e.g., "syn123").
+ folder_name: Name for a new folder. Either `folder_name` or `folder`
+ must be provided.
+ folder: An existing Folder object or Synapse ID. Either `folder_name`
+ or `folder` must be provided.
+ bucket_name: The S3 bucket name. If None, uses Synapse default storage.
+ base_key: The base key (prefix) within the bucket. Optional.
+ sts_enabled: Whether to enable STS credentials for this storage location.
+ Default: False.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A tuple of (Folder, StorageLocation).
+
+ Raises:
+ ValueError: If neither `folder_name` nor `folder` is provided, or if both
+ are provided.
+
+ Example: Creating an STS-enabled folder with external S3 storage
+ Create a folder with STS-enabled storage:
+
+ from synapseclient.models import StorageLocation
+
+ import synapseclient
+ synapseclient.login()
+
+ folder, storage = StorageLocation.setup_s3(
+ folder_name="my-sts-folder",
+ parent="syn123",
+ bucket_name="my-external-synapse-bucket",
+ base_key="path/within/bucket",
+ sts_enabled=True,
+ )
+ print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}")
+
+ Example: Using an existing folder
+ Apply S3 storage to an existing folder:
+
+ from synapseclient.models import StorageLocation, Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ existing_folder = Folder(id="syn456").get()
+            folder, storage = StorageLocation.setup_s3(
+                parent="syn123",
+                folder=existing_folder,
+                bucket_name="my-bucket",
+            )
+ """
+ return None
diff --git a/synapseclient/models/services/__init__.py b/synapseclient/models/services/__init__.py
index d1e7227ca..fea05d199 100644
--- a/synapseclient/models/services/__init__.py
+++ b/synapseclient/models/services/__init__.py
@@ -1,3 +1,16 @@
+from synapseclient.models.services.migration import (
+ index_files_for_migration_async,
+ migrate_indexed_files_async,
+)
+from synapseclient.models.services.migration_types import (
+ MigrationEntry,
+ MigrationError,
+ MigrationKey,
+ MigrationResult,
+ MigrationSettings,
+ MigrationStatus,
+ MigrationType,
+)
from synapseclient.models.services.search import get_id
from synapseclient.models.services.storable_entity import store_entity
from synapseclient.models.services.storable_entity_components import (
@@ -5,4 +18,18 @@
store_entity_components,
)
-__all__ = ["store_entity_components", "store_entity", "FailureStrategy", "get_id"]
+__all__ = [
+ "store_entity_components",
+ "store_entity",
+ "FailureStrategy",
+ "get_id",
+ "index_files_for_migration_async",
+ "migrate_indexed_files_async",
+ "MigrationResult",
+ "MigrationStatus",
+ "MigrationType",
+ "MigrationKey",
+ "MigrationEntry",
+ "MigrationSettings",
+ "MigrationError",
+]
diff --git a/synapseclient/models/services/migration.py b/synapseclient/models/services/migration.py
new file mode 100644
index 000000000..0186e8b77
--- /dev/null
+++ b/synapseclient/models/services/migration.py
@@ -0,0 +1,1650 @@
+"""
+Async migration service for migrating files between storage locations.
+
+This module provides native async implementations of the migration functionality,
+replacing the threading-based approach in synapseutils.migrate_functions.
+"""
+
+import asyncio
+import collections.abc
+import json
+import logging
+import os
+import sys
+import tempfile
+import traceback
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ AsyncGenerator,
+ Dict,
+ List,
+ Optional,
+ Set,
+ Tuple,
+ Union,
+)
+
+from synapseclient.api.entity_services import get_children
+from synapseclient.api.file_services import get_file_handle_for_download_async
+from synapseclient.api.table_services import create_table_snapshot, get_columns
+from synapseclient.core import utils
+from synapseclient.core.constants import concrete_types
+from synapseclient.core.upload.multipart_upload import (
+ MAX_NUMBER_OF_PARTS,
+ multipart_copy,
+)
+from synapseclient.models.table_components import (
+ AppendableRowSetRequest,
+ PartialRow,
+ PartialRowSet,
+ TableUpdateTransaction,
+)
+
+from .migration_types import (
+ IndexingError,
+ MigrationError,
+ MigrationKey,
+ MigrationResult,
+ MigrationSettings,
+ MigrationStatus,
+ MigrationType,
+)
+
+if TYPE_CHECKING:
+ from synapseclient import Synapse
+
+# Default part size for multipart copy (100 MB)
+DEFAULT_PART_SIZE = 100 * utils.MB
+
+# Batch size for database operations
+BATCH_SIZE = 500
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Temp Directory Helpers
+# =============================================================================
+
+
+def _get_default_db_path(entity_id: str) -> str:
+ """Generate a default temp database path for migration tracking.
+
+ Arguments:
+ entity_id: The Synapse entity ID being migrated.
+
+ Returns:
+ Path to a SQLite database file in a temp directory.
+ """
+ temp_dir = tempfile.mkdtemp(prefix="synapse_migration_")
+ return os.path.join(temp_dir, f"migration_{entity_id}.db")
+
+
+# =============================================================================
+# Column Name Helpers (replaces legacy synapseclient.table functions)
+# =============================================================================
+
+
+def _escape_column_name(column: Union[str, collections.abc.Mapping]) -> str:
+ """Escape a column name for use in a Synapse table query statement.
+
+ Arguments:
+ column: A string column name or a dictionary with a 'name' key.
+
+ Returns:
+ Escaped column name wrapped in double quotes.
+ """
+ col_name = (
+ column["name"] if isinstance(column, collections.abc.Mapping) else str(column)
+ )
+ escaped_name = col_name.replace('"', '""')
+ return f'"{escaped_name}"'
+
+
+def _join_column_names(columns: List[Any]) -> str:
+ """Join column names into a comma-delimited list for table queries.
+
+ Arguments:
+ columns: A list of column names or column objects with 'name' keys.
+
+ Returns:
+ Comma-separated string of escaped column names.
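+
+    For example, `["col", {"name": 'has"quote'}]` yields
+    `"col","has""quote"`, ready to splice into a table query's SELECT list.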
+ """
+ return ",".join(_escape_column_name(c) for c in columns)
+
+
+# =============================================================================
+# Database Helper Functions (Synchronous - wrapped with asyncio.to_thread)
+# =============================================================================
+
+
+def _ensure_schema(cursor) -> None:
+ """Ensure the SQLite database has the required schema."""
+ # Settings table - stores JSON configuration
+ cursor.execute(
+ "CREATE TABLE IF NOT EXISTS migration_settings (settings TEXT NOT NULL)"
+ )
+
+ # Main migrations table
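+    # Note: SQLite permits NULLs in non-INTEGER PRIMARY KEY columns; the
+    # schema relies on this since container rows carry no version/row_id/col_id.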
+ cursor.execute(
+ """
+ CREATE TABLE IF NOT EXISTS migrations (
+ id TEXT NOT NULL,
+ type INTEGER NOT NULL,
+ version INTEGER NULL,
+ row_id INTEGER NULL,
+ col_id INTEGER NULL,
+ parent_id NULL,
+ status INTEGER NOT NULL,
+ exception TEXT NULL,
+ from_storage_location_id NULL,
+ from_file_handle_id TEXT NULL,
+ to_file_handle_id TEXT NULL,
+ file_size INTEGER NULL,
+ PRIMARY KEY (id, type, row_id, col_id, version)
+ )
+ """
+ )
+
+ # Indexes for common queries
+ cursor.execute("CREATE INDEX IF NOT EXISTS ix_status ON migrations(status)")
+ cursor.execute(
+ "CREATE INDEX IF NOT EXISTS ix_file_handle_ids "
+ "ON migrations(from_file_handle_id, to_file_handle_id)"
+ )
+
+
+def _initialize_database(
+ db_path: str,
+ root_id: str,
+ dest_storage_location_id: str,
+ source_storage_location_ids: List[str],
+ file_version_strategy: str,
+ include_table_files: bool,
+) -> None:
+ """Initialize the migration database with schema and settings.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ root_id: The root entity ID being migrated.
+ dest_storage_location_id: Destination storage location ID.
+ source_storage_location_ids: List of source storage location IDs to filter.
+ file_version_strategy: Strategy for handling file versions.
+ include_table_files: Whether to include table-attached files.
+ """
+ import sqlite3
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ _ensure_schema(cursor)
+
+ # Check if settings already exist
+ existing = cursor.execute("SELECT settings FROM migration_settings").fetchone()
+
+ settings = MigrationSettings(
+ root_id=root_id,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ )
+
+ if existing:
+ # Verify settings match
+ existing_settings = json.loads(existing[0])
+ if existing_settings.get("root_id") != root_id:
+ raise ValueError(
+ f"Root entity ID mismatch: database has {existing_settings.get('root_id')}, "
+ f"but {root_id} was provided"
+ )
+ if (
+ existing_settings.get("dest_storage_location_id")
+ != dest_storage_location_id
+ ):
+ raise ValueError(
+ f"Destination storage location mismatch: database has "
+ f"{existing_settings.get('dest_storage_location_id')}, "
+ f"but {dest_storage_location_id} was provided"
+ )
+ else:
+ # Insert new settings
+ settings_json = json.dumps(
+ {
+ "root_id": settings.root_id,
+ "dest_storage_location_id": settings.dest_storage_location_id,
+ "source_storage_location_ids": settings.source_storage_location_ids,
+ "file_version_strategy": settings.file_version_strategy,
+ "include_table_files": settings.include_table_files,
+ }
+ )
+ cursor.execute(
+ "INSERT INTO migration_settings (settings) VALUES (?)",
+ (settings_json,),
+ )
+
+ conn.commit()
+
+
+def _retrieve_index_settings(db_path: str) -> Optional[Dict[str, Any]]:
+ """Retrieve index settings from the database.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+
+ Returns:
+ Dictionary of settings or None if not found.
+ """
+ import sqlite3
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ _ensure_schema(cursor)
+
+ row = cursor.execute("SELECT settings FROM migration_settings").fetchone()
+ if row:
+ return json.loads(row[0])
+ return None
+
+
+def _check_indexed(db_path: str, entity_id: str) -> bool:
+ """Check if an entity has already been indexed.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ entity_id: The entity ID to check.
+
+ Returns:
+ True if the entity is already indexed, False otherwise.
+ """
+ import sqlite3
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ row = cursor.execute(
+ "SELECT 1 FROM migrations WHERE id = ? LIMIT 1",
+ (entity_id,),
+ ).fetchone()
+ return row is not None
+
+
+def _mark_container_indexed(
+ db_path: str,
+ entity_id: str,
+ parent_id: Optional[str],
+ migration_type: MigrationType,
+) -> None:
+ """Mark a container (Project or Folder) as indexed.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ entity_id: The entity ID.
+ parent_id: The parent entity ID.
+ migration_type: The type of container.
+ """
+ import sqlite3
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ cursor.execute(
+ """
+ INSERT OR IGNORE INTO migrations (id, type, parent_id, status)
+ VALUES (?, ?, ?, ?)
+ """,
+ (
+ entity_id,
+ migration_type.value,
+ parent_id,
+ MigrationStatus.INDEXED.value,
+ ),
+ )
+ conn.commit()
+
+
+def _insert_file_migration(
+ db_path: str,
+ entity_id: str,
+ version: Optional[int],
+ parent_id: Optional[str],
+ from_storage_location_id: int,
+ from_file_handle_id: str,
+ file_size: int,
+ status: MigrationStatus,
+) -> None:
+ """Insert a file migration entry.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ entity_id: The file entity ID.
+ version: The file version (None for new version).
+ parent_id: The parent entity ID.
+ from_storage_location_id: Source storage location ID.
+ from_file_handle_id: Source file handle ID.
+ file_size: File size in bytes.
+ status: Migration status.
+ """
+ import sqlite3
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ cursor.execute(
+ """
+ INSERT OR IGNORE INTO migrations (
+ id, type, version, parent_id,
+ from_storage_location_id, from_file_handle_id,
+ file_size, status
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+ """,
+ (
+ entity_id,
+ MigrationType.FILE.value,
+ version,
+ parent_id,
+ from_storage_location_id,
+ from_file_handle_id,
+ file_size,
+ status.value,
+ ),
+ )
+ conn.commit()
+
+
+def _insert_table_file_migration(
+ db_path: str,
+ entity_id: str,
+ row_id: int,
+ col_id: int,
+ row_version: int,
+ parent_id: Optional[str],
+ from_storage_location_id: int,
+ from_file_handle_id: str,
+ file_size: int,
+ status: MigrationStatus,
+) -> None:
+ """Insert a table-attached file migration entry.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ entity_id: The table entity ID.
+ row_id: The table row ID.
+ col_id: The table column ID.
+ row_version: The row version.
+ parent_id: The parent entity ID.
+ from_storage_location_id: Source storage location ID.
+ from_file_handle_id: Source file handle ID.
+ file_size: File size in bytes.
+ status: Migration status.
+ """
+ import sqlite3
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ cursor.execute(
+ """
+ INSERT OR IGNORE INTO migrations (
+ id, type, row_id, col_id, version, parent_id,
+ from_storage_location_id, from_file_handle_id,
+ file_size, status
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ """,
+ (
+ entity_id,
+ MigrationType.TABLE_ATTACHED_FILE.value,
+ row_id,
+ col_id,
+ row_version,
+ parent_id,
+ from_storage_location_id,
+ from_file_handle_id,
+ file_size,
+ status.value,
+ ),
+ )
+ conn.commit()
+
+
+def _record_indexing_error(
+ db_path: str,
+ entity_id: str,
+ parent_id: Optional[str],
+ exception: Exception,
+) -> None:
+ """Record an indexing error in the database.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ entity_id: The entity ID that failed.
+ parent_id: The parent entity ID.
+ exception: The exception that occurred.
+ """
+ import sqlite3
+
+ tb_str = "".join(
+ traceback.format_exception(type(exception), exception, exception.__traceback__)
+ )
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ cursor.execute(
+ """
+ INSERT OR IGNORE INTO migrations (
+ id, type, parent_id, status, exception
+ ) VALUES (?, ?, ?, ?, ?)
+ """,
+ (
+ entity_id,
+ MigrationType.FILE.value, # Default type for errors
+ parent_id,
+ MigrationStatus.ERRORED.value,
+ tb_str,
+ ),
+ )
+ conn.commit()
+
+
+def _check_file_handle_exists(db_path: str, from_file_handle_id: str) -> Optional[str]:
+ """Check if a file handle has already been copied.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ from_file_handle_id: The source file handle ID.
+
+ Returns:
+ The destination file handle ID if found, None otherwise.
+ """
+ import sqlite3
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ row = cursor.execute(
+ """
+ SELECT to_file_handle_id FROM migrations
+ WHERE from_file_handle_id = ? AND to_file_handle_id IS NOT NULL
+ """,
+ (from_file_handle_id,),
+ ).fetchone()
+ return row[0] if row else None
+
+
+def _query_migration_batch(
+ db_path: str,
+ last_id: str,
+ last_version: int,
+ last_row_id: int,
+ last_col_id: int,
+ pending_file_handles: Set[str],
+ completed_file_handles: Set[str],
+ limit: int,
+) -> List[Dict[str, Any]]:
+ """Query the next batch of items to migrate.
+
+ This matches the original synapseutils query logic:
+ - Forward progress through entities ordered by id, type, row_id, col_id, version
+ - Backtracking to pick up files with completed file handles that were skipped
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ last_id: Last processed entity ID.
+ last_version: Last processed version.
+ last_row_id: Last processed row ID.
+ last_col_id: Last processed column ID.
+ pending_file_handles: Set of file handles currently being processed.
+ completed_file_handles: Set of file handles already completed.
+ limit: Maximum number of items to return.
+
+ Returns:
+ List of migration entries as dictionaries.
+ """
+ import sqlite3
+
+ if limit <= 0:
+ return []
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+
+ file_type = MigrationType.FILE.value
+ table_type = MigrationType.TABLE_ATTACHED_FILE.value
+ indexed_status = MigrationStatus.INDEXED.value
+
+ # Build the IN clauses for file handles
+ # We use string formatting for the IN clause since sqlite3 doesn't support array parameters
+ pending_in = (
+ "('" + "','".join(pending_file_handles) + "')"
+ if pending_file_handles
+ else "('')"
+ )
+ completed_in = (
+ "('" + "','".join(completed_file_handles) + "')"
+ if completed_file_handles
+ else "('')"
+ )
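+        # File handle IDs are numeric identifiers issued by Synapse, so
+        # splicing them into the statement as quoted literals is safe here.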
+
+ # Match the original synapseutils query structure exactly
+ # This handles:
+ # 1. Forward progress: entities after the current position
+ # 2. Backtracking: entities before current position that share completed file handles
+ query = f"""
+ SELECT
+ id,
+ type,
+ version,
+ row_id,
+ col_id,
+ from_file_handle_id,
+ file_size
+ FROM migrations
+ WHERE
+ status = :indexed_status
+ AND (
+ (
+ ((id > :id AND type IN (:file_type, :table_type))
+ OR (id = :id AND type = :file_type AND version IS NOT NULL AND version > :version)
+ OR (id = :id AND type = :table_type AND (row_id > :row_id OR (row_id = :row_id AND col_id > :col_id))))
+ AND from_file_handle_id NOT IN {pending_in}
+ ) OR
+ (
+ id <= :id
+ AND from_file_handle_id IN {completed_in}
+ )
+ )
+ ORDER BY
+ id,
+ type,
+ row_id,
+ col_id,
+ version
+ LIMIT :limit
+ """
+
+ params = {
+ "indexed_status": indexed_status,
+ "id": last_id,
+ "file_type": file_type,
+ "table_type": table_type,
+ "version": last_version,
+ "row_id": last_row_id,
+ "col_id": last_col_id,
+ "limit": limit,
+ }
+
+ results = cursor.execute(query, params)
+
+ batch = []
+ for row in results:
+ batch.append(
+ {
+ "id": row[0],
+ "type": MigrationType(row[1]),
+ "version": row[2],
+ "row_id": row[3],
+ "col_id": row[4],
+ "from_file_handle_id": row[5],
+ "file_size": row[6],
+ }
+ )
+ return batch
+
+
+def _update_migration_success(
+ db_path: str,
+ key: MigrationKey,
+ to_file_handle_id: str,
+) -> None:
+ """Update a migration entry as successful.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ key: The migration key.
+ to_file_handle_id: The destination file handle ID.
+ """
+ import sqlite3
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+
+ update_sql = """
+ UPDATE migrations SET status = ?, to_file_handle_id = ?
+ WHERE id = ? AND type = ?
+ """
+ params = [
+ MigrationStatus.MIGRATED.value,
+ to_file_handle_id,
+ key.id,
+ key.type.value,
+ ]
+
+ if key.version is not None:
+ update_sql += " AND version = ?"
+ params.append(key.version)
+ else:
+ update_sql += " AND version IS NULL"
+
+ if key.row_id is not None:
+ update_sql += " AND row_id = ?"
+ params.append(key.row_id)
+
+ if key.col_id is not None:
+ update_sql += " AND col_id = ?"
+ params.append(key.col_id)
+
+ cursor.execute(update_sql, tuple(params))
+ conn.commit()
+
+
+def _update_migration_error(
+ db_path: str,
+ key: MigrationKey,
+ exception: Exception,
+) -> None:
+ """Update a migration entry with an error.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ key: The migration key.
+ exception: The exception that occurred.
+ """
+ import sqlite3
+
+ tb_str = "".join(
+ traceback.format_exception(type(exception), exception, exception.__traceback__)
+ )
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+
+ update_sql = """
+ UPDATE migrations SET status = ?, exception = ?
+ WHERE id = ? AND type = ?
+ """
+ params = [MigrationStatus.ERRORED.value, tb_str, key.id, key.type.value]
+
+ if key.version is not None:
+ update_sql += " AND version = ?"
+ params.append(key.version)
+ else:
+ update_sql += " AND version IS NULL"
+
+ if key.row_id is not None:
+ update_sql += " AND row_id = ?"
+ params.append(key.row_id)
+
+ if key.col_id is not None:
+ update_sql += " AND col_id = ?"
+ params.append(key.col_id)
+
+ cursor.execute(update_sql, tuple(params))
+ conn.commit()
+
+
+def _confirm_migration(
+ db_path: str, dest_storage_location_id: str, force: bool
+) -> bool:
+ """Confirm migration with user if in interactive mode.
+
+ Arguments:
+ db_path: Path to the SQLite database file.
+ dest_storage_location_id: Destination storage location ID.
+ force: Whether to skip confirmation.
+
+ Returns:
+ True if migration should proceed, False otherwise.
+ """
+ import sqlite3
+
+ if force:
+ return True
+
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ count = cursor.execute(
+ "SELECT count(*) FROM migrations WHERE status = ?",
+ (MigrationStatus.INDEXED.value,),
+ ).fetchone()[0]
+
+ if count == 0:
+ logger.info("No items for migration.")
+ return False
+
+ if sys.stdout.isatty():
+ user_input = input(
+ f"{count} items for migration to {dest_storage_location_id}. Proceed? (y/n)? "
+ )
+ return user_input.strip().lower() == "y"
+ else:
+ logger.info(
+ "%s items for migration. "
+ "force option not used, and console input not available to confirm migration, aborting. "
+ "Use the force option or run from an interactive shell to proceed with migration.",
+ count,
+ )
+ return False
+
+
+def _get_part_size(file_size: int) -> int:
+ """Calculate the part size for multipart copy.
+
+ Arguments:
+ file_size: The file size in bytes.
+
+ Returns:
+ The part size in bytes.
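+
+    For example, a multi-terabyte file cannot be copied in parts of
+    DEFAULT_PART_SIZE without exceeding MAX_NUMBER_OF_PARTS, so the part
+    size is scaled up until the part count fits within the limit.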
+ """
+ import math
+
+ # Ensure we don't exceed max parts
+ min_part_size = math.ceil(file_size / MAX_NUMBER_OF_PARTS)
+ return max(DEFAULT_PART_SIZE, min_part_size)
+
+
+# =============================================================================
+# Storage Location Validation
+# =============================================================================
+
+
+async def _verify_storage_location_ownership_async(
+ storage_location_id: str,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Verify the user owns the destination storage location.
+
+ Arguments:
+ storage_location_id: The storage location ID to verify.
+ synapse_client: The Synapse client.
+
+ Raises:
+ ValueError: If the user does not own the storage location.
+ """
+ try:
+ await synapse_client.rest_get_async(f"/storageLocation/{storage_location_id}")
+ except Exception as ex:
+ raise ValueError(
+ f"Unable to verify ownership of storage location {storage_location_id}. "
+ f"You must be the creator of the destination storage location. Error: {ex}"
+ ) from ex
+
+
+def _include_file_in_migration(
+ file_handle: Dict[str, Any],
+ source_storage_location_ids: List[str],
+ dest_storage_location_id: str,
+) -> Optional[MigrationStatus]:
+ """Determine if a file should be included in migration.
+
+ Only S3 file handles can be migrated. External URLs and other file handle types
+ are skipped.
+
+ Arguments:
+ file_handle: The file handle metadata.
+ source_storage_location_ids: List of source storage locations to filter.
+ dest_storage_location_id: Destination storage location ID.
+
+ Returns:
+ MigrationStatus if file should be included, None otherwise.
+ """
+ # Only S3 file handles can be migrated
+ if file_handle.get("concreteType") != concrete_types.S3_FILE_HANDLE:
+ return None
+
+ from_storage_location_id = str(file_handle.get("storageLocationId", 1))
+
+ # Check if file matches the migration criteria:
+ # - If source_storage_location_ids is specified, from_storage_location must be in it
+ # OR already at the destination
+ # - If not specified, include all files not already at destination
+ if source_storage_location_ids:
+ if (
+ from_storage_location_id not in source_storage_location_ids
+ and from_storage_location_id != dest_storage_location_id
+ ):
+ return None
+
+ # Already at destination - mark as already migrated
+ if from_storage_location_id == dest_storage_location_id:
+ return MigrationStatus.ALREADY_MIGRATED
+
+ return MigrationStatus.INDEXED
+
+
+# =============================================================================
+# Public API Functions
+# =============================================================================
+
+
+async def index_files_for_migration_async(
+ entity_id: str,
+ dest_storage_location_id: str,
+ db_path: Optional[str] = None,
+ *,
+ source_storage_location_ids: Optional[List[str]] = None,
+ file_version_strategy: str = "new",
+ include_table_files: bool = False,
+ continue_on_error: bool = False,
+ synapse_client: Optional["Synapse"] = None,
+) -> MigrationResult:
+ """Index files for migration to a new storage location.
+
+ This is the first step in migrating files to a new storage location.
+ After indexing, use `migrate_indexed_files_async` to perform the actual migration.
+
+ Arguments:
+ entity_id: The Synapse entity ID to migrate (Project, Folder, File, or Table).
+ dest_storage_location_id: The destination storage location ID.
+ db_path: Path to create SQLite database. If None, uses temp directory.
+ source_storage_location_ids: Optional list of source storage locations to filter.
+ file_version_strategy: Strategy for file versions: "new", "all", "latest", "skip".
+ include_table_files: Whether to include files attached to tables.
+ continue_on_error: Whether to continue on individual errors.
+ synapse_client: Optional Synapse client instance.
+
+ Returns:
+ MigrationResult object for inspecting the index.
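+
+    Example: Indexing a project for migration
+        A minimal sketch; the entity and storage location IDs are placeholders:
+
+            result = await index_files_for_migration_async(
+                entity_id="syn123",
+                dest_storage_location_id="12345",
+            )
+            print(result.db_path)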
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+
+ # Validate parameters
+ valid_strategies = {"new", "all", "latest", "skip"}
+ if file_version_strategy not in valid_strategies:
+ raise ValueError(
+ f"Invalid file_version_strategy: {file_version_strategy}, "
+ f"must be one of {valid_strategies}"
+ )
+
+ if file_version_strategy == "skip" and not include_table_files:
+ raise ValueError(
+ "Skipping both file entities and table attached files, nothing to migrate"
+ )
+
+ # Convert to strings
+ dest_storage_location_id = str(dest_storage_location_id)
+ source_storage_location_ids = [str(s) for s in (source_storage_location_ids or [])]
+
+ # Verify ownership
+ await _verify_storage_location_ownership_async(
+ storage_location_id=dest_storage_location_id,
+ synapse_client=client,
+ )
+
+ # Create database path if not provided
+ if db_path is None:
+ db_path = _get_default_db_path(entity_id)
+
+ # Initialize database
+ await asyncio.to_thread(
+ _initialize_database,
+ db_path,
+ entity_id,
+ dest_storage_location_id,
+ source_storage_location_ids,
+ file_version_strategy,
+ include_table_files,
+ )
+
+ # Get entity and start indexing
+ entity = await client.get_async(entity_id, downloadFile=False)
+
+ try:
+ await _index_entity_async(
+ entity=entity,
+ parent_id=None,
+ db_path=db_path,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ continue_on_error=continue_on_error,
+ synapse_client=client,
+ )
+ except IndexingError as ex:
+ logger.exception(
+ "Aborted due to failure to index entity %s of type %s. "
+ "Use continue_on_error=True to skip individual failures.",
+ ex.entity_id,
+ ex.concrete_type,
+ )
+ raise ex
+
+ return MigrationResult(db_path=db_path, synapse_client=client)
+
+
+async def migrate_indexed_files_async(
+ db_path: str,
+ *,
+ create_table_snapshots: bool = True,
+ continue_on_error: bool = False,
+ force: bool = False,
+ max_concurrent_copies: Optional[int] = None,
+ synapse_client: Optional["Synapse"] = None,
+) -> Optional[MigrationResult]:
+ """Migrate files that have been indexed.
+
+ This is the second step in migrating files to a new storage location.
+ Files must first be indexed using `index_files_for_migration_async`.
+
+ Arguments:
+ db_path: Path to SQLite database created by index_files_for_migration_async.
+ create_table_snapshots: Whether to create table snapshots before migrating.
+ continue_on_error: Whether to continue on individual migration errors.
+ force: Whether to skip interactive confirmation.
+ max_concurrent_copies: Maximum concurrent file copy operations.
+ synapse_client: Optional Synapse client instance.
+
+ Returns:
+ MigrationResult object or None if migration was aborted.
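+
+    Example: Migrating previously indexed files
+        A minimal sketch; `index_result` is assumed to come from
+        `index_files_for_migration_async`:
+
+            result = await migrate_indexed_files_async(
+                db_path=index_result.db_path,
+                force=True,  # skip the interactive confirmation
+            )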
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+
+ # Retrieve settings
+ settings = await asyncio.to_thread(_retrieve_index_settings, db_path)
+ if settings is None:
+ raise ValueError(
+ f"Unable to retrieve existing index settings from '{db_path}'. "
+ "Either this path does not represent a previously created migration index "
+ "or the file is corrupt."
+ )
+
+ dest_storage_location_id = settings["dest_storage_location_id"]
+
+ # Confirm migration
+ confirmed = await asyncio.to_thread(
+ _confirm_migration, db_path, dest_storage_location_id, force
+ )
+ if not confirmed:
+ logger.info("Migration aborted.")
+ return None
+
+ # Determine concurrency
+ max_concurrent = max_concurrent_copies or max(client.max_threads // 2, 1)
+
+ # Execute migration
+ await _execute_migration_async(
+ db_path=db_path,
+ dest_storage_location_id=dest_storage_location_id,
+ create_table_snapshots=create_table_snapshots,
+ continue_on_error=continue_on_error,
+ max_concurrent=max_concurrent,
+ synapse_client=client,
+ )
+
+ return MigrationResult(db_path=db_path, synapse_client=client)
+
+
+# =============================================================================
+# Indexing Implementation
+# =============================================================================
+
+
+async def _index_entity_async(
+ entity: Any,
+ parent_id: Optional[str],
+ db_path: str,
+ dest_storage_location_id: str,
+ source_storage_location_ids: List[str],
+ file_version_strategy: str,
+ include_table_files: bool,
+ continue_on_error: bool,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Recursively index an entity and its children.
+
+ Arguments:
+ entity: The Synapse entity object.
+ parent_id: The parent entity ID.
+ db_path: Path to the SQLite database.
+ dest_storage_location_id: Destination storage location ID.
+ source_storage_location_ids: List of source storage locations to filter.
+ file_version_strategy: Strategy for file versions.
+ include_table_files: Whether to include table-attached files.
+ continue_on_error: Whether to continue on errors.
+ synapse_client: The Synapse client.
+ """
+ entity_id = utils.id_of(entity)
+ concrete_type = utils.concrete_type_of(entity)
+
+ # Check if already indexed
+ is_indexed = await asyncio.to_thread(_check_indexed, db_path, entity_id)
+ if is_indexed:
+ return
+
+ try:
+ if concrete_type == concrete_types.FILE_ENTITY:
+ if file_version_strategy != "skip":
+ await _index_file_entity_async(
+ entity_id=entity_id,
+ parent_id=parent_id,
+ db_path=db_path,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ synapse_client=synapse_client,
+ )
+
+ elif concrete_type == concrete_types.TABLE_ENTITY:
+ if include_table_files:
+ await _index_table_entity_async(
+ entity_id=entity_id,
+ parent_id=parent_id,
+ db_path=db_path,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ synapse_client=synapse_client,
+ )
+
+ elif concrete_type in (
+ concrete_types.FOLDER_ENTITY,
+ concrete_types.PROJECT_ENTITY,
+ ):
+ await _index_container_async(
+ entity_id=entity_id,
+ parent_id=parent_id,
+ db_path=db_path,
+ concrete_type=concrete_type,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ continue_on_error=continue_on_error,
+ synapse_client=synapse_client,
+ )
+
+ except IndexingError:
+ raise
+ except Exception as ex:
+ if continue_on_error:
+ logger.warning("Error indexing entity %s: %s", entity_id, ex)
+ await asyncio.to_thread(
+ _record_indexing_error, db_path, entity_id, parent_id, ex
+ )
+ else:
+ raise IndexingError(entity_id, concrete_type) from ex
+
+
+async def _index_file_entity_async(
+ entity_id: str,
+ parent_id: Optional[str],
+ db_path: str,
+ dest_storage_location_id: str,
+ source_storage_location_ids: List[str],
+ file_version_strategy: str,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Index a file entity for migration.
+
+ Arguments:
+ entity_id: The file entity ID.
+ parent_id: The parent entity ID.
+ db_path: Path to the SQLite database.
+ dest_storage_location_id: Destination storage location ID.
+ source_storage_location_ids: List of source storage locations to filter.
+ file_version_strategy: Strategy for file versions.
+ synapse_client: The Synapse client.
+ """
+ logger.info("Indexing file entity %s", entity_id)
+
+ entity_versions: List[Tuple[Any, Optional[int]]] = []
+
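+ # Collect (entity, version) pairs per strategy: "new" records a single
+ # entry with version=None (a brand-new version is created at migration
+ # time), "all" records every existing version, and "latest" records only
+ # the current version, which is updated in place.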
+ if file_version_strategy == "new":
+ entity = await synapse_client.get_async(entity_id, downloadFile=False)
+ entity_versions.append((entity, None))
+
+ elif file_version_strategy == "all":
+ # Get all versions
+ async for version in _get_version_numbers_async(entity_id, synapse_client):
+ entity = await synapse_client.get_async(
+ entity_id, version=version, downloadFile=False
+ )
+ entity_versions.append((entity, version))
+
+ elif file_version_strategy == "latest":
+ entity = await synapse_client.get_async(entity_id, downloadFile=False)
+ entity_versions.append((entity, entity.versionNumber))
+
+ for entity, version in entity_versions:
+ file_handle = entity._file_handle
+ status = _include_file_in_migration(
+ file_handle, source_storage_location_ids, dest_storage_location_id
+ )
+ if status:
+ await asyncio.to_thread(
+ _insert_file_migration,
+ db_path,
+ entity_id,
+ version,
+ parent_id,
+ file_handle["storageLocationId"],
+ entity.dataFileHandleId,
+ file_handle["contentSize"],
+ status,
+ )
+
+
+async def _get_version_numbers_async(
+ entity_id: str,
+ synapse_client: "Synapse",
+) -> AsyncGenerator[int, None]:
+ """Get all version numbers for an entity.
+
+ Arguments:
+ entity_id: The entity ID.
+ synapse_client: The Synapse client.
+
+ Yields:
+ Version numbers.
+ """
+ offset = 0
+ limit = 100
+
+ while True:
+ response = await synapse_client.rest_get_async(
+ f"/entity/{entity_id}/version?offset={offset}&limit={limit}"
+ )
+ results = response.get("results", [])
+
+ for version_info in results:
+ yield version_info["versionNumber"]
+
+ if len(results) < limit:
+ break
+ offset += limit
+
+
+async def _index_table_entity_async(
+ entity_id: str,
+ parent_id: Optional[str],
+ db_path: str,
+ dest_storage_location_id: str,
+ source_storage_location_ids: List[str],
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Index a table entity's file attachments for migration.
+
+ Arguments:
+ entity_id: The table entity ID.
+ parent_id: The parent entity ID.
+ db_path: Path to the SQLite database.
+ dest_storage_location_id: Destination storage location ID.
+ source_storage_location_ids: List of source storage locations to filter.
+ synapse_client: The Synapse client.
+ """
+ logger.info("Indexing table entity %s", entity_id)
+
+ # Get file handle columns using the async API
+ columns = await get_columns(table_id=entity_id, synapse_client=synapse_client)
+ file_handle_columns = [c for c in columns if c.column_type == "FILEHANDLEID"]
+
+ if not file_handle_columns:
+ return
+
+ # Query table for file handles using local helper
+ file_column_select = _join_column_names(file_handle_columns)
+
+ # tableQuery is still a synchronous method on the Synapse client
+ results = await asyncio.to_thread(
+ synapse_client.tableQuery,
+ f"SELECT {file_column_select} FROM {entity_id}",
+ )
+
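+ # Each result row is laid out as [ROW_ID, ROW_VERSION, <file handle
+ # columns in the order they were selected above>].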
+ for row in results:
+ row_id, row_version = row[:2]
+ file_handle_ids = row[2:]
+
+ for i, file_handle_id in enumerate(file_handle_ids):
+ if not file_handle_id:
+ continue
+
+ col_id = file_handle_columns[i].id
+
+ # Get file handle metadata using the async API
+ fh_response = await get_file_handle_for_download_async(
+ file_handle_id=str(file_handle_id),
+ synapse_id=entity_id,
+ entity_type="TableEntity",
+ synapse_client=synapse_client,
+ )
+ file_handle = fh_response["fileHandle"]
+
+ status = _include_file_in_migration(
+ file_handle, source_storage_location_ids, dest_storage_location_id
+ )
+ if status:
+ await asyncio.to_thread(
+ _insert_table_file_migration,
+ db_path,
+ entity_id,
+ row_id,
+ int(col_id),
+ row_version,
+ parent_id,
+ file_handle["storageLocationId"],
+ file_handle_id,
+ file_handle["contentSize"],
+ status,
+ )
+
+
+async def _index_container_async(
+ entity_id: str,
+ parent_id: Optional[str],
+ db_path: str,
+ concrete_type: str,
+ dest_storage_location_id: str,
+ source_storage_location_ids: List[str],
+ file_version_strategy: str,
+ include_table_files: bool,
+ continue_on_error: bool,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Index a container (Project or Folder) and its children.
+
+ Arguments:
+ entity_id: The container entity ID.
+ parent_id: The parent entity ID.
+ db_path: Path to the SQLite database.
+ concrete_type: The concrete type of the container.
+ dest_storage_location_id: Destination storage location ID.
+ source_storage_location_ids: List of source storage locations to filter.
+ file_version_strategy: Strategy for file versions.
+ include_table_files: Whether to include table-attached files.
+ continue_on_error: Whether to continue on errors.
+ synapse_client: The Synapse client.
+ """
+ logger.info("Indexing container %s", entity_id)
+
+ # Determine included types
+ include_types = []
+ if file_version_strategy != "skip":
+ include_types.extend(["folder", "file"])
+ if include_table_files:
+ include_types.append("table")
+
+ # Get children using the async API
+ children = []
+ async for child in get_children(
+ parent=entity_id,
+ include_types=include_types,
+ synapse_client=synapse_client,
+ ):
+ children.append(child)
+
+ # Use bounded concurrency for indexing children
+ semaphore = asyncio.Semaphore(10)
+
+ async def index_child(child: Dict[str, Any]) -> None:
+ async with semaphore:
+ child_entity = await synapse_client.get_async(
+ child["id"], downloadFile=False
+ )
+ await _index_entity_async(
+ entity=child_entity,
+ parent_id=entity_id,
+ db_path=db_path,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ continue_on_error=continue_on_error,
+ synapse_client=synapse_client,
+ )
+
+ # Await children as they complete so the first failure surfaces promptly
+ tasks = [asyncio.create_task(index_child(child)) for child in children]
+ for task in asyncio.as_completed(tasks):
+ await task
+
+ # Mark container as indexed
+ migration_type = (
+ MigrationType.PROJECT
+ if concrete_type == concrete_types.PROJECT_ENTITY
+ else MigrationType.FOLDER
+ )
+ await asyncio.to_thread(
+ _mark_container_indexed, db_path, entity_id, parent_id, migration_type
+ )
+
+
+# =============================================================================
+# Migration Execution
+# =============================================================================
+
+
+async def _execute_migration_async(
+ db_path: str,
+ dest_storage_location_id: str,
+ create_table_snapshots: bool,
+ continue_on_error: bool,
+ max_concurrent: int,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Execute the actual file migration.
+
+ Arguments:
+ db_path: Path to the SQLite database.
+ dest_storage_location_id: Destination storage location ID.
+ create_table_snapshots: Whether to create table snapshots.
+ continue_on_error: Whether to continue on errors.
+ max_concurrent: Maximum concurrent operations.
+ synapse_client: The Synapse client.
+ """
+ pending_file_handles: Set[str] = set()
+ completed_file_handles: Set[str] = set()
+ pending_keys: Set[MigrationKey] = set()
+ table_snapshots_created: Set[str] = set()
+
+ semaphore = asyncio.Semaphore(max_concurrent)
+ active_tasks: Set[asyncio.Task] = set()
+
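+ # Keyset-pagination cursor: each batch query resumes after the last
+ # (id, version, row_id, col_id) tuple processed in the previous batch.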
+ last_id = ""
+ last_version = -1
+ last_row_id = -1
+ last_col_id = -1
+
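+ # Drain the index in batches: query the next slice, launch bounded copy
+ # tasks, then harvest completions until no rows and no tasks remain.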
+ while True:
+ # Query next batch
+ batch = await asyncio.to_thread(
+ _query_migration_batch,
+ db_path,
+ last_id,
+ last_version,
+ last_row_id,
+ last_col_id,
+ pending_file_handles,
+ completed_file_handles,
+ min(BATCH_SIZE, max_concurrent - len(active_tasks)),
+ )
+
+ if not batch and not active_tasks:
+ break
+
+ # Process batch items
+ for item in batch:
+ key = MigrationKey(
+ id=item["id"],
+ type=item["type"],
+ version=item["version"],
+ row_id=item["row_id"],
+ col_id=item["col_id"],
+ )
+
+ if key in pending_keys:
+ continue
+
+ pending_keys.add(key)
+ from_file_handle_id = item["from_file_handle_id"]
+
+ # Check for existing copy
+ to_file_handle_id = await asyncio.to_thread(
+ _check_file_handle_exists, db_path, from_file_handle_id
+ )
+
+ if not to_file_handle_id:
+ pending_file_handles.add(from_file_handle_id)
+
+ # Create table snapshot if needed using the async API
+ if (
+ item["type"] == MigrationType.TABLE_ATTACHED_FILE
+ and create_table_snapshots
+ and item["id"] not in table_snapshots_created
+ ):
+ await create_table_snapshot(
+ table_id=item["id"],
+ synapse_client=synapse_client,
+ )
+ table_snapshots_created.add(item["id"])
+
+ # Create migration task
+ task = asyncio.create_task(
+ _migrate_item_async(
+ key=key,
+ from_file_handle_id=from_file_handle_id,
+ to_file_handle_id=to_file_handle_id,
+ file_size=item["file_size"] or 0,
+ dest_storage_location_id=dest_storage_location_id,
+ semaphore=semaphore,
+ synapse_client=synapse_client,
+ )
+ )
+ active_tasks.add(task)
+
+ # Update tracking for next batch
+ last_id = item["id"]
+ last_version = item["version"] if item["version"] is not None else -1
+ last_row_id = item["row_id"] if item["row_id"] is not None else -1
+ last_col_id = item["col_id"] if item["col_id"] is not None else -1
+
+ # Wait for tasks if at capacity or end of batch
+ if active_tasks and (
+ len(active_tasks) >= max_concurrent or len(batch) < BATCH_SIZE
+ ):
+ done, active_tasks = await asyncio.wait(
+ active_tasks,
+ return_when=asyncio.FIRST_COMPLETED,
+ )
+
+ for completed_task in done:
+ try:
+ result = completed_task.result()
+ key = result["key"]
+ from_fh_id = result["from_file_handle_id"]
+ to_fh_id = result["to_file_handle_id"]
+
+ # Update database
+ await asyncio.to_thread(
+ _update_migration_success, db_path, key, to_fh_id
+ )
+
+ completed_file_handles.add(from_fh_id)
+ pending_file_handles.discard(from_fh_id)
+ pending_keys.discard(key)
+
+ except Exception as ex:
+ if hasattr(ex, "key"):
+ key = ex.key
+ await asyncio.to_thread(
+ _update_migration_error, db_path, key, ex.__cause__ or ex
+ )
+ pending_keys.discard(key)
+
+ if not continue_on_error:
+ # Cancel remaining tasks
+ for task in active_tasks:
+ task.cancel()
+ raise
+
+ # Wait for any remaining tasks
+ if active_tasks:
+ done, _ = await asyncio.wait(active_tasks)
+ for completed_task in done:
+ try:
+ result = completed_task.result()
+ await asyncio.to_thread(
+ _update_migration_success,
+ db_path,
+ result["key"],
+ result["to_file_handle_id"],
+ )
+ except Exception as ex:
+ if hasattr(ex, "key"):
+ await asyncio.to_thread(
+ _update_migration_error, db_path, ex.key, ex.__cause__ or ex
+ )
+ if not continue_on_error:
+ raise
+
+
+async def _migrate_item_async(
+ key: MigrationKey,
+ from_file_handle_id: str,
+ to_file_handle_id: Optional[str],
+ file_size: int,
+ dest_storage_location_id: str,
+ semaphore: asyncio.Semaphore,
+ *,
+ synapse_client: "Synapse",
+) -> Dict[str, Any]:
+ """Migrate a single item.
+
+ Arguments:
+ key: The migration key.
+ from_file_handle_id: Source file handle ID.
+ to_file_handle_id: Destination file handle ID (if already copied).
+ file_size: File size in bytes.
+ dest_storage_location_id: Destination storage location ID.
+ semaphore: Concurrency semaphore.
+ synapse_client: The Synapse client.
+
+ Returns:
+ Dictionary with key, from_file_handle_id, to_file_handle_id.
+ """
+ async with semaphore:
+ try:
+ # Copy file handle if needed
+ if not to_file_handle_id:
+ source_association = {
+ "fileHandleId": from_file_handle_id,
+ "associateObjectId": key.id,
+ "associateObjectType": (
+ "FileEntity"
+ if key.type == MigrationType.FILE
+ else "TableEntity"
+ ),
+ }
+
+ # Use thread for multipart_copy (it uses threading internally)
+ to_file_handle_id = await asyncio.to_thread(
+ multipart_copy,
+ synapse_client,
+ source_association,
+ dest_storage_location_id,
+ part_size=_get_part_size(file_size),
+ )
+
+ # Update entity with new file handle
+ if key.type == MigrationType.FILE:
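+ # version=None means the index was built with the "new" strategy:
+ # store the copied handle as a new file version rather than
+ # rewriting an existing one.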
+ if key.version is None:
+ await _create_new_file_version_async(
+ entity_id=key.id,
+ to_file_handle_id=to_file_handle_id,
+ synapse_client=synapse_client,
+ )
+ else:
+ await _update_file_version_async(
+ entity_id=key.id,
+ version=key.version,
+ from_file_handle_id=from_file_handle_id,
+ to_file_handle_id=to_file_handle_id,
+ synapse_client=synapse_client,
+ )
+ elif key.type == MigrationType.TABLE_ATTACHED_FILE:
+ await _update_table_file_async(
+ entity_id=key.id,
+ row_id=key.row_id,
+ col_id=key.col_id,
+ to_file_handle_id=to_file_handle_id,
+ synapse_client=synapse_client,
+ )
+
+ return {
+ "key": key,
+ "from_file_handle_id": from_file_handle_id,
+ "to_file_handle_id": to_file_handle_id,
+ }
+
+ except Exception as ex:
+ raise MigrationError(
+ key, from_file_handle_id, to_file_handle_id
+ ) from ex
+
+
+async def _create_new_file_version_async(
+ entity_id: str,
+ to_file_handle_id: str,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Create a new version of a file entity with the new file handle.
+
+ Arguments:
+ entity_id: The file entity ID.
+ to_file_handle_id: The new file handle ID.
+ synapse_client: The Synapse client.
+ """
+ entity = await synapse_client.get_async(entity_id, downloadFile=False)
+ entity.dataFileHandleId = to_file_handle_id
+ await synapse_client.store_async(entity)
+
+
+async def _update_file_version_async(
+ entity_id: str,
+ version: int,
+ from_file_handle_id: str,
+ to_file_handle_id: str,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Update an existing file version's file handle.
+
+ Arguments:
+ entity_id: The file entity ID.
+ version: The version number.
+ from_file_handle_id: The original file handle ID.
+ to_file_handle_id: The new file handle ID.
+ synapse_client: The Synapse client.
+ """
+ await synapse_client.rest_put_async(
+ f"/entity/{entity_id}/version/{version}/filehandle",
+ body=json.dumps(
+ {
+ "oldFileHandleId": from_file_handle_id,
+ "newFileHandleId": to_file_handle_id,
+ }
+ ),
+ )
+
+
+async def _update_table_file_async(
+ entity_id: str,
+ row_id: int,
+ col_id: int,
+ to_file_handle_id: str,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Update a table cell with a new file handle.
+
+ Arguments:
+ entity_id: The table entity ID.
+ row_id: The row ID.
+ col_id: The column ID.
+ to_file_handle_id: The new file handle ID.
+ synapse_client: The Synapse client.
+ """
+ # Create the partial row update using new OOP models
+ partial_row = PartialRow(
+ row_id=str(row_id),
+ values=[{"key": str(col_id), "value": to_file_handle_id}],
+ )
+ partial_row_set = PartialRowSet(
+ table_id=entity_id,
+ rows=[partial_row],
+ )
+ appendable_request = AppendableRowSetRequest(
+ entity_id=entity_id,
+ to_append=partial_row_set,
+ )
+
+ # Execute the update using TableUpdateTransaction
+ transaction = TableUpdateTransaction(
+ entity_id=entity_id,
+ changes=[appendable_request],
+ )
+ await transaction.send_job_and_wait_async(synapse_client=synapse_client)
diff --git a/synapseclient/models/services/migration_types.py b/synapseclient/models/services/migration_types.py
new file mode 100644
index 000000000..a20cc008d
--- /dev/null
+++ b/synapseclient/models/services/migration_types.py
@@ -0,0 +1,371 @@
+"""
+Data classes and enums for the async migration service.
+
+These types are used to track the state of file migrations between storage locations.
+"""
+
+import asyncio
+import csv
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional
+
+from synapseclient.core.constants import concrete_types
+
+if TYPE_CHECKING:
+ from synapseclient import Synapse
+
+
+class MigrationStatus(Enum):
+ """Status of a migration entry in the tracking database."""
+
+ INDEXED = 1
+ """The file has been indexed and is ready to be migrated."""
+
+ MIGRATED = 2
+ """The file has been successfully migrated to the new storage location."""
+
+ ALREADY_MIGRATED = 3
+ """The file was already at the destination storage location."""
+
+ ERRORED = 4
+ """An error occurred during indexing or migration."""
+
+
+class MigrationType(Enum):
+ """Type of entity being tracked in the migration database."""
+
+ PROJECT = 1
+ """A project container (used for tracking indexed containers)."""
+
+ FOLDER = 2
+ """A folder container (used for tracking indexed containers)."""
+
+ FILE = 3
+ """A file entity."""
+
+ TABLE_ATTACHED_FILE = 4
+ """A file attached to a table column."""
+
+ @classmethod
+ def from_concrete_type(cls, concrete_type: str) -> "MigrationType":
+ """Convert a Synapse concrete type string to a MigrationType.
+
+ Arguments:
+ concrete_type: The concrete type string from Synapse API.
+
+ Returns:
+ The corresponding MigrationType enum value.
+
+ Raises:
+ ValueError: If the concrete type is not recognized.
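+
+ Example: Using this function
+ Map a Synapse concreteType string onto the enum:
+
+ from synapseclient.core.constants import concrete_types
+
+ migration_type = MigrationType.from_concrete_type(
+ concrete_types.FILE_ENTITY
+ )
+ assert migration_type == MigrationType.FILE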
+ """
+ if concrete_type == concrete_types.PROJECT_ENTITY:
+ return cls.PROJECT
+ elif concrete_type == concrete_types.FOLDER_ENTITY:
+ return cls.FOLDER
+ elif concrete_type == concrete_types.FILE_ENTITY:
+ return cls.FILE
+ elif concrete_type == concrete_types.TABLE_ENTITY:
+ return cls.TABLE_ATTACHED_FILE
+
+ raise ValueError(f"Unhandled concrete type: {concrete_type}")
+
+
+@dataclass
+class MigrationKey:
+ """Unique identifier for a migration entry in the tracking database.
+
+ Attributes:
+ id: The Synapse entity ID.
+ type: The type of entity being migrated.
+ version: The file version number (None for new versions or containers).
+ row_id: The table row ID (for table attached files).
+ col_id: The table column ID (for table attached files).
+ """
+
+ id: str
+ type: MigrationType
+ version: Optional[int] = None
+ row_id: Optional[int] = None
+ col_id: Optional[int] = None
+
+ def __hash__(self) -> int:
+ return hash((self.id, self.type, self.version, self.row_id, self.col_id))
+
+ def __eq__(self, other: object) -> bool:
+ if not isinstance(other, MigrationKey):
+ return False
+ return (
+ self.id == other.id
+ and self.type == other.type
+ and self.version == other.version
+ and self.row_id == other.row_id
+ and self.col_id == other.col_id
+ )
+
+
+@dataclass
+class MigrationEntry:
+ """A single migration entry with full details.
+
+ Attributes:
+ key: The unique identifier for this migration entry.
+ parent_id: The parent entity ID.
+ from_storage_location_id: The original storage location ID.
+ from_file_handle_id: The original file handle ID.
+ to_file_handle_id: The new file handle ID after migration.
+ file_size: The file size in bytes.
+ status: The current migration status.
+ exception: Stack trace if an error occurred.
+ """
+
+ key: MigrationKey
+ parent_id: Optional[str] = None
+ from_storage_location_id: Optional[int] = None
+ from_file_handle_id: Optional[str] = None
+ to_file_handle_id: Optional[str] = None
+ file_size: Optional[int] = None
+ status: MigrationStatus = MigrationStatus.INDEXED
+ exception: Optional[str] = None
+
+
+@dataclass
+class MigrationSettings:
+ """Settings for a migration index stored in the database.
+
+ Attributes:
+ root_id: The root entity ID being migrated.
+ dest_storage_location_id: The destination storage location ID.
+ source_storage_location_ids: List of source storage location IDs to filter.
+ file_version_strategy: Strategy for handling file versions.
+ include_table_files: Whether to include files attached to tables.
+ """
+
+ root_id: str
+ dest_storage_location_id: str
+ source_storage_location_ids: List[str] = field(default_factory=list)
+ file_version_strategy: str = "new"
+ include_table_files: bool = False
+
+
+@dataclass
+class MigrationResult:
+ """Result of a migration operation - proxy to the SQLite tracking database.
+
+ This class provides methods to query the migration database for status counts,
+ individual migration entries, and CSV export.
+
+ Attributes:
+ db_path: Path to the SQLite database file.
+ synapse_client: Optional Synapse client for column name lookups.
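+
+ Example: Inspecting a migration result
+ A short sketch of reading status counts and exporting a report,
+ assuming a migration index already exists at the given path:
+
+ from synapseclient.models.services.migration_types import MigrationResult
+
+ result = MigrationResult(db_path="/tmp/migration_index.db")
+ print(result.counts_by_status)
+ result.as_csv("/tmp/migration_report.csv")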
+ """
+
+ db_path: str
+ synapse_client: Optional["Synapse"] = None
+
+ @property
+ def counts_by_status(self) -> Dict[str, int]:
+ """Get counts by migration status (synchronous).
+
+ Returns:
+ Dictionary mapping status names to counts.
+ """
+ return self.get_counts_by_status()
+
+ def get_counts_by_status(self) -> Dict[str, int]:
+ """Get counts by migration status (synchronous).
+
+ Returns:
+ Dictionary mapping status names to counts.
+ """
+ import sqlite3
+
+ with sqlite3.connect(self.db_path) as conn:
+ cursor = conn.cursor()
+
+ # Only count FILE and TABLE_ATTACHED_FILE entries
+ result = cursor.execute(
+ "SELECT status, count(*) FROM migrations "
+ "WHERE type IN (?, ?) GROUP BY status",
+ (MigrationType.FILE.value, MigrationType.TABLE_ATTACHED_FILE.value),
+ )
+
+ counts = {status.name: 0 for status in MigrationStatus}
+ for row in result:
+ status_value = row[0]
+ count = row[1]
+ counts[MigrationStatus(status_value).name] = count
+
+ return counts
+
+ async def get_counts_by_status_async(self) -> Dict[str, int]:
+ """Get counts by migration status (asynchronous).
+
+ Returns:
+ Dictionary mapping status names to counts.
+ """
+ return await asyncio.to_thread(self.get_counts_by_status)
+
+ def get_migrations(self) -> Iterator[Dict[str, Any]]:
+ """Iterate over all migration entries (synchronous).
+
+ Yields:
+ Dictionary for each migration entry with keys:
+ id, type, version, row_id, col_name, from_storage_location_id,
+ from_file_handle_id, to_file_handle_id, file_size, status, exception.
+ """
+ import sqlite3
+
+ with sqlite3.connect(self.db_path) as conn:
+ cursor = conn.cursor()
+
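+ # Page through the migrations table by rowid so only one batch of
+ # rows is held in memory at a time.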
+ batch_size = 500
+ rowid = -1
+ column_names_cache: Dict[int, str] = {}
+
+ while True:
+ results = cursor.execute(
+ """
+ SELECT
+ rowid,
+ id,
+ type,
+ version,
+ row_id,
+ col_id,
+ from_storage_location_id,
+ from_file_handle_id,
+ to_file_handle_id,
+ file_size,
+ status,
+ exception
+ FROM migrations
+ WHERE
+ rowid > ?
+ AND type IN (?, ?)
+ ORDER BY rowid
+ LIMIT ?
+ """,
+ (
+ rowid,
+ MigrationType.FILE.value,
+ MigrationType.TABLE_ATTACHED_FILE.value,
+ batch_size,
+ ),
+ )
+
+ rows = results.fetchall()
+ if not rows:
+ break
+
+ for row in rows:
+ rowid = row[0]
+ col_id = row[5]
+
+ # Resolve column name if needed
+ col_name = None
+ if col_id is not None and self.synapse_client:
+ if col_id not in column_names_cache:
+ try:
+ col_info = self.synapse_client.restGET(
+ f"/column/{col_id}"
+ )
+ column_names_cache[col_id] = col_info.get("name", "")
+ except Exception:
+ column_names_cache[col_id] = ""
+ col_name = column_names_cache[col_id]
+
+ yield {
+ "id": row[1],
+ "type": (
+ "file" if row[2] == MigrationType.FILE.value else "table"
+ ),
+ "version": row[3],
+ "row_id": row[4],
+ "col_name": col_name,
+ "from_storage_location_id": row[6],
+ "from_file_handle_id": row[7],
+ "to_file_handle_id": row[8],
+ "file_size": row[9],
+ "status": MigrationStatus(row[10]).name,
+ "exception": row[11],
+ }
+
+ async def get_migrations_async(self) -> List[Dict[str, Any]]:
+ """Get all migration entries (asynchronous).
+
+ Returns:
+ List of dictionaries for each migration entry.
+ """
+ # Materialize the generator inside the worker thread; returning it
+ # lazily would run the SQLite reads outside of to_thread
+ return await asyncio.to_thread(lambda: list(self.get_migrations()))
+
+ def as_csv(self, path: str) -> None:
+ """Export migration results to a CSV file (synchronous).
+
+ Arguments:
+ path: Path to write the CSV file.
+ """
+ fieldnames = [
+ "id",
+ "type",
+ "version",
+ "row_id",
+ "col_name",
+ "from_storage_location_id",
+ "from_file_handle_id",
+ "to_file_handle_id",
+ "file_size",
+ "status",
+ "exception",
+ ]
+
+ with open(path, "w", newline="") as csvfile:
+ writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+ writer.writeheader()
+ for migration in self.get_migrations():
+ writer.writerow(migration)
+
+ async def as_csv_async(self, path: str) -> None:
+ """Export migration results to a CSV file (asynchronous).
+
+ Arguments:
+ path: Path to write the CSV file.
+ """
+ await asyncio.to_thread(self.as_csv, path)
+
+
+class MigrationError(Exception):
+ """Error during a migration operation.
+
+ Attributes:
+ key: The migration key that failed.
+ from_file_handle_id: The source file handle ID.
+ to_file_handle_id: The destination file handle ID (if partially complete).
+ """
+
+ def __init__(
+ self,
+ key: MigrationKey,
+ from_file_handle_id: str,
+ to_file_handle_id: Optional[str] = None,
+ ):
+ self.key = key
+ self.from_file_handle_id = from_file_handle_id
+ self.to_file_handle_id = to_file_handle_id
+ super().__init__(f"Migration failed for {key.id}")
+
+
+class IndexingError(Exception):
+ """Error during an indexing operation.
+
+ Attributes:
+ entity_id: The entity ID that failed to index.
+ concrete_type: The concrete type of the entity.
+ """
+
+ def __init__(self, entity_id: str, concrete_type: str):
+ self.entity_id = entity_id
+ self.concrete_type = concrete_type
+ super().__init__(f"Indexing failed for {entity_id} ({concrete_type})")
diff --git a/synapseclient/models/storage_location.py b/synapseclient/models/storage_location.py
new file mode 100644
index 000000000..664276855
--- /dev/null
+++ b/synapseclient/models/storage_location.py
@@ -0,0 +1,600 @@
+"""StorageLocation model for managing storage location settings in Synapse."""
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+
+from synapseclient import Synapse
+from synapseclient.api.storage_location_services import (
+ create_storage_location_setting,
+ get_storage_location_setting,
+)
+from synapseclient.core.async_utils import async_to_sync, otel_trace_method
+from synapseclient.models.protocols.storage_location_protocol import (
+ StorageLocationSynchronousProtocol,
+)
+
+if TYPE_CHECKING:
+ from synapseclient.models import Folder
+
+
+class StorageLocationType(str, Enum):
+ """Enumeration of storage location types supported by Synapse.
+
+ Each type maps to a specific concreteType suffix in the REST API.
+
+ Attributes:
+ SYNAPSE_S3: Synapse-managed S3 storage (default).
+ EXTERNAL_S3: User-owned Amazon S3 bucket accessed by Synapse.
+ EXTERNAL_GOOGLE_CLOUD: User-owned Google Cloud Storage bucket.
+ EXTERNAL_SFTP: External SFTP server not accessed by Synapse.
+ EXTERNAL_OBJECT_STORE: S3-like bucket (e.g., AWS S3 or OpenStack) not
+ accessed by Synapse.
+ PROXY: A proxy server that controls access to storage.
+ """
+
+ SYNAPSE_S3 = "S3StorageLocationSetting"
+ EXTERNAL_S3 = "ExternalS3StorageLocationSetting"
+ EXTERNAL_GOOGLE_CLOUD = "ExternalGoogleCloudStorageLocationSetting"
+ EXTERNAL_SFTP = "ExternalStorageLocationSetting"
+ EXTERNAL_OBJECT_STORE = "ExternalObjectStorageLocationSetting"
+ PROXY = "ProxyStorageLocationSettings"
+
+
+class UploadType(str, Enum):
+ """Enumeration of upload types for storage locations.
+
+ Attributes:
+ S3: Amazon S3 compatible upload.
+ GOOGLE_CLOUD_STORAGE: Google Cloud Storage upload.
+ SFTP: SFTP upload.
+ HTTPS: HTTPS upload (typically used with proxy storage).
+ NONE: No upload type specified.
+ """
+
+ S3 = "S3"
+ GOOGLE_CLOUD_STORAGE = "GOOGLECLOUDSTORAGE"
+ SFTP = "SFTP"
+ HTTPS = "HTTPS"
+ NONE = "NONE"
+
+
+# Mapping from StorageLocationType to default UploadType
+_STORAGE_TYPE_TO_UPLOAD_TYPE: Dict[StorageLocationType, UploadType] = {
+ StorageLocationType.SYNAPSE_S3: UploadType.S3,
+ StorageLocationType.EXTERNAL_S3: UploadType.S3,
+ StorageLocationType.EXTERNAL_GOOGLE_CLOUD: UploadType.GOOGLE_CLOUD_STORAGE,
+ StorageLocationType.EXTERNAL_SFTP: UploadType.SFTP,
+ StorageLocationType.EXTERNAL_OBJECT_STORE: UploadType.S3,
+ StorageLocationType.PROXY: UploadType.HTTPS,
+}
+
+# Mapping from concreteType suffix to StorageLocationType
+_CONCRETE_TYPE_TO_STORAGE_TYPE: Dict[str, StorageLocationType] = {
+ storage_type.value: storage_type for storage_type in StorageLocationType
+}
+
+
+@dataclass()
+@async_to_sync
+class StorageLocation(StorageLocationSynchronousProtocol):
+ """A storage location setting describes where files are uploaded to and
+ downloaded from via Synapse. Storage location settings may be created for
+ external locations, such as user-owned Amazon S3 buckets, Google Cloud
+ Storage buckets, SFTP servers, or proxy storage.
+
+ Attributes:
+ storage_location_id: (Read Only) The unique ID for this storage location,
+ assigned by the server on creation.
+ storage_type: The type of storage location. Required when creating a new
+ storage location via `store()`. Determines the `concreteType` sent to
+ the Synapse REST API.
+ banner: The banner text to display to a user every time a file is uploaded.
+ This field is optional.
+ description: A description of the storage location. This description is
+ shown when a user has to choose which upload destination to use.
+
+ Attributes:
+ bucket: The name of the S3 or Google Cloud Storage bucket. Applicable to
+ SYNAPSE_S3, EXTERNAL_S3, EXTERNAL_GOOGLE_CLOUD, and
+ EXTERNAL_OBJECT_STORE types.
+ base_key: The optional base key (prefix/folder) within the bucket.
+ Applicable to SYNAPSE_S3, EXTERNAL_S3, and EXTERNAL_GOOGLE_CLOUD types.
+ sts_enabled: Whether STS (AWS Security Token Service) is enabled on this
+ storage location. Applicable to SYNAPSE_S3 and EXTERNAL_S3 types.
+ endpoint_url: The endpoint URL of the S3 service. Applicable to
+ EXTERNAL_S3 (default: https://s3.amazonaws.com) and
+ EXTERNAL_OBJECT_STORE types.
+
+ Attributes:
+ url: The base URL for uploading to the external destination. Applicable to
+ EXTERNAL_SFTP type.
+ supports_subfolders: Whether the destination supports creating subfolders
+ under the base URL. Applicable to EXTERNAL_SFTP type. Default: False.
+
+ Attributes:
+ proxy_url: The HTTPS URL of the proxy used for upload and download.
+ Applicable to PROXY type.
+ secret_key: The encryption key used to sign all pre-signed URLs used to
+ communicate with the proxy. Applicable to PROXY type.
+ benefactor_id: An Entity ID (such as a Project ID). When set, any user with
+ the 'create' permission on the given benefactorId will be allowed to
+ create ProxyFileHandle using its storage location ID. Applicable to
+ PROXY type.
+
+ Attributes:
+ upload_type: (Read Only) The upload type for this storage location.
+ Automatically derived from `storage_type`.
+ etag: (Read Only) Synapse employs an Optimistic Concurrency Control (OCC)
+ scheme. The E-Tag changes every time the setting is updated.
+ created_on: (Read Only) The date this storage location setting was created.
+ created_by: (Read Only) The ID of the user that created this storage
+ location setting.
+
+ Example: Creating an external S3 storage location
+ Create a storage location backed by your own S3 bucket:
+
+ from synapseclient.models import StorageLocation, StorageLocationType
+
+ import synapseclient
+ synapseclient.login()
+
+ storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket="my-external-synapse-bucket",
+ base_key="path/within/bucket",
+ ).store()
+
+ print(f"Storage location ID: {storage.storage_location_id}")
+
+ Example: Creating an STS-enabled S3 storage location with a folder
+ Use the convenience classmethod to create a folder with STS-enabled
+ storage:
+
+ from synapseclient.models import StorageLocation
+
+ import synapseclient
+ synapseclient.login()
+
+ folder, storage = StorageLocation.setup_s3(
+ folder_name="my-sts-folder",
+ parent="syn123",
+ bucket_name="my-external-synapse-bucket",
+ base_key="path/within/bucket",
+ sts_enabled=True,
+ )
+ print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}")
+
+ Example: Creating a Google Cloud storage location
+ Create a storage location backed by your own GCS bucket:
+
+ from synapseclient.models import StorageLocation, StorageLocationType
+
+ import synapseclient
+ synapseclient.login()
+
+ storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_GOOGLE_CLOUD,
+ bucket="my-gcs-bucket",
+ base_key="path/within/bucket",
+ ).store()
+ """
+
+ # Core fields - present on all storage locations
+ storage_location_id: Optional[int] = None
+ """(Read Only) The unique ID for this storage location, assigned by the server
+ on creation."""
+
+ storage_type: Optional[StorageLocationType] = None
+ """The type of storage location. Required when creating a new storage location
+ via `store()`. Determines the `concreteType` sent to the Synapse REST API."""
+
+ banner: Optional[str] = None
+ """The banner text to display to a user every time a file is uploaded."""
+
+ description: Optional[str] = None
+ """A description of the storage location. This description is shown when a user
+ has to choose which upload destination to use."""
+
+ # S3/GCS specific fields
+ bucket: Optional[str] = None
+ """The name of the S3 or Google Cloud Storage bucket. Applicable to SYNAPSE_S3,
+ EXTERNAL_S3, EXTERNAL_GOOGLE_CLOUD, and EXTERNAL_OBJECT_STORE types."""
+
+ base_key: Optional[str] = None
+ """The optional base key (prefix/folder) within the bucket. Applicable to
+ SYNAPSE_S3, EXTERNAL_S3, and EXTERNAL_GOOGLE_CLOUD types."""
+
+ sts_enabled: Optional[bool] = None
+ """Whether STS (AWS Security Token Service) is enabled on this storage location.
+ Applicable to SYNAPSE_S3 and EXTERNAL_S3 types."""
+
+ endpoint_url: Optional[str] = None
+ """The endpoint URL of the S3 service. Applicable to EXTERNAL_S3
+ (default: https://s3.amazonaws.com) and EXTERNAL_OBJECT_STORE types."""
+
+ # SFTP specific fields
+ url: Optional[str] = None
+ """The base URL for uploading to the external destination. Applicable to
+ EXTERNAL_SFTP type."""
+
+ supports_subfolders: Optional[bool] = None
+ """Whether the destination supports creating subfolders under the base url.
+ Applicable to EXTERNAL_SFTP type. Default: False."""
+
+ # Proxy specific fields
+ proxy_url: Optional[str] = None
+ """The HTTPS URL of the proxy used for upload and download. Applicable to
+ PROXY type."""
+
+ secret_key: Optional[str] = None
+ """The encryption key used to sign all pre-signed URLs used to communicate
+ with the proxy. Applicable to PROXY type."""
+
+ benefactor_id: Optional[str] = None
+ """An Entity ID (such as a Project ID). When set, any user with the 'create'
+ permission on the given benefactorId will be allowed to create ProxyFileHandle
+ using its storage location ID. Applicable to PROXY type."""
+
+ # Read-only fields
+ upload_type: Optional[UploadType] = field(default=None, repr=False, compare=False)
+ """(Read Only) The upload type for this storage location. Automatically derived
+ from `storage_type`."""
+
+ etag: Optional[str] = field(default=None, compare=False)
+ """(Read Only) Synapse employs an Optimistic Concurrency Control (OCC) scheme.
+ The E-Tag changes every time the setting is updated."""
+
+ created_on: Optional[str] = field(default=None, compare=False)
+ """(Read Only) The date this storage location setting was created."""
+
+ created_by: Optional[int] = field(default=None, compare=False)
+ """(Read Only) The ID of the user that created this storage location setting."""
+
+ def fill_from_dict(self, synapse_response: Dict[str, Any]) -> "StorageLocation":
+ """Converts a response from the REST API into this dataclass.
+
+ Arguments:
+ synapse_response: The response from the REST API.
+
+ Returns:
+ The StorageLocation object.
+ """
+ self.storage_location_id = synapse_response.get("storageLocationId", None)
+ self.banner = synapse_response.get("banner", None)
+ self.description = synapse_response.get("description", None)
+ self.etag = synapse_response.get("etag", None)
+ self.created_on = synapse_response.get("createdOn", None)
+ self.created_by = synapse_response.get("createdBy", None)
+
+ # Parse upload type
+ upload_type_str = synapse_response.get("uploadType", None)
+ if upload_type_str:
+ try:
+ self.upload_type = UploadType(upload_type_str)
+ except ValueError:
+ self.upload_type = None
+
+ # Parse storage type from concreteType
+ concrete_type = synapse_response.get("concreteType", "")
+ if concrete_type:
+ # Extract the suffix after the last dot; split(".")[-1] returns the
+ # whole string when there is no dot, so bare suffixes still resolve
+ type_suffix = concrete_type.split(".")[-1]
+ if type_suffix in _CONCRETE_TYPE_TO_STORAGE_TYPE:
+ self.storage_type = _CONCRETE_TYPE_TO_STORAGE_TYPE[type_suffix]
+
+ # S3/GCS fields
+ self.bucket = synapse_response.get("bucket", None)
+ self.base_key = synapse_response.get("baseKey", None)
+ self.sts_enabled = synapse_response.get("stsEnabled", None)
+ self.endpoint_url = synapse_response.get("endpointUrl", None)
+
+ # SFTP fields
+ self.url = synapse_response.get("url", None)
+ self.supports_subfolders = synapse_response.get("supportsSubfolders", None)
+
+ # Proxy fields
+ self.proxy_url = synapse_response.get("proxyUrl", None)
+ self.secret_key = synapse_response.get("secretKey", None)
+ self.benefactor_id = synapse_response.get("benefactorId", None)
+
+ return self
+
+ def _to_synapse_request(self) -> Dict[str, Any]:
+ """Convert this dataclass to a request body for the REST API.
+
+ Returns:
+ A dictionary suitable for the REST API.
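+
+ Example: Resulting request body
+ For an EXTERNAL_S3 location with a bucket and base key, the request
+ body produced by this method looks roughly like:
+
+ {
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+ "uploadType": "S3",
+ "bucket": "my-bucket",
+ "baseKey": "my/prefix",
+ }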
+ """
+ if not self.storage_type:
+ raise ValueError(
+ "storage_type is required when creating a storage location"
+ )
+
+ # Build the concrete type
+ concrete_type = (
+ f"org.sagebionetworks.repo.model.project.{self.storage_type.value}"
+ )
+
+ # Determine upload type
+ upload_type = self.upload_type or _STORAGE_TYPE_TO_UPLOAD_TYPE.get(
+ self.storage_type, UploadType.S3
+ )
+
+ body: Dict[str, Any] = {
+ "concreteType": concrete_type,
+ "uploadType": upload_type.value,
+ }
+
+ # Add optional common fields
+ if self.banner is not None:
+ body["banner"] = self.banner
+ if self.description is not None:
+ body["description"] = self.description
+
+ # Add type-specific fields
+ if self.storage_type in (
+ StorageLocationType.SYNAPSE_S3,
+ StorageLocationType.EXTERNAL_S3,
+ StorageLocationType.EXTERNAL_GOOGLE_CLOUD,
+ StorageLocationType.EXTERNAL_OBJECT_STORE,
+ ):
+ if self.bucket is not None:
+ body["bucket"] = self.bucket
+ if self.base_key is not None:
+ body["baseKey"] = self.base_key
+
+ if self.storage_type in (
+ StorageLocationType.SYNAPSE_S3,
+ StorageLocationType.EXTERNAL_S3,
+ ):
+ if self.sts_enabled is not None:
+ body["stsEnabled"] = self.sts_enabled
+
+ if self.storage_type in (
+ StorageLocationType.EXTERNAL_S3,
+ StorageLocationType.EXTERNAL_OBJECT_STORE,
+ ):
+ if self.endpoint_url is not None:
+ body["endpointUrl"] = self.endpoint_url
+
+ if self.storage_type == StorageLocationType.EXTERNAL_SFTP:
+ if self.url is not None:
+ body["url"] = self.url
+ if self.supports_subfolders is not None:
+ body["supportsSubfolders"] = self.supports_subfolders
+
+ if self.storage_type == StorageLocationType.PROXY:
+ if self.proxy_url is not None:
+ body["proxyUrl"] = self.proxy_url
+ if self.secret_key is not None:
+ body["secretKey"] = self.secret_key
+ if self.benefactor_id is not None:
+ body["benefactorId"] = self.benefactor_id
+
+ return body
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"StorageLocation_Store: {self.storage_type}"
+ )
+ async def store_async(
+ self,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> "StorageLocation":
+ """Create this storage location in Synapse. Storage locations are immutable;
+ this always creates a new one. If a storage location with identical properties
+ already exists for this user, the existing one is returned (idempotent).
+
+ Arguments:
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The StorageLocation object with server-assigned fields populated.
+
+ Raises:
+ ValueError: If `storage_type` is not set.
+
+ Example: Using this function
+ Create an external S3 storage location:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import StorageLocation, StorageLocationType
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ storage = await StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket="my-bucket",
+ base_key="my/prefix",
+ ).store_async()
+ print(f"Created storage location: {storage.storage_location_id}")
+
+ asyncio.run(main())
+ """
+ body = self._to_synapse_request()
+ response = await create_storage_location_setting(
+ body=body,
+ synapse_client=synapse_client,
+ )
+ self.fill_from_dict(response)
+ return self
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"StorageLocation_Get: {self.storage_location_id}"
+ )
+ async def get_async(
+ self,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> "StorageLocation":
+ """Retrieve this storage location from Synapse by its ID. Only the creator of
+ a StorageLocationSetting can retrieve it by its ID.
+
+ Arguments:
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The StorageLocation object populated with data from Synapse.
+
+ Raises:
+ ValueError: If `storage_location_id` is not set.
+
+ Example: Using this function
+ Retrieve a storage location by ID:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import StorageLocation
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ storage = await StorageLocation(storage_location_id=12345).get_async()
+ print(f"Type: {storage.storage_type}, Bucket: {storage.bucket}")
+
+ asyncio.run(main())
+ """
+ if not self.storage_location_id:
+ raise ValueError(
+ "storage_location_id is required to retrieve a storage location"
+ )
+
+ response = await get_storage_location_setting(
+ storage_location_id=self.storage_location_id,
+ synapse_client=synapse_client,
+ )
+ self.fill_from_dict(response)
+ return self
+
+ @classmethod
+ async def setup_s3_async(
+ cls,
+ *,
+ parent: Optional[str] = None,
+ folder_name: Optional[str] = None,
+ folder: Optional[Union["Folder", str]] = None,
+ bucket_name: Optional[str] = None,
+ base_key: Optional[str] = None,
+ sts_enabled: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Tuple["Folder", "StorageLocation"]:
+ """Convenience method to create a folder backed by S3 storage. This will:
+
+ 1. Create or retrieve the folder
+ 2. Create the storage location setting
+ 3. Apply the storage location to the folder via project settings
+
+ Arguments:
+ parent: The parent project or folder ID (e.g., "syn123"). Required
+ when creating a new folder via `folder_name`; not needed when an
+ existing `folder` is supplied.
+ folder_name: Name for a new folder. Either `folder_name` or `folder`
+ must be provided.
+ folder: An existing Folder object or Synapse ID. Either `folder_name`
+ or `folder` must be provided.
+ bucket_name: The S3 bucket name. If None, uses Synapse default storage.
+ base_key: The base key (prefix) within the bucket. Optional.
+ sts_enabled: Whether to enable STS credentials for this storage location.
+ Default: False.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A tuple of (Folder, StorageLocation).
+
+ Raises:
+ ValueError: If neither `folder_name` nor `folder` is provided, or if both
+ are provided.
+
+ Example: Using this function
+ Create an STS-enabled folder with external S3 storage:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import StorageLocation
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder, storage = await StorageLocation.setup_s3_async(
+ folder_name="my-sts-folder",
+ parent="syn123",
+ bucket_name="my-external-synapse-bucket",
+ base_key="path/within/bucket",
+ sts_enabled=True,
+ )
+ print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}")
+
+ asyncio.run(main())
+
+ Example: Using existing folder
+ Apply S3 storage to an existing folder:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import StorageLocation
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder, storage = await StorageLocation.setup_s3_async(
+ folder="syn456",
+ bucket_name="my-bucket",
+ )
+
+ asyncio.run(main())
+ """
+ # Import here to avoid circular imports
+ from synapseclient.models import Folder as FolderModel
+
+ # Validate parameters
+ if folder_name and folder:
+ raise ValueError(
+ "folder and folder_name are mutually exclusive, only one should be passed"
+ )
+ if not folder_name and not folder:
+ raise ValueError("Either folder or folder_name is required")
+ if folder_name and not parent:
+ raise ValueError("parent is required when creating a folder via folder_name")
+
+ # Create or get the folder
+ if folder_name:
+ target_folder = await FolderModel(
+ name=folder_name, parent_id=parent
+ ).store_async(synapse_client=synapse_client)
+ elif isinstance(folder, str):
+ target_folder = await FolderModel(id=folder).get_async(
+ synapse_client=synapse_client
+ )
+ else:
+ target_folder = folder
+
+ # Determine storage type
+ if bucket_name:
+ storage_type = StorageLocationType.EXTERNAL_S3
+ else:
+ storage_type = StorageLocationType.SYNAPSE_S3
+
+ # Create the storage location
+ storage_location = await cls(
+ storage_type=storage_type,
+ bucket=bucket_name,
+ base_key=base_key,
+ sts_enabled=sts_enabled,
+ ).store_async(synapse_client=synapse_client)
+
+ # Apply the storage location to the folder
+ await target_folder.set_storage_location_async(
+ storage_location_id=storage_location.storage_location_id,
+ synapse_client=synapse_client,
+ )
+
+ return target_folder, storage_location
diff --git a/tests/unit/synapseclient/api/unit_test_storage_location_services.py b/tests/unit/synapseclient/api/unit_test_storage_location_services.py
new file mode 100644
index 000000000..bebc80d50
--- /dev/null
+++ b/tests/unit/synapseclient/api/unit_test_storage_location_services.py
@@ -0,0 +1,215 @@
+"""Unit tests for storage_location_services utility functions."""
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+import synapseclient.api.storage_location_services as storage_location_services
+
+
+class TestCreateStorageLocationSetting:
+ """Tests for create_storage_location_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_create_storage_location_setting(self, mock_synapse):
+ """Test create_storage_location_setting creates a storage location."""
+ # GIVEN a mock client that returns a storage location
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_post_async.return_value = {
+ "storageLocationId": 12345,
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+ "uploadType": "S3",
+ "bucket": "my-bucket",
+ }
+
+ # WHEN I call create_storage_location_setting
+ body = {
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+ "uploadType": "S3",
+ "bucket": "my-bucket",
+ }
+ result = await storage_location_services.create_storage_location_setting(
+ body=body,
+ synapse_client=None,
+ )
+
+ # THEN I expect the storage location to be returned
+ assert result["storageLocationId"] == 12345
+ assert result["bucket"] == "my-bucket"
+ mock_client.rest_post_async.assert_awaited_once()
+
+
+class TestGetStorageLocationSetting:
+ """Tests for get_storage_location_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_get_storage_location_setting(self, mock_synapse):
+ """Test get_storage_location_setting retrieves a storage location."""
+ # GIVEN a mock client that returns a storage location
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_get_async.return_value = {
+ "storageLocationId": 12345,
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+ "uploadType": "S3",
+ "bucket": "my-bucket",
+ }
+
+ # WHEN I call get_storage_location_setting
+ result = await storage_location_services.get_storage_location_setting(
+ storage_location_id=12345,
+ synapse_client=None,
+ )
+
+ # THEN I expect the storage location to be returned
+ assert result["storageLocationId"] == 12345
+ assert result["bucket"] == "my-bucket"
+ mock_client.rest_get_async.assert_awaited_once_with(
+ uri="/storageLocation/12345",
+ )
+
+
+class TestGetProjectSetting:
+ """Tests for get_project_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_get_project_setting_exists(self, mock_synapse):
+ """Test get_project_setting when setting exists."""
+ # GIVEN a mock client that returns a project setting
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_get_async.return_value = {
+ "id": "setting123",
+ "projectId": "syn456",
+ "settingsType": "upload",
+ "locations": [12345],
+ }
+
+ # WHEN I call get_project_setting
+ result = await storage_location_services.get_project_setting(
+ project_id="syn456",
+ setting_type="upload",
+ synapse_client=None,
+ )
+
+ # THEN I expect the project setting to be returned
+ assert result["id"] == "setting123"
+ assert result["locations"] == [12345]
+ mock_client.rest_get_async.assert_awaited_once_with(
+ uri="/projectSettings/syn456/type/upload",
+ )
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_get_project_setting_not_exists(self, mock_synapse):
+ """Test get_project_setting when setting does not exist."""
+ # GIVEN a mock client that returns empty response
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_get_async.return_value = ""
+
+ # WHEN I call get_project_setting
+ result = await storage_location_services.get_project_setting(
+ project_id="syn456",
+ setting_type="upload",
+ synapse_client=None,
+ )
+
+ # THEN I expect None to be returned
+ assert result is None
+
+
+class TestCreateProjectSetting:
+ """Tests for create_project_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_create_project_setting(self, mock_synapse):
+ """Test create_project_setting creates a project setting."""
+ # GIVEN a mock client that returns a project setting
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_post_async.return_value = {
+ "id": "setting123",
+ "projectId": "syn456",
+ "settingsType": "upload",
+ "locations": [12345],
+ }
+
+ # WHEN I call create_project_setting
+ body = {
+ "concreteType": "org.sagebionetworks.repo.model.project.UploadDestinationListSetting",
+ "settingsType": "upload",
+ "locations": [12345],
+ "projectId": "syn456",
+ }
+ result = await storage_location_services.create_project_setting(
+ body=body,
+ synapse_client=None,
+ )
+
+ # THEN I expect the project setting to be returned
+ assert result["id"] == "setting123"
+ mock_client.rest_post_async.assert_awaited_once()
+
+
+class TestUpdateProjectSetting:
+ """Tests for update_project_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_update_project_setting(self, mock_synapse):
+ """Test update_project_setting updates a project setting."""
+ # GIVEN a mock client that returns an updated project setting
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_put_async.return_value = {
+ "id": "setting123",
+ "projectId": "syn456",
+ "settingsType": "upload",
+ "locations": [12345, 67890],
+ }
+
+ # WHEN I call update_project_setting
+ body = {
+ "id": "setting123",
+ "projectId": "syn456",
+ "settingsType": "upload",
+ "locations": [12345, 67890],
+ }
+ result = await storage_location_services.update_project_setting(
+ body=body,
+ synapse_client=None,
+ )
+
+ # THEN I expect the updated project setting to be returned
+ assert result["locations"] == [12345, 67890]
+ mock_client.rest_put_async.assert_awaited_once()
+
+
+class TestDeleteProjectSetting:
+ """Tests for delete_project_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_delete_project_setting(self, mock_synapse):
+ """Test delete_project_setting deletes a project setting."""
+ # GIVEN a mock client
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_delete_async.return_value = None
+
+ # WHEN I call delete_project_setting
+ await storage_location_services.delete_project_setting(
+ setting_id="setting123",
+ synapse_client=None,
+ )
+
+ # THEN I expect the delete to be called
+ mock_client.rest_delete_async.assert_awaited_once_with(
+ uri="/projectSettings/setting123",
+ )
diff --git a/tests/unit/synapseclient/models/unit_test_manifest.py b/tests/unit/synapseclient/models/unit_test_manifest.py
new file mode 100644
index 000000000..4c65ac7c3
--- /dev/null
+++ b/tests/unit/synapseclient/models/unit_test_manifest.py
@@ -0,0 +1,499 @@
+"""Unit tests for the synapseclient.models.mixins.manifest module."""
+
+import datetime
+import os
+import tempfile
+
+import pytest
+
+from synapseclient.models.mixins.manifest import (
+ DEFAULT_GENERATED_MANIFEST_KEYS,
+ MANIFEST_FILENAME,
+ _convert_manifest_data_items_to_string_list,
+ _convert_manifest_data_row_to_dict,
+ _extract_entity_metadata_for_file,
+ _get_entity_provenance_dict_for_file,
+ _manifest_filename,
+ _parse_manifest_value,
+ _validate_manifest_required_fields,
+ _write_manifest_data,
+)
+
+
+class TestManifestConstants:
+ """Tests for manifest constants."""
+
+ def test_manifest_filename_constant(self):
+ """Test the MANIFEST_FILENAME constant."""
+ assert MANIFEST_FILENAME == "SYNAPSE_METADATA_MANIFEST.tsv"
+
+ def test_default_manifest_keys(self):
+ """Test the DEFAULT_GENERATED_MANIFEST_KEYS constant."""
+ expected_keys = [
+ "path",
+ "parent",
+ "name",
+ "id",
+ "synapseStore",
+ "contentType",
+ "used",
+ "executed",
+ "activityName",
+ "activityDescription",
+ ]
+ assert DEFAULT_GENERATED_MANIFEST_KEYS == expected_keys
+
+
+class TestManifestFilename:
+ """Tests for _manifest_filename function."""
+
+ def test_manifest_filename(self):
+ """Test generating manifest filename."""
+ # GIVEN a path
+ path = "/path/to/directory"
+
+ # WHEN we generate the manifest filename
+ result = _manifest_filename(path)
+
+ # THEN it should be the path joined with MANIFEST_FILENAME
+ assert result == os.path.join(path, MANIFEST_FILENAME)
+
+
+class TestConvertManifestDataItemsToStringList:
+ """Tests for _convert_manifest_data_items_to_string_list function."""
+
+ def test_single_string(self):
+ """Test converting a single string."""
+ # GIVEN a list with a single string
+ items = ["hello"]
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN it should return the string directly
+ assert result == "hello"
+
+ def test_multiple_strings(self):
+ """Test converting multiple strings."""
+ # GIVEN a list with multiple strings
+ items = ["a", "b", "c"]
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN it should return a bracketed list
+ assert result == "[a,b,c]"
+
+ def test_string_with_comma(self):
+ """Test converting a string with comma."""
+ # GIVEN a single item with comma (no quotes needed for single item)
+ items = ["hello,world"]
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN it should return the string directly
+ assert result == "hello,world"
+
+ def test_multiple_strings_with_comma(self):
+ """Test converting multiple strings where one has a comma."""
+ # GIVEN multiple strings where one contains commas
+ items = ["string,with,commas", "string without commas"]
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN the comma-containing string should be quoted
+ assert result == '["string,with,commas",string without commas]'
+
+ def test_datetime(self):
+ """Test converting a datetime."""
+ # GIVEN a datetime value
+ dt = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list([dt])
+
+ # THEN it should return ISO format
+ assert result == "2020-01-01T00:00:00Z"
+
+ def test_multiple_datetimes(self):
+ """Test converting multiple datetimes."""
+ # GIVEN multiple datetime values
+ dt1 = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
+ dt2 = datetime.datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list([dt1, dt2])
+
+ # THEN it should return a bracketed list of ISO dates
+ assert result == "[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]"
+
+ def test_boolean_true(self):
+ """Test converting True."""
+ # GIVEN a True value
+ items = [True]
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN it should return "True"
+ assert result == "True"
+
+ def test_boolean_false(self):
+ """Test converting False."""
+ # GIVEN a False value
+ items = [False]
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN it should return "False"
+ assert result == "False"
+
+ def test_integer(self):
+ """Test converting an integer."""
+ # GIVEN an integer value
+ items = [1]
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN it should return the string representation
+ assert result == "1"
+
+ def test_float(self):
+ """Test converting a float."""
+ # GIVEN a float value
+ items = [1.5]
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN it should return the string representation
+ assert result == "1.5"
+
+ def test_empty_list(self):
+ """Test converting an empty list."""
+ # GIVEN an empty list
+ items = []
+
+ # WHEN we convert to string
+ result = _convert_manifest_data_items_to_string_list(items)
+
+ # THEN it should return an empty string
+ assert result == ""
+
+
+class TestConvertManifestDataRowToDict:
+ """Tests for _convert_manifest_data_row_to_dict function."""
+
+ def test_simple_row(self):
+ """Test converting a simple row."""
+ # GIVEN a row with simple values
+ row = {"path": "/path/to/file", "name": "file.txt"}
+ keys = ["path", "name"]
+
+ # WHEN we convert it
+ result = _convert_manifest_data_row_to_dict(row, keys)
+
+ # THEN it should return the same values
+ assert result == {"path": "/path/to/file", "name": "file.txt"}
+
+ def test_row_with_list(self):
+ """Test converting a row with a list value."""
+ # GIVEN a row with a list value
+ row = {"annotations": ["a", "b", "c"]}
+ keys = ["annotations"]
+
+ # WHEN we convert it
+ result = _convert_manifest_data_row_to_dict(row, keys)
+
+ # THEN the list should be converted to a string
+ assert result == {"annotations": "[a,b,c]"}
+
+ def test_missing_key(self):
+ """Test converting a row with a missing key."""
+ # GIVEN a row missing a key
+ row = {"path": "/path/to/file"}
+ keys = ["path", "name"]
+
+ # WHEN we convert it
+ result = _convert_manifest_data_row_to_dict(row, keys)
+
+ # THEN the missing key should be empty string
+ assert result == {"path": "/path/to/file", "name": ""}
+
+
+class TestParseManifestValue:
+ """Tests for _parse_manifest_value function."""
+
+ def test_simple_string(self):
+ """Test parsing a simple string."""
+ assert _parse_manifest_value("hello") == "hello"
+
+ def test_list_syntax(self):
+ """Test parsing list syntax."""
+ assert _parse_manifest_value("[a,b,c]") == ["a", "b", "c"]
+
+ def test_list_with_quoted_string(self):
+ """Test parsing list with quoted string containing comma."""
+ result = _parse_manifest_value('["hello,world",other]')
+ assert result == ["hello,world", "other"]
+
+ def test_boolean_true(self):
+ """Test parsing 'true' string."""
+ assert _parse_manifest_value("true") is True
+ assert _parse_manifest_value("True") is True
+ assert _parse_manifest_value("TRUE") is True
+
+ def test_boolean_false(self):
+ """Test parsing 'false' string."""
+ assert _parse_manifest_value("false") is False
+ assert _parse_manifest_value("False") is False
+ assert _parse_manifest_value("FALSE") is False
+
+ def test_integer(self):
+ """Test parsing an integer string."""
+ assert _parse_manifest_value("123") == 123
+
+ def test_float(self):
+ """Test parsing a float string."""
+ assert _parse_manifest_value("1.5") == 1.5
+
+ def test_non_numeric_string(self):
+ """Test that non-numeric strings stay as strings."""
+ assert _parse_manifest_value("hello123") == "hello123"
+
+
+class TestWriteManifestData:
+ """Tests for _write_manifest_data function."""
+
+ def test_write_simple_manifest(self):
+ """Test writing a simple manifest file."""
+ # GIVEN simple data
+ keys = ["path", "name", "id"]
+ data = [
+ {"path": "/path/to/file1.txt", "name": "file1.txt", "id": "syn123"},
+ {"path": "/path/to/file2.txt", "name": "file2.txt", "id": "syn456"},
+ ]
+
+ # WHEN we write it to a temp file
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
+ filename = f.name
+
+ try:
+ _write_manifest_data(filename, keys, data)
+
+ # THEN the file should contain the expected content
+ with open(filename, "r") as f:
+ content = f.read()
+
+ lines = content.strip().split("\n")
+ assert len(lines) == 3 # header + 2 data rows
+ assert lines[0] == "path\tname\tid"
+ assert lines[1] == "/path/to/file1.txt\tfile1.txt\tsyn123"
+ assert lines[2] == "/path/to/file2.txt\tfile2.txt\tsyn456"
+ finally:
+ os.unlink(filename)
+
+
+class TestValidateManifestRequiredFields:
+ """Tests for _validate_manifest_required_fields function."""
+
+ def test_valid_manifest(self):
+ """Test validating a valid manifest file."""
+ # GIVEN a valid manifest file
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
+ f.write("path\tparent\n")
+ f.write(f"{f.name}\tsyn123\n")
+ filename = f.name
+
+ try:
+ # The manifest's path column references the manifest file itself, so the file it points to already exists on disk
+
+ # WHEN we validate it
+ is_valid, errors = _validate_manifest_required_fields(filename)
+
+ # THEN it should be valid
+ assert is_valid is True
+ assert errors == []
+ finally:
+ os.unlink(filename)
+
+ def test_missing_file(self):
+ """Test validating a non-existent manifest file."""
+ # WHEN we validate a non-existent file
+ is_valid, errors = _validate_manifest_required_fields("/nonexistent/file.tsv")
+
+ # THEN it should be invalid
+ assert is_valid is False
+ assert len(errors) == 1
+ assert "not found" in errors[0]
+
+ def test_missing_required_field(self):
+ """Test validating a manifest missing a required field."""
+ # GIVEN a manifest missing the 'parent' field
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
+ f.write("path\tname\n")
+ f.write("/path/to/file.txt\tfile.txt\n")
+ filename = f.name
+
+ try:
+ # WHEN we validate it
+ is_valid, errors = _validate_manifest_required_fields(filename)
+
+ # THEN it should be invalid
+ assert is_valid is False
+ assert any("parent" in e for e in errors)
+ finally:
+ os.unlink(filename)
+
+ def test_empty_path(self):
+ """Test validating a manifest with empty path."""
+ # GIVEN a manifest with empty path
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
+ f.write("path\tparent\n")
+ f.write("\tsyn123\n")
+ filename = f.name
+
+ try:
+ # WHEN we validate it
+ is_valid, errors = _validate_manifest_required_fields(filename)
+
+ # THEN it should be invalid
+ assert is_valid is False
+ assert any("'path' is empty" in e for e in errors)
+ finally:
+ os.unlink(filename)
+
+ def test_invalid_parent_id(self):
+ """Test validating a manifest with invalid parent ID."""
+ # GIVEN a manifest with invalid parent ID
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
+ f.write("path\tparent\n")
+ f.write(f"{f.name}\tinvalid_parent\n")
+ filename = f.name
+
+ try:
+ # WHEN we validate it
+ is_valid, errors = _validate_manifest_required_fields(filename)
+
+ # THEN it should be invalid
+ assert is_valid is False
+ assert any("not a valid Synapse ID" in e for e in errors)
+ finally:
+ os.unlink(filename)
+
+
+class TestExtractEntityMetadataForFile:
+ """Tests for _extract_entity_metadata_for_file function."""
+
+ def test_extract_basic_metadata(self):
+ """Test extracting basic file metadata."""
+
+ # GIVEN a mock File object
+ class MockFile:
+ def __init__(self):
+ self.parent_id = "syn123"
+ self.path = "/path/to/file.txt"
+ self.name = "file.txt"
+ self.id = "syn456"
+ self.synapse_store = True
+ self.content_type = "text/plain"
+ self.annotations = None
+ self.activity = None
+
+ file = MockFile()
+
+ # WHEN we extract metadata
+ keys, data = _extract_entity_metadata_for_file([file])
+
+ # THEN we should get the expected data
+ assert "path" in keys
+ assert "parent" in keys
+ assert "name" in keys
+ assert "id" in keys
+ assert len(data) == 1
+ assert data[0]["path"] == "/path/to/file.txt"
+ assert data[0]["parent"] == "syn123"
+ assert data[0]["name"] == "file.txt"
+ assert data[0]["id"] == "syn456"
+
+ def test_extract_with_annotations(self):
+ """Test extracting metadata with annotations."""
+
+ # GIVEN a mock File object with annotations
+ class MockFile:
+ def __init__(self):
+ self.parent_id = "syn123"
+ self.path = "/path/to/file.txt"
+ self.name = "file.txt"
+ self.id = "syn456"
+ self.synapse_store = True
+ self.content_type = "text/plain"
+ self.annotations = {"study": ["Study1"], "dataType": ["RNA-seq"]}
+ self.activity = None
+
+ file = MockFile()
+
+ # WHEN we extract metadata
+ keys, data = _extract_entity_metadata_for_file([file])
+
+ # THEN annotation keys should be included
+ assert "study" in keys
+ assert "dataType" in keys
+ assert data[0]["study"] == ["Study1"]
+ assert data[0]["dataType"] == ["RNA-seq"]
+
+
+class TestGetEntityProvenanceDictForFile:
+ """Tests for _get_entity_provenance_dict_for_file function."""
+
+ def test_no_activity(self):
+ """Test extracting provenance when there is no activity."""
+
+ # GIVEN a mock File object with no activity
+ class MockFile:
+ def __init__(self):
+ self.activity = None
+
+ file = MockFile()
+
+ # WHEN we extract provenance
+ result = _get_entity_provenance_dict_for_file(file)
+
+ # THEN we should get an empty dict
+ assert result == {}
+
+ def test_with_activity(self):
+ """Test extracting provenance when there is an activity."""
+
+ # GIVEN mock objects
+ class MockUsedEntity:
+ def format_for_manifest(self):
+ return "syn789"
+
+ class MockActivity:
+ def __init__(self):
+ self.name = "Analysis"
+ self.description = "Processing data"
+ self.used = [MockUsedEntity()]
+ self.executed = []
+
+ class MockFile:
+ def __init__(self):
+ self.activity = MockActivity()
+
+ file = MockFile()
+
+ # WHEN we extract provenance
+ result = _get_entity_provenance_dict_for_file(file)
+
+ # THEN we should get the expected dict
+ assert result["activityName"] == "Analysis"
+ assert result["activityDescription"] == "Processing data"
+ assert result["used"] == "syn789"
+ assert result["executed"] == ""
diff --git a/tests/unit/synapseclient/models/unit_test_storage_location.py b/tests/unit/synapseclient/models/unit_test_storage_location.py
new file mode 100644
index 000000000..400e28566
--- /dev/null
+++ b/tests/unit/synapseclient/models/unit_test_storage_location.py
@@ -0,0 +1,355 @@
+"""Unit tests for the synapseclient.models.StorageLocation class."""
+
+import pytest
+
+from synapseclient.models import StorageLocation, StorageLocationType, UploadType
+
+
+class TestStorageLocation:
+ """Unit tests for basic StorageLocation model functionality."""
+
+ def test_storage_location_type_enum_values(self):
+ """Test that StorageLocationType enum has correct values."""
+ assert StorageLocationType.SYNAPSE_S3.value == "S3StorageLocationSetting"
+ assert (
+ StorageLocationType.EXTERNAL_S3.value == "ExternalS3StorageLocationSetting"
+ )
+ assert (
+ StorageLocationType.EXTERNAL_GOOGLE_CLOUD.value
+ == "ExternalGoogleCloudStorageLocationSetting"
+ )
+ assert (
+ StorageLocationType.EXTERNAL_SFTP.value == "ExternalStorageLocationSetting"
+ )
+ assert (
+ StorageLocationType.EXTERNAL_OBJECT_STORE.value
+ == "ExternalObjectStorageLocationSetting"
+ )
+ assert StorageLocationType.PROXY.value == "ProxyStorageLocationSettings"
+
+ def test_upload_type_enum_values(self):
+ """Test that UploadType enum has correct values."""
+ assert UploadType.S3.value == "S3"
+ assert UploadType.GOOGLE_CLOUD_STORAGE.value == "GOOGLECLOUDSTORAGE"
+ assert UploadType.SFTP.value == "SFTP"
+ assert UploadType.HTTPS.value == "HTTPS"
+ assert UploadType.NONE.value == "NONE"
+
+ def test_to_synapse_request_external_s3(self):
+ """Test generating a request body for EXTERNAL_S3 storage location."""
+ # GIVEN an EXTERNAL_S3 storage location
+ storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket="my-bucket",
+ base_key="my/prefix",
+ sts_enabled=True,
+ banner="Upload banner",
+ description="Test storage location",
+ )
+
+ # WHEN we generate a request body
+ request_body = storage._to_synapse_request()
+
+ # THEN it should have the correct structure
+ assert request_body == {
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+ "uploadType": "S3",
+ "bucket": "my-bucket",
+ "baseKey": "my/prefix",
+ "stsEnabled": True,
+ "banner": "Upload banner",
+ "description": "Test storage location",
+ }
+
+ def test_to_synapse_request_synapse_s3(self):
+ """Test generating a request body for SYNAPSE_S3 storage location."""
+ # GIVEN a SYNAPSE_S3 storage location
+ storage = StorageLocation(
+ storage_type=StorageLocationType.SYNAPSE_S3,
+ sts_enabled=False,
+ )
+
+ # WHEN we generate a request body
+ request_body = storage._to_synapse_request()
+
+ # THEN it should have the correct structure
+ assert request_body == {
+ "concreteType": "org.sagebionetworks.repo.model.project.S3StorageLocationSetting",
+ "uploadType": "S3",
+ "stsEnabled": False,
+ }
+
+ def test_to_synapse_request_google_cloud(self):
+ """Test generating a request body for EXTERNAL_GOOGLE_CLOUD storage location."""
+ # GIVEN an EXTERNAL_GOOGLE_CLOUD storage location
+ storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_GOOGLE_CLOUD,
+ bucket="my-gcs-bucket",
+ base_key="gcs/prefix",
+ )
+
+ # WHEN we generate a request body
+ request_body = storage._to_synapse_request()
+
+ # THEN it should have the correct structure
+ assert request_body == {
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting",
+ "uploadType": "GOOGLECLOUDSTORAGE",
+ "bucket": "my-gcs-bucket",
+ "baseKey": "gcs/prefix",
+ }
+
+ def test_to_synapse_request_sftp(self):
+ """Test generating a request body for EXTERNAL_SFTP storage location."""
+ # GIVEN an EXTERNAL_SFTP storage location
+ storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_SFTP,
+ url="sftp://example.com/path",
+ supports_subfolders=True,
+ )
+
+ # WHEN we generate a request body
+ request_body = storage._to_synapse_request()
+
+ # THEN it should have the correct structure
+ assert request_body == {
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting",
+ "uploadType": "SFTP",
+ "url": "sftp://example.com/path",
+ "supportsSubfolders": True,
+ }
+
+ def test_to_synapse_request_proxy(self):
+ """Test generating a request body for PROXY storage location."""
+ # GIVEN a PROXY storage location
+ storage = StorageLocation(
+ storage_type=StorageLocationType.PROXY,
+ proxy_url="https://proxy.example.com",
+ secret_key="my-secret-key",
+ benefactor_id="syn123",
+ )
+
+ # WHEN we generate a request body
+ request_body = storage._to_synapse_request()
+
+ # THEN it should have the correct structure
+ assert request_body == {
+ "concreteType": "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings",
+ "uploadType": "HTTPS",
+ "proxyUrl": "https://proxy.example.com",
+ "secretKey": "my-secret-key",
+ "benefactorId": "syn123",
+ }
+
+ def test_to_synapse_request_external_object_store(self):
+ """Test generating a request body for EXTERNAL_OBJECT_STORE storage location."""
+ # GIVEN an EXTERNAL_OBJECT_STORE storage location
+ storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_OBJECT_STORE,
+ bucket="my-s3-like-bucket",
+ endpoint_url="https://s3.custom.com",
+ )
+
+ # WHEN we generate a request body
+ request_body = storage._to_synapse_request()
+
+ # THEN it should have the correct structure
+ assert request_body == {
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalObjectStorageLocationSetting",
+ "uploadType": "S3",
+ "bucket": "my-s3-like-bucket",
+ "endpointUrl": "https://s3.custom.com",
+ }
+
+ def test_to_synapse_request_missing_storage_type(self):
+ """Test that _to_synapse_request raises ValueError when storage_type is missing."""
+ # GIVEN a storage location without a storage_type
+ storage = StorageLocation(
+ bucket="my-bucket",
+ )
+
+ # THEN it should raise ValueError
+ with pytest.raises(ValueError, match="storage_type is required"):
+ storage._to_synapse_request()
+
+ def test_fill_from_dict_external_s3(self):
+ """Test filling from a REST API response for EXTERNAL_S3."""
+ # GIVEN a storage location
+ storage = StorageLocation()
+
+ # AND a response from the REST API
+ response = {
+ "storageLocationId": 12345,
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+ "uploadType": "S3",
+ "bucket": "my-bucket",
+ "baseKey": "my/prefix",
+ "stsEnabled": True,
+ "banner": "Upload banner",
+ "description": "Test storage location",
+ "etag": "abc123",
+ "createdOn": "2024-01-01T00:00:00.000Z",
+ "createdBy": 123456,
+ }
+
+ # WHEN we fill from the response
+ storage.fill_from_dict(response)
+
+ # THEN the storage location should be populated correctly
+ assert storage.storage_location_id == 12345
+ assert storage.storage_type == StorageLocationType.EXTERNAL_S3
+ assert storage.upload_type == UploadType.S3
+ assert storage.bucket == "my-bucket"
+ assert storage.base_key == "my/prefix"
+ assert storage.sts_enabled is True
+ assert storage.banner == "Upload banner"
+ assert storage.description == "Test storage location"
+ assert storage.etag == "abc123"
+ assert storage.created_on == "2024-01-01T00:00:00.000Z"
+ assert storage.created_by == 123456
+
+ def test_fill_from_dict_synapse_s3(self):
+ """Test filling from a REST API response for SYNAPSE_S3."""
+ # GIVEN a storage location
+ storage = StorageLocation()
+
+ # AND a response from the REST API
+ response = {
+ "storageLocationId": 1,
+ "concreteType": "org.sagebionetworks.repo.model.project.S3StorageLocationSetting",
+ "uploadType": "S3",
+ }
+
+ # WHEN we fill from the response
+ storage.fill_from_dict(response)
+
+ # THEN the storage location should be populated correctly
+ assert storage.storage_location_id == 1
+ assert storage.storage_type == StorageLocationType.SYNAPSE_S3
+
+ def test_fill_from_dict_google_cloud(self):
+ """Test filling from a REST API response for EXTERNAL_GOOGLE_CLOUD."""
+ # GIVEN a storage location
+ storage = StorageLocation()
+
+ # AND a response from the REST API
+ response = {
+ "storageLocationId": 67890,
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting",
+ "uploadType": "GOOGLECLOUDSTORAGE",
+ "bucket": "my-gcs-bucket",
+ }
+
+ # WHEN we fill from the response
+ storage.fill_from_dict(response)
+
+ # THEN the storage location should be populated correctly
+ assert storage.storage_location_id == 67890
+ assert storage.storage_type == StorageLocationType.EXTERNAL_GOOGLE_CLOUD
+ assert storage.upload_type == UploadType.GOOGLE_CLOUD_STORAGE
+ assert storage.bucket == "my-gcs-bucket"
+
+ def test_fill_from_dict_sftp(self):
+ """Test filling from a REST API response for EXTERNAL_SFTP."""
+ # GIVEN a storage location
+ storage = StorageLocation()
+
+ # AND a response from the REST API
+ response = {
+ "storageLocationId": 11111,
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting",
+ "uploadType": "SFTP",
+ "url": "sftp://example.com/path",
+ "supportsSubfolders": True,
+ }
+
+ # WHEN we fill from the response
+ storage.fill_from_dict(response)
+
+ # THEN the storage location should be populated correctly
+ assert storage.storage_location_id == 11111
+ assert storage.storage_type == StorageLocationType.EXTERNAL_SFTP
+ assert storage.upload_type == UploadType.SFTP
+ assert storage.url == "sftp://example.com/path"
+ assert storage.supports_subfolders is True
+
+ def test_fill_from_dict_proxy(self):
+ """Test filling from a REST API response for PROXY."""
+ # GIVEN a storage location
+ storage = StorageLocation()
+
+ # AND a response from the REST API
+ response = {
+ "storageLocationId": 22222,
+ "concreteType": "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings",
+ "uploadType": "HTTPS",
+ "proxyUrl": "https://proxy.example.com",
+ "secretKey": "my-secret-key",
+ "benefactorId": "syn123",
+ }
+
+ # WHEN we fill from the response
+ storage.fill_from_dict(response)
+
+ # THEN the storage location should be populated correctly
+ assert storage.storage_location_id == 22222
+ assert storage.storage_type == StorageLocationType.PROXY
+ assert storage.upload_type == UploadType.HTTPS
+ assert storage.proxy_url == "https://proxy.example.com"
+ assert storage.secret_key == "my-secret-key"
+ assert storage.benefactor_id == "syn123"
+
+
+class TestStorageLocationAsync:
+ """Async unit tests for StorageLocation model."""
+
+ @pytest.mark.asyncio
+ async def test_get_async_missing_id(self):
+ """Test that get_async raises ValueError when storage_location_id is missing."""
+ # GIVEN a storage location without an ID
+ storage = StorageLocation()
+
+ # THEN it should raise ValueError
+ with pytest.raises(ValueError, match="storage_location_id is required"):
+ await storage.get_async()
+
+ @pytest.mark.asyncio
+ async def test_store_async_missing_storage_type(self):
+ """Test that store_async raises ValueError when storage_type is missing."""
+ # GIVEN a storage location without a storage_type
+ storage = StorageLocation(bucket="my-bucket")
+
+ # THEN it should raise ValueError
+ with pytest.raises(ValueError, match="storage_type is required"):
+ await storage.store_async()
+
+
+class TestSetupS3:
+ """Tests for the setup_s3 convenience method."""
+
+ @pytest.mark.asyncio
+ async def test_setup_s3_async_requires_folder_or_folder_name(self):
+ """Test that setup_s3_async raises ValueError when neither folder nor folder_name is provided."""
+ # WHEN I call setup_s3_async without folder or folder_name
+ # THEN it should raise ValueError
+ with pytest.raises(
+ ValueError, match="Either folder or folder_name is required"
+ ):
+ await StorageLocation.setup_s3_async(parent="syn123")
+
+ @pytest.mark.asyncio
+ async def test_setup_s3_async_folder_and_folder_name_mutually_exclusive(self):
+ """Test that setup_s3_async raises ValueError when both folder and folder_name are provided."""
+ from synapseclient.models import Folder
+
+ # GIVEN both folder and folder_name
+ folder = Folder(id="syn456")
+
+ # WHEN I call setup_s3_async with both
+ # THEN it should raise ValueError
+ with pytest.raises(
+ ValueError, match="folder and folder_name are mutually exclusive"
+ ):
+ await StorageLocation.setup_s3_async(
+ parent="syn123", folder_name="test", folder=folder
+ )
From e264f1a47b704dd0bfe7b02bfcc2ff2e440c2871 Mon Sep 17 00:00:00 2001
From: BryanFauble <17128019+BryanFauble@users.noreply.github.com>
Date: Wed, 4 Feb 2026 22:31:00 +0000
Subject: [PATCH 02/31] Add download list file
---
synapseclient/models/download_list.py | 224 ++++++++++++++++++++++++++
1 file changed, 224 insertions(+)
create mode 100644 synapseclient/models/download_list.py
diff --git a/synapseclient/models/download_list.py b/synapseclient/models/download_list.py
new file mode 100644
index 000000000..e1c0eb866
--- /dev/null
+++ b/synapseclient/models/download_list.py
@@ -0,0 +1,224 @@
+"""Models for interacting with Synapse's Download List functionality.
+
+This module provides classes for generating manifest files from a user's download list
+using the Synapse Asynchronous Job service.
+
+See: https://rest-docs.synapse.org/rest/POST/download/list/manifest/async/start.html
+"""
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, Optional
+
+from typing_extensions import Self
+
+from synapseclient import Synapse
+from synapseclient.core.async_utils import async_to_sync, otel_trace_method
+from synapseclient.core.constants.concrete_types import DOWNLOAD_LIST_MANIFEST_REQUEST
+from synapseclient.core.download import download_by_file_handle
+from synapseclient.core.utils import delete_none_keys
+from synapseclient.models.mixins.asynchronous_job import AsynchronousCommunicator
+from synapseclient.models.protocols.download_list_protocol import (
+ DownloadListManifestRequestSynchronousProtocol,
+)
+from synapseclient.models.table_components import CsvTableDescriptor
+
+
+@dataclass
+@async_to_sync
+class DownloadListManifestRequest(
+ DownloadListManifestRequestSynchronousProtocol, AsynchronousCommunicator
+):
+ """
+ A request to generate a manifest file (CSV) of the current user's download list.
+
+ This class uses the Synapse Asynchronous Job service to generate a manifest file
+ containing metadata about files in the user's download list. The manifest can be
+ used to download files or for record-keeping purposes.
+
+ See: https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/download/DownloadListManifestRequest.html
+
+ Attributes:
+ csv_table_descriptor: Optional CSV formatting options for the manifest.
+ result_file_handle_id: The file handle ID of the generated manifest (populated after completion).
+
+ Example: Generate a manifest from download list
+ Generate a CSV manifest from your download list:
+
+ from synapseclient.models import DownloadListManifestRequest
+ import synapseclient
+
+ synapseclient.login()
+
+ # Create and send the request
+ request = DownloadListManifestRequest()
+ request.send_job_and_wait()
+
+ print(f"Manifest file handle: {request.result_file_handle_id}")
+
+ Example: Generate manifest with custom CSV formatting
+ Use custom separator and quote characters:
+
+ from synapseclient.models import DownloadListManifestRequest, CsvTableDescriptor
+ import synapseclient
+
+ synapseclient.login()
+
+ request = DownloadListManifestRequest(
+ csv_table_descriptor=CsvTableDescriptor(
+ separator="\t", # Tab-separated
+ is_first_line_header=True
+ )
+ )
+ request.send_job_and_wait()
+ """
+
+ concrete_type: str = field(
+ default=DOWNLOAD_LIST_MANIFEST_REQUEST, repr=False, compare=False
+ )
+ """The concrete type of this request."""
+
+ csv_table_descriptor: Optional[CsvTableDescriptor] = None
+ """Optional CSV formatting options for the manifest file."""
+
+ result_file_handle_id: Optional[str] = None
+ """The file handle ID of the generated manifest file. Populated after the job completes."""
+
+ def to_synapse_request(self) -> Dict[str, Any]:
+ """
+ Convert this request to the format expected by the Synapse REST API.
+
+ Returns:
+ A dictionary containing the request body for the Synapse API.
+ """
+ request = {
+ "concreteType": self.concrete_type,
+ }
+ if self.csv_table_descriptor:
+ request[
+ "csvTableDescriptor"
+ ] = self.csv_table_descriptor.to_synapse_request()
+ delete_none_keys(request)
+ return request
+
+ def fill_from_dict(self, synapse_response: Dict[str, Any]) -> Self:
+ """
+ Populate this object from a Synapse REST API response.
+
+ Arguments:
+ synapse_response: The response from the REST API.
+
+ Returns:
+ This object with fields populated from the response.
+ """
+ self.result_file_handle_id = synapse_response.get("resultFileHandleId", None)
+ return self
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: "DownloadListManifestRequest_send_job_and_wait"
+ )
+ async def send_job_and_wait_async(
+ self,
+ post_exchange_args: Optional[Dict[str, Any]] = None,
+ timeout: int = 120,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Self:
+ """Send the job to the Asynchronous Job service and wait for it to complete.
+
+ This method sends the manifest generation request to Synapse and waits
+ for the job to complete. After completion, the `result_file_handle_id`
+ attribute will be populated.
+
+ Arguments:
+ post_exchange_args: Additional arguments to pass to the request.
+ timeout: The number of seconds to wait for the job to complete or progress
+ before raising a SynapseTimeoutError. Defaults to 120.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ This instance with `result_file_handle_id` populated.
+
+ Raises:
+ SynapseTimeoutError: If the job does not complete within the timeout.
+ SynapseError: If the job fails.
+
+ Example: Generate a manifest
+ Generate a manifest from the download list:
+
+ from synapseclient.models import DownloadListManifestRequest
+ import synapseclient
+
+ synapseclient.login()
+
+ request = DownloadListManifestRequest()
+ request.send_job_and_wait()
+ print(f"Manifest file handle: {request.result_file_handle_id}")
+ """
+ return await super().send_job_and_wait_async(
+ post_exchange_args=post_exchange_args,
+ timeout=timeout,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: "DownloadListManifestRequest_download_manifest"
+ )
+ async def download_manifest_async(
+ self,
+ download_path: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> str:
+ """
+ Download the generated manifest file to a local path.
+
+ This method should be called after `send_job_and_wait()` has completed
+ successfully and `result_file_handle_id` is populated.
+
+ Arguments:
+ download_path: The local directory path where the manifest will be saved.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The full path to the downloaded manifest file.
+
+ Raises:
+ ValueError: If the manifest has not been generated yet (no result_file_handle_id).
+
+ Example: Download the manifest after generation
+ Generate and download a manifest:
+
+ from synapseclient.models import DownloadListManifestRequest
+ import synapseclient
+
+ synapseclient.login()
+
+ request = DownloadListManifestRequest()
+ request.send_job_and_wait()
+
+ manifest_path = request.download_manifest(download_path="/path/to/download")
+ print(f"Manifest downloaded to: {manifest_path}")
+ """
+ if not self.result_file_handle_id:
+ raise ValueError(
+ "Manifest has not been generated yet. "
+ "Call send_job_and_wait() before downloading."
+ )
+
+ # Download the file handle using the download module
+ # For download list manifests, the synapse_id parameter is set to the file handle ID
+ # because these manifests are not associated with a specific entity. The download
+ # service handles this case by using the file handle directly.
+ downloaded_path = await download_by_file_handle(
+ file_handle_id=self.result_file_handle_id,
+ synapse_id=self.result_file_handle_id,
+ entity_type="FileEntity",
+ destination=download_path,
+ synapse_client=synapse_client,
+ )
+
+ return downloaded_path
From d6111d31cdf821959f571d085400a2a864fad545 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Mon, 23 Feb 2026 09:56:50 -0800
Subject: [PATCH 03/31] update docs
---
.../storage_location_architecture.md | 305 +++++++++---------
1 file changed, 159 insertions(+), 146 deletions(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index 612ef7d21..47e668097 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -40,16 +40,17 @@ relationships, and data flows that enable flexible storage configuration.
## Overview
-The StorageLocation system enables Synapse users to configure where uploaded files
-are stored. By default, Synapse stores files in its internal S3 storage, but
+The StorageLocation setting enables Synapse users to configure where files uploaded through Synapse are stored and later downloaded from.
+By default, Synapse stores files in its internal S3 storage, but
users can configure projects and folders to use external storage backends such as
-AWS S3 buckets, Google Cloud Storage, SFTP servers, or proxy servers.
+AWS S3 buckets, Google Cloud Storage, SFTP servers, or a local file server using a proxy server.
-!!! info "Key Concepts"
- - **StorageLocation**: A configuration describing where files are stored
- - **Project Setting**: Links a storage location to a Project or Folder
- - **STS Credentials**: Temporary AWS credentials for direct S3 access
- - **Storage Migration**: Moving files between storage locations
+### Key Concepts
+- [**StorageLocationSetting**](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/StorageLocationSetting.html): A configuration specifying file storage and download locations.
+- [**ProjectSetting**](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/ProjectSetting.html): A configuration applied to projects that allows customization of file storage locations.
+- [**UploadType**](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/file/UploadType.html): An enumeration that defines the types of file upload destinations that Synapse supports.
+- **STS Credentials**: Temporary AWS credentials for direct S3 access.
+- **StorageLocation Migration**: The process of transferring the files associated with Synapse entities between storage locations while preserving the entities’ structure and identifiers.
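+
+In practice the first three concepts compose: you create a StorageLocationSetting, then attach it to a project or folder through an `upload` ProjectSetting. Below is a minimal sketch using the models in this client; the Synapse ID and bucket name are placeholders:
+
+```python
+import synapseclient
+from synapseclient.models import Folder, StorageLocation, StorageLocationType
+
+synapseclient.login()
+
+# Create (or re-use, since creation is idempotent per user) an external S3 location
+storage = StorageLocation(
+    storage_type=StorageLocationType.EXTERNAL_S3,
+    bucket="my-bucket",
+    base_key="my/prefix",
+).store()
+
+# Attach it to a folder via the folder's "upload" project setting
+folder = Folder(id="syn123").get()
+folder.set_storage_location(storage_location_id=storage.storage_location_id)
+```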
---
@@ -113,7 +114,7 @@ classDiagram
NONE
}
- class StorageLocationConfigurable {
+ class StorageLocation {
<<mixin>>
+set_storage_location(storage_location_id)
+get_project_setting(setting_type)
@@ -137,20 +138,17 @@ classDiagram
StorageLocation --> StorageLocationType : storage_type
StorageLocation --> UploadType : upload_type
- StorageLocationConfigurable <|-- Project : implements
- StorageLocationConfigurable <|-- Folder : implements
+ StorageLocation <|-- Project : implements
+ StorageLocation <|-- Folder : implements
```
### Key Components
-
-| Component | Purpose |
-|-----------|---------|
-| [StorageLocation][synapseclient.models.StorageLocation] | Data model representing a storage location setting in Synapse |
-| [StorageLocationType][synapseclient.models.StorageLocationType] | Enumeration defining the supported storage backend types |
-| [UploadType][synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type |
-| [StorageLocationConfigurable][synapseclient.models.mixins.StorageLocationConfigurable] | Mixin providing storage management methods to entities |
+| Component | Purpose |
+|-----------|---------|
+| [StorageLocation][synapseclient.models.StorageLocation] | The model representing a storage location setting in Synapse |
+| [StorageLocationType][synapseclient.models.StorageLocationType] | Enumeration defining the supported storage backend types |
+| [UploadType][synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type |
+| [StorageLocation][synapseclient.models.mixins.StorageLocation] | Mixin providing storage management methods to entities |
---
@@ -159,7 +157,7 @@ classDiagram
## Storage Type Mapping
Each `StorageLocationType` maps to a specific REST API `concreteType` and has a
-default `UploadType`. This mapping is bidirectional, allowing the system to parse
+default `UploadType`. This mapping allows the system to parse
responses from the API and construct requests.
```mermaid
@@ -199,23 +197,43 @@ flowchart LR
-### Type-Specific Attributes
+### Storage Type Attributes
Different storage types support different configuration attributes:
-| Attribute | SYNAPSE | EXT_S3 | EXT_GCS | EXT_SFTP | EXT_OBJ | PROXY |
-|-----------|:-------:|:------:|:-------:|:--------:|:-------:|:-----:|
-| `bucket` | ✓ | ✓ | ✓ | | ✓ | |
-| `base_key` | ✓ | ✓ | ✓ | | | |
-| `sts_enabled` | ✓ | ✓ | | | | |
-| `endpoint_url` | | ✓ | | | ✓ | |
-| `url` | | | | ✓ | | |
-| `supports_subfolders` | | | | ✓ | | |
-| `proxy_url` | | | | | | ✓ |
-| `secret_key` | | | | | | ✓ |
-| `benefactor_id` | | | | | | ✓ |
+| Attribute | Type | S3StorageLocationSetting | ExternalS3StorageLocationSetting | ExternalObjectStorageLocationSetting | ExternalStorageLocationSetting | ExternalGoogleCloudStorageLocationSetting | ProxyStorageLocationSettings |
+|-----------|------|--------------------------|----------------------------------|--------------------------------------|--------------------------------|-------------------------------------------|------------------------------|
+| **Common (all types)** | | | | | | | |
+| `concreteType` | string (enum) | ✓ (required) | ✓ (required) | ✓ (required) | ✓ (required) | ✓ (required) | ✓ (required) |
+| `storageLocationId` | integer (int32) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `uploadType` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `banner` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `description` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `etag` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `createdOn` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `createdBy` | integer (int32) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| **Type-specific** | | | | | | | |
+| `baseKey` | string | ✓ | ✓ | — | — | ✓ | — |
+| `stsEnabled` | boolean | ✓ | ✓ | — | — | — | — |
+| `bucket` | string | — | ✓ (required) | ✓ (required) | — | ✓ (required) | — |
+| `endpointUrl` | string | — | ✓ | ✓ (required) | — | — | — |
+| `url` | string | — | — | — | ✓ | — | — |
+| `supportsSubfolders` | boolean | — | — | — | ✓ | — | — |
+| `proxyUrl` | string | — | — | — | — | — | ✓ |
+| `secretKey` | string | — | — | — | — | — | ✓ |
+| `benefactorId` | string | — | — | — | — | — | ✓ |
+
+### Summary by Type
+
+| Setting type | Description | Type-specific attributes |
+|--------------|-------------|---------------------------|
+| **S3StorageLocationSetting** | Default Synapse storage on Amazon S3. | `baseKey`, `stsEnabled` |
+| **ExternalS3StorageLocationSetting** | External S3 bucket connected with Synapse (Synapse-accessed). | `bucket` (required), `baseKey`, `stsEnabled`, `endpointUrl` |
+| **ExternalObjectStorageLocationSetting** | S3-compatible object storage **not** accessed by Synapse. | `bucket` (required), `endpointUrl` (required) |
+| **ExternalStorageLocationSetting** | SFTP or HTTPS upload destination. | `url`, `supportsSubfolders` |
+| **ExternalGoogleCloudStorageLocationSetting** | External Google Cloud Storage bucket connected with Synapse. | `bucket` (required), `baseKey` |
+| **ProxyStorageLocationSettings** | HTTPS proxy for all upload/download operations. | `proxyUrl`, `secretKey`, `benefactorId` |
-**Legend:** SYNAPSE = SYNAPSE_S3, EXT_S3 = EXTERNAL_S3, EXT_GCS = EXTERNAL_GOOGLE_CLOUD, EXT_SFTP = EXTERNAL_SFTP, EXT_OBJ = EXTERNAL_OBJECT_STORE
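+
+The camelCase REST attributes above map to snake_case fields on the `StorageLocation` model. A short sketch mirroring the shapes exercised by the unit tests (all values are placeholders):
+
+```python
+from synapseclient.models import StorageLocation, StorageLocationType
+
+# SFTP destination: only `url` and `supports_subfolders` apply
+sftp = StorageLocation(
+    storage_type=StorageLocationType.EXTERNAL_SFTP,
+    url="sftp://example.com/path",
+    supports_subfolders=True,
+)
+
+# S3-compatible store that Synapse never accesses: `bucket` and `endpoint_url` are required
+object_store = StorageLocation(
+    storage_type=StorageLocationType.EXTERNAL_OBJECT_STORE,
+    bucket="my-s3-like-bucket",
+    endpoint_url="https://s3.custom.com",
+)
+```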
@@ -225,7 +243,9 @@ Use this decision tree to select the appropriate storage type for your use case:
```mermaid
flowchart TB
- Start([Need custom storage?]) --> Q1{Want Synapse to<br>manage storage?}
+ Start{Need custom storage?}
+ Start -->|No| DEFAULT[Use default Synapse storage]
+ Start -->|Yes| Q1{Want Synapse to<br>manage storage?}
Q1 -->|Yes| SYNAPSE_S3[Use SYNAPSE_S3]
Q1 -->|No| Q2{What storage<br>backend?}
@@ -234,7 +254,7 @@ flowchart TB
Q2 -->|Google Cloud| EXTERNAL_GOOGLE_CLOUD[Use EXTERNAL_GOOGLE_CLOUD]
Q2 -->|SFTP Server| EXTERNAL_SFTP[Use EXTERNAL_SFTP]
Q2 -->|Proxy Server| PROXY[Use PROXY]
- Q2 -->|S3-compatible<br>non-AWS| EXTERNAL_OBJECT_STORE[Use EXTERNAL_OBJECT_STORE]
+ Q2 -->|S3-compatible object store| EXTERNAL_OBJECT_STORE[Use EXTERNAL_OBJECT_STORE]
Q3 -->|Yes| Q4{Need STS<br>credentials?}
Q3 -->|No| EXTERNAL_OBJECT_STORE
@@ -249,6 +269,7 @@ flowchart TB
EXTERNAL_SFTP --> Benefits4[Benefits:<br>- Legacy systems<br>- Synapse never touches data]
EXTERNAL_OBJECT_STORE --> Benefits5[Benefits:<br>- OpenStack, MinIO, etc.<br>- Synapse never touches data]
PROXY --> Benefits6[Benefits:<br>- Custom access control<br>- Data transformation]
+ DEFAULT --> Benefits0[Benefits:<br>- No configuration needed<br>- Synapse-managed S3]
```
---
@@ -258,27 +279,14 @@ flowchart TB
## Entity Inheritance Hierarchy
Projects and Folders inherit storage configuration capabilities through the
-`StorageLocationConfigurable` mixin. This pattern allows consistent storage
+`StorageLocation` mixin. This pattern allows consistent storage
management across container entities.
```mermaid
classDiagram
direction TB
- class AccessControllable {
- <<mixin>>
- +get_permissions()
- +set_permissions()
- +delete_permissions()
- }
-
- class StorableContainer {
- <<mixin>>
- +sync()
- +get_children()
- }
-
- class StorageLocationConfigurable {
+ class StorageLocation {
<>
+set_storage_location()
+get_project_setting()
@@ -302,18 +310,13 @@ classDiagram
+str etag
}
- AccessControllable <|-- Project
- AccessControllable <|-- Folder
- StorableContainer <|-- Project
- StorableContainer <|-- Folder
- StorageLocationConfigurable <|-- Project
- StorageLocationConfigurable <|-- Folder
+ StorageLocation <|-- Project
+ StorageLocation <|-- Folder
```
-!!! tip "Mixin Pattern"
- The mixin pattern allows `Project` and `Folder` to share storage location
- functionality without code duplication. Both classes inherit the same
- methods from `StorageLocationConfigurable`.
+The mixin pattern allows `Project` and `Folder` to share storage location
+functionality without code duplication. Both classes inherit the same
+methods from `StorageLocation`.
---
@@ -330,7 +333,7 @@ This section contains sequence diagrams for key operations.
### Store Operation
-The `store()` method creates a new storage location in Synapse.
+The `store()` method creates a new storage location in Synapse. Creating a storage location is idempotent per user. Repeating a creation request with the same properties will return the previously created storage location rather than creating a new one.
```mermaid
sequenceDiagram
@@ -376,85 +379,78 @@ sequenceDiagram
deactivate StorageLocation
```
-!!! note "Idempotent Behavior"
- Storage locations are immutable once created. If you call `store()` with
- identical parameters, Synapse returns the existing storage location rather
- than creating a duplicate.
-
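+
+A sketch of this idempotent behavior, assuming an authenticated client:
+
+```python
+from synapseclient.models import StorageLocation, StorageLocationType
+
+first = StorageLocation(
+    storage_type=StorageLocationType.SYNAPSE_S3, sts_enabled=True
+).store()
+repeat = StorageLocation(
+    storage_type=StorageLocationType.SYNAPSE_S3, sts_enabled=True
+).store()
+
+# Same owner, same properties -> Synapse returns the existing location
+assert first.storage_location_id == repeat.storage_location_id
+```
+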
### Setup S3 Convenience Flow
The `setup_s3()` class method creates a folder with S3 storage in a single call.
+```mermaid
+sequenceDiagram
+ participant User
+ participant setup_s3 as StorageLocation.setup_s3()
+ participant StorageLocation
+ participant Folder
+ participant Mixin as StorageLocation mixin
+ participant API as storage_location_services
+ participant Synapse as Synapse REST API
-??? example "Click to expand sequence diagram"
- ```mermaid
- sequenceDiagram
- participant User
- participant setup_s3 as StorageLocation.setup_s3()
- participant StorageLocation
- participant Folder
- participant Mixin as StorageLocationConfigurable
- participant API as storage_location_services
- participant Synapse as Synapse REST API
-
- User->>setup_s3: setup_s3(parent, folder_name, bucket_name)
- activate setup_s3
-
- Note over setup_s3: Validate: folder_name XOR folder
-
- alt folder_name provided
- setup_s3->>Folder: Folder(name, parent_id).store()
- activate Folder
- Folder->>Synapse: POST /entity
- Synapse-->>Folder: Folder response
- Folder-->>setup_s3: New Folder
- deactivate Folder
- else folder ID provided
- setup_s3->>Folder: Folder(id).get()
- activate Folder
- Folder->>Synapse: GET /entity/{id}
- Synapse-->>Folder: Folder response
- Folder-->>setup_s3: Existing Folder
- deactivate Folder
- end
+ User->>setup_s3: setup_s3(parent, folder_name, bucket_name)
+ activate setup_s3
+
+ Note over setup_s3: Validate: folder_name XOR folder
+
+ alt folder_name provided
+ setup_s3->>Folder: Folder(name, parent_id).store()
+ activate Folder
+ Folder->>Synapse: POST /entity
+ Synapse-->>Folder: Folder response
+ Folder-->>setup_s3: New Folder
+ deactivate Folder
+ else folder ID provided
+ setup_s3->>Folder: Folder(id).get()
+ activate Folder
+ Folder->>Synapse: GET /entity/{id}
+ Synapse-->>Folder: Folder response
+ Folder-->>setup_s3: Existing Folder
+ deactivate Folder
+ end
- alt bucket_name provided
- Note over setup_s3: storage_type = EXTERNAL_S3
- else bucket_name is None
- Note over setup_s3: storage_type = SYNAPSE_S3
- end
+ alt bucket_name provided
+ Note over setup_s3: storage_type = EXTERNAL_S3
+ else bucket_name is None
+ Note over setup_s3: storage_type = SYNAPSE_S3
+ end
- setup_s3->>StorageLocation: StorageLocation(...).store()
- activate StorageLocation
- StorageLocation->>Synapse: POST /storageLocation
- Synapse-->>StorageLocation: StorageLocation response
- StorageLocation-->>setup_s3: StorageLocation
- deactivate StorageLocation
-
- setup_s3->>Mixin: folder.set_storage_location(storage_location_id)
- activate Mixin
-
- Mixin->>API: get_project_setting(project_id, "upload")
- API->>Synapse: GET /projectSettings/{id}/type/upload
- Synapse-->>API: Setting or empty
-
- alt Setting exists
- API-->>Mixin: Existing setting
- Mixin->>API: update_project_setting(body)
- API->>Synapse: PUT /projectSettings
- else No setting
- Mixin->>API: create_project_setting(body)
- API->>Synapse: POST /projectSettings
- end
+ setup_s3->>StorageLocation: StorageLocation(...).store()
+ activate StorageLocation
+ StorageLocation->>Synapse: POST /storageLocation
+ Synapse-->>StorageLocation: StorageLocation response
+ StorageLocation-->>setup_s3: StorageLocation
+ deactivate StorageLocation
+
+ setup_s3->>Mixin: folder.set_storage_location(storage_location_id)
+ activate Mixin
- Synapse-->>API: Project setting response
- API-->>Mixin: Updated setting
- deactivate Mixin
+ Mixin->>API: get_project_setting(project_id, "upload")
+ API->>Synapse: GET /projectSettings/{id}/type/upload
+ Synapse-->>API: Setting or empty
+
+ alt Setting exists
+ API-->>Mixin: Existing setting
+ Mixin->>API: update_project_setting(body)
+ API->>Synapse: PUT /projectSettings
+ else No setting
+ Mixin->>API: create_project_setting(body)
+ API->>Synapse: POST /projectSettings
+ end
- setup_s3-->>User: (Folder, StorageLocation)
- deactivate setup_s3
- ```
+ Synapse-->>API: Project setting response
+ API-->>Mixin: Updated setting
+ deactivate Mixin
+
+ setup_s3-->>User: (Folder, StorageLocation)
+ deactivate setup_s3
+```
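+
+A minimal usage sketch of this flow (the parent ID and bucket name are placeholders; exactly one of `folder` or `folder_name` may be supplied, and omitting `bucket_name` selects SYNAPSE_S3 storage):
+
+```python
+from synapseclient.models import StorageLocation
+
+# Create the folder, create the storage location, and link them in one call
+folder, storage = StorageLocation.setup_s3(
+    parent="syn123",
+    folder_name="external-data",
+    bucket_name="my-bucket",
+)
+```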
@@ -462,15 +458,24 @@ The `setup_s3()` class method creates a folder with S3 storage in a single call.
STS (AWS Security Token Service) enables direct S3 access using temporary credentials.
+When a Synapse client is constructed (`Synapse.__init__`), it creates an in-memory token cache:
+
+- `self._sts_token_store = sts_transfer.StsTokenStore()` (see `synapseclient/client.py`)
+
+The store caches STS tokens per entity and permission so repeated access to the same storage location can reuse credentials without a round-trip to the REST API.
+
```mermaid
sequenceDiagram
participant User
participant Entity as Folder/Project
- participant Mixin as StorageLocationConfigurable
+ participant Mixin as StorageLocation
participant STS as sts_transfer module
participant Client as Synapse Client
+ participant TokenStore as _sts_token_store (StsTokenStore)
participant Synapse as Synapse REST API
+ Note over Client,TokenStore: Client.__init__ creates self._sts_token_store = sts_transfer.StsTokenStore()
+
User->>Entity: get_sts_storage_token(permission, output_format)
activate Entity
@@ -483,11 +488,23 @@ sequenceDiagram
Mixin->>STS: sts_transfer.get_sts_credentials()
activate STS
- STS->>Synapse: GET /entity/{id}/sts?permission={permission}
- activate Synapse
-
- Synapse-->>STS: STS credentials response
- deactivate Synapse
+ STS->>Client: syn._sts_token_store.get_token(...)
+ activate Client
+ Client->>TokenStore: get_token(entity_id, permission, min_remaining_life)
+ activate TokenStore
+
+ alt token cached and not expired
+ TokenStore-->>Client: Cached token
+ else cache miss or token expired
+ TokenStore->>Synapse: GET /entity/{id}/sts?permission={permission}
+ activate Synapse
+ Synapse-->>TokenStore: STS credentials response
+ deactivate Synapse
+ TokenStore-->>Client: New token (cached)
+ end
+ deactivate TokenStore
+ Client-->>STS: Token
+ deactivate Client
Note over STS: Parse credentials
@@ -500,9 +517,6 @@ sequenceDiagram
else output_format == "shell" / "bash"
Note over STS: Format as export commands
STS-->>Mixin: Shell export commands
- else output_format == "dictionary"
- Note over STS: Return raw dict
- STS-->>Mixin: Dictionary
end
deactivate STS
@@ -524,7 +538,6 @@ sequenceDiagram
| `shell` / `bash` | `export AWS_ACCESS_KEY_ID=...` format | Execute in shell |
| `cmd` | Windows SET commands | Windows command prompt |
| `powershell` | PowerShell variable assignments | PowerShell scripts |
-| `dictionary` | Raw Python dict | Custom processing |
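+
+For example, on an STS-enabled folder (placeholder ID below; the permission values follow the existing `sts_transfer` conventions of `read_only` and `read_write`):
+
+```python
+from synapseclient.models import Folder
+
+folder = Folder(id="syn123").get()
+
+# Parsed credentials for programmatic use
+token = folder.get_sts_storage_token(permission="read_only", output_format="json")
+
+# Shell export commands, ready to paste into a terminal
+print(folder.get_sts_storage_token(permission="read_only", output_format="shell"))
+```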
---
@@ -574,11 +587,11 @@ stateDiagram-v2
### Setting Types
-| Type | Purpose |
-|------|---------|
-| `upload` | Configures upload destination storage location(s) |
-| `external_sync` | Configures external sync settings |
-| `requester_pays` | Configures requester-pays bucket access |
+| Type | Purpose | Status |
+|------|---------|--------|
+| `upload` | Configures upload destination storage location(s) | **Supported** |
+
+Other setting types may be added in the future.
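+
+For reference, the `upload` setting body exchanged with these endpoints looks roughly like the following sketch (field names follow the `UploadDestinationListSetting` REST model; IDs are placeholders):
+
+```python
+upload_setting = {
+    "concreteType": "org.sagebionetworks.repo.model.project.UploadDestinationListSetting",
+    "projectId": "syn456",
+    "settingsType": "upload",
+    "locations": [12345],  # storage location IDs; at most 10 per project
+}
+```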
---
@@ -593,7 +606,7 @@ Synapse REST API endpoints. This layer handles serialization and error handling.
flowchart TB
subgraph "Model Layer"
SL[StorageLocation]
- SLCM[StorageLocationConfigurable Mixin]
+ SLCM[StorageLocation Mixin]
end
subgraph "API Layer (storage_location_services.py)"
@@ -781,5 +794,5 @@ sequenceDiagram
|----------|-------------|
| [Storage Location Tutorial](../tutorials/python/storage_location.md) | Step-by-step guide to using storage locations |
| [StorageLocation API Reference][synapseclient.models.StorageLocation] | Complete API documentation |
-| [StorageLocationConfigurable Mixin][synapseclient.models.mixins.StorageLocationConfigurable] | Mixin methods for Projects and Folders |
+| [StorageLocation Mixin][synapseclient.models.mixins.StorageLocation] | Mixin methods for Projects and Folders |
| [Custom Storage Locations (Synapse Docs)](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html) | Official Synapse documentation |
From 52d2b0f209bbc56d2386ec0d94a3ee73d2f8dd88 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Mon, 23 Feb 2026 09:57:43 -0800
Subject: [PATCH 04/31] remove unsupported setting type
---
synapseclient/api/storage_location_services.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/synapseclient/api/storage_location_services.py b/synapseclient/api/storage_location_services.py
index c73c7e8cc..87e9f77a9 100644
--- a/synapseclient/api/storage_location_services.py
+++ b/synapseclient/api/storage_location_services.py
@@ -76,8 +76,7 @@ async def get_project_setting(
Arguments:
project_id: The Synapse ID of the project or folder.
- setting_type: The type of setting to retrieve. One of:
- 'upload', 'external_sync', 'requester_pays'.
+ setting_type: The type of setting to retrieve. Currently supports 'upload' only.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
From 0b04d1b22092886a6db1b2ceb02d7f1d762e7bec Mon Sep 17 00:00:00 2001
From: danlu1
Date: Tue, 24 Feb 2026 20:21:15 -0800
Subject: [PATCH 05/31] update api service and correct docstrings
---
.../api/storage_location_services.py | 66 +++++++++++--------
1 file changed, 37 insertions(+), 29 deletions(-)
diff --git a/synapseclient/api/storage_location_services.py b/synapseclient/api/storage_location_services.py
index 87e9f77a9..4ee2d7bf5 100644
--- a/synapseclient/api/storage_location_services.py
+++ b/synapseclient/api/storage_location_services.py
@@ -12,31 +12,31 @@
async def create_storage_location_setting(
- body: Dict[str, Any],
+ request: Dict[str, Any],
*,
synapse_client: Optional["Synapse"] = None,
) -> Dict[str, Any]:
- """Create a new storage location setting in Synapse.
+ """Create a new storage location in Synapse that can be linked to a project,
+ allowing users to upload their data to a storage location they own.
Storage location creation is idempotent per user - if the same user creates
a storage location with identical properties, the existing one is returned.
Arguments:
- body: The storage location setting request body containing concreteType
- and other type-specific fields.
+ request: The storage location setting request body matching the `StorageLocationSetting` REST model.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
Returns:
- The created or existing storage location setting as a dictionary.
+ The created (or existing) storage location setting matching the `StorageLocationSetting` REST model.
"""
from synapseclient import Synapse
client = Synapse.get_client(synapse_client=synapse_client)
return await client.rest_post_async(
uri="/storageLocation",
- body=json.dumps(body),
+ body=json.dumps(request),
)
@@ -56,7 +56,7 @@ async def get_storage_location_setting(
instance from the Synapse class constructor.
Returns:
- The storage location setting as a dictionary.
+ The storage location setting matching the `StorageLocationSetting` REST model.
"""
from synapseclient import Synapse
@@ -68,91 +68,99 @@ async def get_storage_location_setting(
async def get_project_setting(
project_id: str,
- setting_type: str,
+ project_setting_type: str,
*,
synapse_client: Optional["Synapse"] = None,
) -> Optional[Dict[str, Any]]:
- """Get the project setting for an entity.
+ """Retrieve the project setting of a particular setting type for the project or folder.
+ Only users with READ access on a project can retrieve its project settings.
Arguments:
project_id: The Synapse ID of the project or folder.
- setting_type: The type of setting to retrieve. Currently supports 'upload' only.
+ project_setting_type: The type of project setting to retrieve. Currently supports 'upload' only.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
Returns:
- The project setting as a dictionary, or None if no setting exists.
+ The upload destination list setting matching the `UploadDestinationListSetting` REST model,
+ or an empty string if no project setting exists (the default Synapse storage).
"""
from synapseclient import Synapse
client = Synapse.get_client(synapse_client=synapse_client)
response = await client.rest_get_async(
- uri=f"/projectSettings/{project_id}/type/{setting_type}",
+ uri=f"/projectSettings/{project_id}/type/{project_setting_type}",
)
- # If no project setting, an empty string is returned as the response
- return response if response else None
+ return response
async def create_project_setting(
- body: Dict[str, Any],
+ request: Dict[str, Any],
*,
synapse_client: Optional["Synapse"] = None,
) -> Dict[str, Any]:
- """Create a new project setting.
+ """Create a project setting for a project or folder.
+ Only users with CREATE access to the project or folder can add a project setting.
+ Currently, only the "upload" project setting is supported, implemented by the `UploadDestinationListSetting` REST model.
+ A project can have a maximum of 10 storage locations.
Arguments:
- body: The project setting request body.
+ request: The project setting request body matching <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/ProjectSetting.html>.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
Returns:
- The created project setting as a dictionary.
+ The created project setting matching <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/ProjectSetting.html>.
"""
from synapseclient import Synapse
client = Synapse.get_client(synapse_client=synapse_client)
return await client.rest_post_async(
uri="/projectSettings",
- body=json.dumps(body),
+ body=json.dumps(request),
)
async def update_project_setting(
- body: Dict[str, Any],
+ request: Dict[str, Any],
*,
synapse_client: Optional["Synapse"] = None,
-) -> Dict[str, Any]:
- """Update an existing project setting.
+) -> None:
+ """Update an existing project setting for a project or folder.
+ Only users with UPDATE access to the project or folder can update a project setting.
+ Currently, only the "upload" project setting is supported. This is implemented using UploadDestinationListSetting matching <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/UploadDestinationListSetting.html>.
+ A project can have a maximum of 10 storage locations.
Arguments:
- body: The project setting request body including the id field.
+ request: The project setting request body, including the id field, matching <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/ProjectSetting.html>.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
Returns:
- The updated project setting as a dictionary.
+ None
"""
from synapseclient import Synapse
client = Synapse.get_client(synapse_client=synapse_client)
return await client.rest_put_async(
uri="/projectSettings",
- body=json.dumps(body),
+ body=json.dumps(request),
)
async def delete_project_setting(
- setting_id: str,
+ project_setting_id: str,
*,
synapse_client: Optional["Synapse"] = None,
) -> None:
- """Delete a project setting.
+ """Delete a project setting for a project or folder.
+ Only users with DELETE access to the project or folder can delete a project setting.
Arguments:
- setting_id: The ID of the project setting to delete.
+ project_setting_id: The ID of the project setting to delete.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
@@ -164,5 +172,5 @@ async def delete_project_setting(
client = Synapse.get_client(synapse_client=synapse_client)
await client.rest_delete_async(
- uri=f"/projectSettings/{setting_id}",
+ uri=f"/projectSettings/{project_setting_id}",
)
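
[Editor's note] The renamed parameters above change only the Python call sites, not the wire format. A minimal sketch of how these service functions might be driven end to end; the `concreteType` values and response keys follow Synapse REST model naming and are illustrative assumptions, not verified against this branch:

```python
# Illustrative sketch only: request shapes and response keys are assumed
# from Synapse REST model naming conventions.
import asyncio

import synapseclient
from synapseclient.api.storage_location_services import (
    create_project_setting,
    create_storage_location_setting,
)


async def main() -> None:
    syn = synapseclient.Synapse()
    syn.login()

    # Idempotent per user: re-creating with identical properties returns
    # the existing storage location.
    storage_location = await create_storage_location_setting(
        request={
            "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
            "bucket": "my-external-synapse-bucket",
            "uploadType": "S3",
        },
        synapse_client=syn,
    )

    # Bind the storage location to a project via an "upload" setting.
    await create_project_setting(
        request={
            "concreteType": "org.sagebionetworks.repo.model.project.UploadDestinationListSetting",
            "projectId": "syn123",
            "settingsType": "upload",
            "locations": [storage_location["storageLocationId"]],
        },
        synapse_client=syn,
    )


asyncio.run(main())
```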
From 4f0cf4221eb1e03510c3be71410b72c5be6b2ee7 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 6 Mar 2026 12:29:52 -0800
Subject: [PATCH 06/31] update docs
---
.../storage_location_architecture.md | 35 +++++++++++++++----
1 file changed, 28 insertions(+), 7 deletions(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index 47e668097..bf17fce9a 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -91,7 +91,6 @@ classDiagram
+str benefactor_id
+store() StorageLocation
+get() StorageLocation
- +setup_s3() Tuple~Folder, StorageLocation~
+fill_from_dict(dict) StorageLocation
}
@@ -114,7 +113,7 @@ classDiagram
NONE
}
- class StorageLocation {
+ class StorageLocationConfigurable {
<<mixin>>
+set_storage_location(storage_location_id)
+get_project_setting(setting_type)
@@ -136,25 +135,47 @@ classDiagram
+str parent_id
}
+ class UploadDestinationListSetting {
+ <<dataclass>>
+ concreteType
+ id
+ projectId
+ settingsType
+ etag
+ locations
+ }
+
+ class ProjectSetting {
+ <<dataclass>>
+ concreteType
+ id
+ projectId
+ settingsType
+ etag
+
+ }
StorageLocation --> StorageLocationType : storage_type
StorageLocation --> UploadType : upload_type
- StorageLocation <|-- Project : implements
- StorageLocation <|-- Folder : implements
+ StorageLocationConfigurable <|-- Project : implements
+ StorageLocationConfigurable <|-- Folder : implements
```
+
### Key Components
[synapseclient.models.StorageLocation] | The model representing a storage location setting in Synapse |
[synapseclient.models.StorageLocationType] | Enumeration defining the supported storage backend types |
[synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type |
-[synapseclient.models.mixins.StorageLocation] | Mixin providing storage management methods to entities |
+[synapseclient.models.mixins.StorageLocationConfigurable] | Mixin providing storage management methods to entities |
+[synapseclient.models.mixins.UploadDestinationListSetting] | Enumeration defining the setting type that contains the list of upload locations for files in entities |
+[synapseclient.models.mixins.ProjectSetting] | Enumeration defining the project based setting |
---
-## Storage Type Mapping
+## Storage Type Mapping (TODO: double-check that EXTERNAL_HTTP works as expected)
Each `StorageLocationType` maps to a specific REST API `concreteType` and has a
default `UploadType`. This mapping allows the system to parse
@@ -748,7 +769,7 @@ sequenceDiagram
deactivate Entity
end
- rect rgb(255, 248, 240)
+ rect rgb(240, 248, 255)
Note over User,Synapse: Phase 2: Migrate Files
User->>Entity: migrate_indexed_files(db_path)
activate Entity
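
[Editor's note] For the Storage Type Mapping section flagged TODO above, a sketch of how a type-to-`concreteType` table could be expressed, limited to the two S3-backed types exercised elsewhere in this document; the REST model names are assumptions based on Synapse naming conventions:

```python
# Illustrative only: the concreteType strings are assumptions based on
# Synapse REST model naming.
from enum import Enum


class StorageLocationType(str, Enum):
    SYNAPSE_S3 = "SYNAPSE_S3"
    EXTERNAL_S3 = "EXTERNAL_S3"


_PREFIX = "org.sagebionetworks.repo.model.project."

# StorageLocationType -> (REST concreteType, default UploadType)
STORAGE_TYPE_MAPPING = {
    StorageLocationType.SYNAPSE_S3: (_PREFIX + "S3StorageLocationSetting", "S3"),
    StorageLocationType.EXTERNAL_S3: (_PREFIX + "ExternalS3StorageLocationSetting", "S3"),
}

concrete_type, default_upload = STORAGE_TYPE_MAPPING[StorageLocationType.EXTERNAL_S3]
print(concrete_type, default_upload)
```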
From e33db0b82039b80225256770b4e0f9ac067bd3ae Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 6 Mar 2026 12:31:15 -0800
Subject: [PATCH 07/31] update parameter name and output of get_project_setting
---
synapseclient/api/storage_location_services.py | 12 +++++++-----
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/synapseclient/api/storage_location_services.py b/synapseclient/api/storage_location_services.py
index 4ee2d7bf5..c80070dde 100644
--- a/synapseclient/api/storage_location_services.py
+++ b/synapseclient/api/storage_location_services.py
@@ -68,7 +68,7 @@ async def get_storage_location_setting(
async def get_project_setting(
project_id: str,
- project_setting_type: str,
+ setting_type: str = "upload",
*,
synapse_client: Optional["Synapse"] = None,
) -> Optional[Dict[str, Any]]:
@@ -77,22 +77,24 @@ async def get_project_setting(
Arguments:
project_id: The Synapse ID of the project or folder.
- project_setting_type: The type of project setting to retrieve. Currently supports 'upload' only.
+ setting_type: The type of project setting to retrieve. Currently supports 'upload' only.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
Returns:
The upload destination list setting matching <https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/UploadDestinationListSetting.html>.
- If the storage location is Synapse S3, the response will be an empty string.
+ If the storage location is Synapse S3, the response will be None.
"""
from synapseclient import Synapse
client = Synapse.get_client(synapse_client=synapse_client)
response = await client.rest_get_async(
- uri=f"/projectSettings/{project_id}/type/{project_setting_type}",
+ uri=f"/projectSettings/{project_id}/type/{setting_type}",
)
- return response
+ return (
+ response if response else None
) # if no project setting, an empty string is returned as the response
async def create_project_setting(
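
[Editor's note] After this change, callers of `get_project_setting` can branch on `None` rather than testing for an empty string. A small sketch of the consuming side:

```python
# Sketch of the consuming side after this change.
import asyncio

import synapseclient
from synapseclient.api.storage_location_services import get_project_setting


async def main() -> None:
    syn = synapseclient.Synapse()
    syn.login()

    # setting_type now defaults to "upload"
    setting = await get_project_setting("syn123", synapse_client=syn)
    if setting is None:
        print("No upload setting; default/inherited storage is in effect.")
    else:
        print(f"Upload locations: {setting['locations']}")


asyncio.run(main())
```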
From 1364b994106fde5b36f0323753e357882d931938 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 6 Mar 2026 13:30:44 -0800
Subject: [PATCH 08/31] test mermaid
---
docs/explanations/storage_location_architecture.md | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index bf17fce9a..38663c226 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -115,12 +115,12 @@ classDiagram
class StorageLocationConfigurable {
<<mixin>>
- +set_storage_location(storage_location_id)
- +get_project_setting(setting_type)
+ +set_storage_location(storage_location_id) ProjectSetting
+ +get_project_setting(setting_type) ProjectSetting
+delete_project_setting(setting_id)
- +get_sts_storage_token(permission, output_format)
- +index_files_for_migration(dest_storage_location_id, db_path)
- +migrate_indexed_files(db_path)
+ +get_sts_storage_token(permission, output_format) dict
+ +index_files_for_migration(dest_storage_location_id, db_path) MigrationResult
+ +migrate_indexed_files(db_path) MigrationResult
}
class Project {
@@ -158,6 +158,9 @@ classDiagram
StorageLocation --> UploadType : upload_type
StorageLocationConfigurable <|-- Project : implements
StorageLocationConfigurable <|-- Folder : implements
+ StorageLocationConfigurable ..> ProjectSetting : returns
+ StorageLocationConfigurable ..> UploadDestinationListSetting : uses
+
```
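
[Editor's note] Given the return types now annotated in the diagram, a hypothetical call against the mixin surface might look like the following; the `permission` and `output_format` values are assumptions for illustration:

```python
# Hypothetical usage of the mixin surface shown above; argument values
# ("read_only", "json") are assumptions for illustration.
import synapseclient
from synapseclient.models import Folder

syn = synapseclient.Synapse()
syn.login()

folder = Folder(id="syn456").get()

# Returns a dict of temporary AWS credentials per the diagram's signature.
credentials = folder.get_sts_storage_token(
    permission="read_only",
    output_format="json",
)
print(credentials)
```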
From 7cf2d68d5914b596c26817e6eedf4686900c11be Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 6 Mar 2026 13:31:54 -0800
Subject: [PATCH 09/31] update key components
---
docs/explanations/storage_location_architecture.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index 38663c226..7a085016b 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -172,7 +172,7 @@ classDiagram
[synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type |
[synapseclient.models.mixins.StorageLocationConfigurable] | Mixin providing storage management methods to entities |
[synapseclient.models.mixins.UploadDestinationListSetting] | Enumeration defining the setting type that contains the list of upload locations for files in entities |
-[synapseclient.models.mixins.ProjectSetting] | Enumeration defining the project based setting |
+[synapseclient.models.mixins.ProjectSetting] | Enumeration defining the project based setting
---
From e5937cadb2f6a7779082973b5534260badd55ec6 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 6 Mar 2026 14:16:58 -0800
Subject: [PATCH 10/31] update docs
---
.../storage_location_architecture.md | 113 +++++-------------
1 file changed, 27 insertions(+), 86 deletions(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index 7a085016b..08a8cf7d1 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -167,12 +167,15 @@ classDiagram
### Key Components
-[synapseclient.models.StorageLocation] | The model representing a storage location setting in Synapse |
-[synapseclient.models.StorageLocationType] | Enumeration defining the supported storage backend types |
-[synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type |
-[synapseclient.models.mixins.StorageLocationConfigurable] | Mixin providing storage management methods to entities |
-[synapseclient.models.mixins.UploadDestinationListSetting] | Enumeration defining the setting type that contains the list of upload locations for files in entities |
-[synapseclient.models.mixins.ProjectSetting] | Enumeration defining the project based setting
+
+| Component | Description |
+|-----------|-------------|
+| [synapseclient.models.StorageLocation] | The model representing a storage location setting in Synapse |
+| [synapseclient.models.StorageLocationType] | Enumeration defining the supported storage backend types |
+| [synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type |
+| [synapseclient.models.mixins.StorageLocationConfigurable] | Mixin providing storage management methods to entities |
+| [synapseclient.models.mixins.UploadDestinationListSetting] | Dataclass defining the upload destination list setting containing storage location IDs |
+| [synapseclient.models.mixins.ProjectSetting] | Dataclass defining the base project setting structure |
---
@@ -405,79 +408,6 @@ sequenceDiagram
-### Setup S3 Convenience Flow
-
-The `setup_s3()` class method creates a folder with S3 storage in a single call.
-```mermaid
-sequenceDiagram
- participant User
- participant setup_s3 as StorageLocation.setup_s3()
- participant StorageLocation
- participant Folder
- participant Mixin as StorageLocation
- participant API as storage_location_services
- participant Synapse as Synapse REST API
-
- User->>setup_s3: setup_s3(parent, folder_name, bucket_name)
- activate setup_s3
-
- Note over setup_s3: Validate: folder_name XOR folder
-
- alt folder_name provided
- setup_s3->>Folder: Folder(name, parent_id).store()
- activate Folder
- Folder->>Synapse: POST /entity
- Synapse-->>Folder: Folder response
- Folder-->>setup_s3: New Folder
- deactivate Folder
- else folder ID provided
- setup_s3->>Folder: Folder(id).get()
- activate Folder
- Folder->>Synapse: GET /entity/{id}
- Synapse-->>Folder: Folder response
- Folder-->>setup_s3: Existing Folder
- deactivate Folder
- end
-
- alt bucket_name provided
- Note over setup_s3: storage_type = EXTERNAL_S3
- else bucket_name is None
- Note over setup_s3: storage_type = SYNAPSE_S3
- end
-
- setup_s3->>StorageLocation: StorageLocation(...).store()
- activate StorageLocation
- StorageLocation->>Synapse: POST /storageLocation
- Synapse-->>StorageLocation: StorageLocation response
- StorageLocation-->>setup_s3: StorageLocation
- deactivate StorageLocation
-
- setup_s3->>Mixin: folder.set_storage_location(storage_location_id)
- activate Mixin
-
- Mixin->>API: get_project_setting(project_id, "upload")
- API->>Synapse: GET /projectSettings/{id}/type/upload
- Synapse-->>API: Setting or empty
-
- alt Setting exists
- API-->>Mixin: Existing setting
- Mixin->>API: update_project_setting(body)
- API->>Synapse: PUT /projectSettings
- else No setting
- Mixin->>API: create_project_setting(body)
- API->>Synapse: POST /projectSettings
- end
-
- Synapse-->>API: Project setting response
- API-->>Mixin: Updated setting
- deactivate Mixin
-
- setup_s3-->>User: (Folder, StorageLocation)
- deactivate setup_s3
-```
-
-
-
### STS Token Retrieval
STS (AWS Security Token Service) enables direct S3 access using temporary credentials.
@@ -583,27 +513,38 @@ entity. The following state diagram shows the lifecycle of a project setting.
stateDiagram-v2
[*] --> NoSetting: Entity created
- NoSetting --> Created: set_storage_location()
+ NoSetting --> Created: set_storage_location()\ncreates new setting
Note right of NoSetting: Inherits from parent\nor uses Synapse default
- Created --> Updated: set_storage_location()\nwith different locations
- Updated --> Updated: set_storage_location()\nwith different locations
+ Created --> Updated: set_storage_location()\nupdates existing setting
+ Updated --> Updated: set_storage_location()\nupdates existing setting
+
+ Created --> Deleted: delete_project_setting(project_setting_id)
+ Updated --> Deleted: delete_project_setting(project_setting_id)
- Created --> Deleted: delete_project_setting()
- Updated --> Deleted: delete_project_setting()
+ Deleted --> NoSetting: Returns to default\n(inherits from parent)
- Deleted --> NoSetting: Returns to default
+ state NoSetting {
+ [*] --> Inherited
+ Inherited: No project setting exists
+ Inherited: Uses parent or Synapse default (ID=1)
+ }
state Created {
[*] --> Active
+ Active: concreteType = UploadDestinationListSetting
Active: locations = [storage_location_id]
Active: settingsType = "upload"
+ Active: projectId = entity.id
+ Active: Has id and etag
}
state Updated {
[*] --> Modified
- Modified: locations = [new_id, ...]
+ Modified: concreteType = UploadDestinationListSetting
+ Modified: locations = [new_id, ...] (max 10)
Modified: settingsType = "upload"
+ Modified: etag updated (OCC)
}
```
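
[Editor's note] A sketch of the lifecycle the state diagram describes, using the mixin methods named above; the dict-style access to the returned setting is an assumption:

```python
# Lifecycle sketch: NoSetting -> Created -> Updated -> Deleted -> NoSetting.
# Assumes the returned ProjectSetting is dict-like with an "id" field.
import synapseclient
from synapseclient.models import Project

syn = synapseclient.Synapse()
syn.login()

project = Project(id="syn123").get()

# NoSetting -> Created (subsequent calls: Created/Updated -> Updated)
project.set_storage_location(storage_location_id=98765)

# Inspect the active "upload" setting
current = project.get_project_setting(setting_type="upload")

# Created/Updated -> Deleted: entity reverts to parent or Synapse default
project.delete_project_setting(current["id"])
```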
From f03397169cd3645fa5e8e9d3d162f3475b2f533e Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 6 Mar 2026 14:19:19 -0800
Subject: [PATCH 11/31] remove background color for migration flow
---
docs/explanations/storage_location_architecture.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index 08a8cf7d1..e37fb97af 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -689,7 +689,7 @@ sequenceDiagram
participant MigrateFn as migrate_indexed_files
participant Synapse as Synapse REST API
- rect rgb(240, 248, 255)
+ rect
Note over User,Synapse: Phase 1: Index Files
User->>Entity: index_files_for_migration(dest_id, db_path)
activate Entity
@@ -713,7 +713,7 @@ sequenceDiagram
deactivate Entity
end
- rect rgb(240, 248, 255)
+ rect
Note over User,Synapse: Phase 2: Migrate Files
User->>Entity: migrate_indexed_files(db_path)
activate Entity
From 140249de8bda6a0ca56e51f7b6d5d74f29cd2d08 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 6 Mar 2026 15:31:48 -0800
Subject: [PATCH 12/31] remove background color for migration flow
---
.../storage_location_architecture.md | 78 +++++++++----------
1 file changed, 37 insertions(+), 41 deletions(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index e37fb97af..29a7d140b 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -513,16 +513,16 @@ entity. The following state diagram shows the lifecycle of a project setting.
stateDiagram-v2
[*] --> NoSetting: Entity created
- NoSetting --> Created: set_storage_location()\ncreates new setting
- Note right of NoSetting: Inherits from parent\nor uses Synapse default
+ NoSetting --> Created: set_storage_location()
+ Note right of NoSetting: Inherits from parent or uses Synapse default
- Created --> Updated: set_storage_location()\nupdates existing setting
- Updated --> Updated: set_storage_location()\nupdates existing setting
+ Created --> Updated: set_storage_location() updates existing setting
+ Updated --> Updated: set_storage_location() nupdates existing setting
Created --> Deleted: delete_project_setting(project_setting_id)
Updated --> Deleted: delete_project_setting(project_setting_id)
- Deleted --> NoSetting: Returns to default\n(inherits from parent)
+ Deleted --> NoSetting: Returns to default (inherits from parent)
state NoSetting {
[*] --> Inherited
@@ -574,7 +574,7 @@ flowchart TB
SLCM[StorageLocation Mixin]
end
- subgraph "API Layer (storage_location_services.py)"
+ subgraph "API Layer"
create_sls[create_storage_location_setting]
get_sls[get_storage_location_setting]
get_ps[get_project_setting]
@@ -689,52 +689,48 @@ sequenceDiagram
participant MigrateFn as migrate_indexed_files
participant Synapse as Synapse REST API
- rect
- Note over User,Synapse: Phase 1: Index Files
- User->>Entity: index_files_for_migration(dest_id, db_path)
- activate Entity
-
- Entity->>IndexFn: Start indexing
- activate IndexFn
-
- IndexFn->>Synapse: Query entity tree
- Synapse-->>IndexFn: File list
+ Note over User,Synapse: === Phase 1: Index Files ===
+ User->>Entity: index_files_for_migration(dest_id, db_path)
+ activate Entity
- loop For each file
- IndexFn->>Synapse: Get file metadata
- Synapse-->>IndexFn: File info
- IndexFn->>DB: Record file for migration
- end
+ Entity->>IndexFn: Start indexing
+ activate IndexFn
- IndexFn-->>Entity: MigrationResult (indexed counts)
- deactivate IndexFn
+ IndexFn->>Synapse: Query entity tree
+ Synapse-->>IndexFn: File list
- Entity-->>User: MigrationResult
- deactivate Entity
+ loop For each file
+ IndexFn->>Synapse: Get file metadata
+ Synapse-->>IndexFn: File info
+ IndexFn->>DB: Record file for migration
end
- rect
- Note over User,Synapse: Phase 2: Migrate Files
- User->>Entity: migrate_indexed_files(db_path)
- activate Entity
+ IndexFn-->>Entity: MigrationResult (indexed counts)
+ deactivate IndexFn
- Entity->>MigrateFn: Start migration
- activate MigrateFn
+ Entity-->>User: MigrationResult
+ deactivate Entity
- MigrateFn->>DB: Read indexed files
+ Note over User,Synapse: === Phase 2: Migrate Files ===
+ User->>Entity: migrate_indexed_files(db_path)
+ activate Entity
- loop For each indexed file
- MigrateFn->>Synapse: Copy file to new storage
- Synapse-->>MigrateFn: Success/Failure
- MigrateFn->>DB: Update status
- end
+ Entity->>MigrateFn: Start migration
+ activate MigrateFn
- MigrateFn-->>Entity: MigrationResult (migrated counts)
- deactivate MigrateFn
+ MigrateFn->>DB: Read indexed files
- Entity-->>User: MigrationResult
- deactivate Entity
+ loop For each indexed file
+ MigrateFn->>Synapse: Copy file to new storage
+ Synapse-->>MigrateFn: Success/Failure
+ MigrateFn->>DB: Update status
end
+
+ MigrateFn-->>Entity: MigrationResult (migrated counts)
+ deactivate MigrateFn
+
+ Entity-->>User: MigrationResult
+ deactivate Entity
```
From 0b2937df8a1599d8fe5664afb82e0399d6ad9034 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Thu, 12 Mar 2026 09:43:28 -0700
Subject: [PATCH 13/31] modify migration flow
---
.../storage_location_architecture.md | 52 +++++++++++++++----
1 file changed, 43 insertions(+), 9 deletions(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index 29a7d140b..6bae25f85 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -690,22 +690,56 @@ sequenceDiagram
participant Synapse as Synapse REST API
Note over User,Synapse: === Phase 1: Index Files ===
- User->>Entity: index_files_for_migration(dest_id, db_path)
+ User->>Entity: index_files_for_migration
activate Entity
- Entity->>IndexFn: Start indexing
+ Entity->>IndexFn: index_files_for_migration_async(dest_id, source_ids, file_version_strategy, include_table_files)
activate IndexFn
- IndexFn->>Synapse: Query entity tree
- Synapse-->>IndexFn: File list
+ IndexFn->>Synapse: Verify user owns destination storage location
+ Synapse-->>IndexFn: OK / error
+
+ IndexFn->>DB: Create/open DB + ensure schema
+ IndexFn->>DB: Store migration settings (root_id, dest_id, source_ids, file_version_strategy, include_table_files)
+
+ alt Entity is Project/Folder (container)
+ IndexFn->>Synapse: get_children(parent, include_types)
+ Synapse-->>IndexFn: Child references (folders/files/tables)
+
+ loop For each child (bounded concurrency)
+ IndexFn->>Synapse: get_async(child, downloadFile=false)
+ Synapse-->>IndexFn: Child entity
+ IndexFn->>IndexFn: _index_entity_async(child)
+ end
+
+ IndexFn->>DB: Mark container as indexed (PROJECT/FOLDER)
+
+ else Entity is File
+ alt file_version_strategy = new / latest / all
+ IndexFn->>Synapse: Get file handle metadata (and versions if needed)
+ Synapse-->>IndexFn: File handle(s)
+ IndexFn->>DB: Insert FILE migration rows (or ALREADY_MIGRATED)
+ else file_version_strategy = skip
+ Note over IndexFn: Skip file entities
+ end
+
+ else Entity is Table (include_table_files=true)
+ IndexFn->>Synapse: get_columns(table_id)
+ Synapse-->>IndexFn: Column list
+ IndexFn->>Synapse: Query rows for FILEHANDLEID columns (+ rowId,rowVersion)
+ Synapse-->>IndexFn: Row results (fileHandleId values)
+ loop For each row + file-handle cell
+ IndexFn->>Synapse: get_file_handle_for_download(fileHandleId, objectType=TableEntity)
+ Synapse-->>IndexFn: File handle
+ IndexFn->>DB: Insert TABLE_ATTACHED_FILE migration row (or ALREADY_MIGRATED)
+ end
+ end
- loop For each file
- IndexFn->>Synapse: Get file metadata
- Synapse-->>IndexFn: File info
- IndexFn->>DB: Record file for migration
+ opt continue_on_error=true
+ Note over IndexFn,DB: Indexing errors are recorded in DB instead of aborting
end
- IndexFn-->>Entity: MigrationResult (indexed counts)
+ IndexFn-->>Entity: MigrationResult (db_path)
deactivate IndexFn
Entity-->>User: MigrationResult
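
[Editor's note] A hypothetical indexing call matching the Phase 1 diagram; parameter names are taken from the class and sequence diagrams and should be treated as assumptions until the final API lands:

```python
# Hypothetical Phase 1 call; names mirror the diagrams above.
import asyncio

import synapseclient
from synapseclient.models import Project


async def main() -> None:
    syn = synapseclient.Synapse()
    syn.login()

    result = await Project(id="syn123").index_files_for_migration_async(
        dest_storage_location_id=98765,
        db_path="/tmp/migration_index.db",
        file_version_strategy="new",  # new | latest | all | skip
        include_table_files=True,  # also index TABLE_ATTACHED_FILE rows
    )
    print(result)  # MigrationResult (db_path)


asyncio.run(main())
```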
From 8d36adf9f341c5e6821860c01b4cc820019815e3 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Thu, 12 Mar 2026 09:51:55 -0700
Subject: [PATCH 14/31] remove setup_s3
---
synapseclient/models/storage_location.py | 129 +----------------------
1 file changed, 1 insertion(+), 128 deletions(-)
diff --git a/synapseclient/models/storage_location.py b/synapseclient/models/storage_location.py
index 664276855..a3ebe6f12 100644
--- a/synapseclient/models/storage_location.py
+++ b/synapseclient/models/storage_location.py
@@ -2,7 +2,7 @@
from dataclasses import dataclass, field
from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional
from synapseclient import Synapse
from synapseclient.api.storage_location_services import (
@@ -14,9 +14,6 @@
StorageLocationSynchronousProtocol,
)
-if TYPE_CHECKING:
- from synapseclient.models import Folder
-
class StorageLocationType(str, Enum):
"""Enumeration of storage location types supported by Synapse.
@@ -474,127 +471,3 @@ async def main():
)
self.fill_from_dict(response)
return self
-
- @classmethod
- async def setup_s3_async(
- cls,
- *,
- parent: str,
- folder_name: Optional[str] = None,
- folder: Optional[Union["Folder", str]] = None,
- bucket_name: Optional[str] = None,
- base_key: Optional[str] = None,
- sts_enabled: bool = False,
- synapse_client: Optional[Synapse] = None,
- ) -> Tuple["Folder", "StorageLocation"]:
- """Convenience method to create a folder backed by S3 storage. This will:
-
- 1. Create or retrieve the folder
- 2. Create the storage location setting
- 3. Apply the storage location to the folder via project settings
-
- Arguments:
- parent: The parent project or folder ID (e.g., "syn123").
- folder_name: Name for a new folder. Either `folder_name` or `folder`
- must be provided.
- folder: An existing Folder object or Synapse ID. Either `folder_name`
- or `folder` must be provided.
- bucket_name: The S3 bucket name. If None, uses Synapse default storage.
- base_key: The base key (prefix) within the bucket. Optional.
- sts_enabled: Whether to enable STS credentials for this storage location.
- Default: False.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- A tuple of (Folder, StorageLocation).
-
- Raises:
- ValueError: If neither `folder_name` nor `folder` is provided, or if both
- are provided.
-
- Example: Using this function
- Create an STS-enabled folder with external S3 storage:
-
- import asyncio
- from synapseclient import Synapse
- from synapseclient.models import StorageLocation
-
- syn = Synapse()
- syn.login()
-
- async def main():
- folder, storage = await StorageLocation.setup_s3_async(
- folder_name="my-sts-folder",
- parent="syn123",
- bucket_name="my-external-synapse-bucket",
- base_key="path/within/bucket",
- sts_enabled=True,
- )
- print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}")
-
- asyncio.run(main())
-
- Example: Using existing folder
- Apply S3 storage to an existing folder:
-
- import asyncio
- from synapseclient import Synapse
- from synapseclient.models import StorageLocation
-
- syn = Synapse()
- syn.login()
-
- async def main():
- folder, storage = await StorageLocation.setup_s3_async(
- folder="syn456",
- bucket_name="my-bucket",
- )
-
- asyncio.run(main())
- """
- # Import here to avoid circular imports
- from synapseclient.models import Folder as FolderModel
-
- # Validate parameters
- if folder_name and folder:
- raise ValueError(
- "folder and folder_name are mutually exclusive, only one should be passed"
- )
- if not folder_name and not folder:
- raise ValueError("Either folder or folder_name is required")
-
- # Create or get the folder
- if folder_name:
- target_folder = await FolderModel(
- name=folder_name, parent_id=parent
- ).store_async(synapse_client=synapse_client)
- elif isinstance(folder, str):
- target_folder = await FolderModel(id=folder).get_async(
- synapse_client=synapse_client
- )
- else:
- target_folder = folder
-
- # Determine storage type
- if bucket_name:
- storage_type = StorageLocationType.EXTERNAL_S3
- else:
- storage_type = StorageLocationType.SYNAPSE_S3
-
- # Create the storage location
- storage_location = await cls(
- storage_type=storage_type,
- bucket=bucket_name,
- base_key=base_key,
- sts_enabled=sts_enabled,
- ).store_async(synapse_client=synapse_client)
-
- # Apply the storage location to the folder
- await target_folder.set_storage_location_async(
- storage_location_id=storage_location.storage_location_id,
- synapse_client=synapse_client,
- )
-
- return target_folder, storage_location
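
[Editor's note] With `setup_s3_async` removed, its behavior can still be composed from the pieces it orchestrated, as the deleted body shows. A sketch of the manual equivalent:

```python
# Manual composition of the removed helper's three steps.
import asyncio

import synapseclient
from synapseclient.models import Folder, StorageLocation, StorageLocationType


async def main() -> None:
    syn = synapseclient.Synapse()
    syn.login()

    # 1. Create (or fetch) the folder.
    folder = await Folder(name="my-sts-folder", parent_id="syn123").store_async()

    # 2. Create the storage location setting.
    storage = await StorageLocation(
        storage_type=StorageLocationType.EXTERNAL_S3,
        bucket="my-external-synapse-bucket",
        base_key="path/within/bucket",
        sts_enabled=True,
    ).store_async()

    # 3. Apply it to the folder via project settings.
    await folder.set_storage_location_async(
        storage_location_id=storage.storage_location_id
    )


asyncio.run(main())
```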
From e37340bd5c96bc766457b5d464189486abf55168 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 10:49:51 -0700
Subject: [PATCH 15/31] refine migration flow
---
.../storage_location_architecture.md | 52 ++++++++++++++-----
1 file changed, 39 insertions(+), 13 deletions(-)
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
index 6bae25f85..40c7d2292 100644
--- a/docs/explanations/storage_location_architecture.md
+++ b/docs/explanations/storage_location_architecture.md
@@ -517,7 +517,7 @@ stateDiagram-v2
Note right of NoSetting: Inherits from parent or uses Synapse default
Created --> Updated: set_storage_location() updates existing setting
- Updated --> Updated: set_storage_location() nupdates existing setting
+ Updated --> Updated: set_storage_location() updates existing setting
Created --> Deleted: delete_project_setting(project_setting_id)
Updated --> Deleted: delete_project_setting(project_setting_id)
@@ -677,8 +677,7 @@ This section covers the file migration system.
## Migration Flow
-File migration is a two-phase process that moves files from one storage location
-to another while preserving Synapse metadata.
+File migration is a two-phase process. Phase 1 indexes every candidate file; Phase 2 runs an asynchronous, batched migration that reuses copied file handles where possible, respects concurrency limits, snapshots affected tables when needed, and updates entities and table cells through transactional table operations, recording per-item status in a SQLite database.
```mermaid
sequenceDiagram
@@ -690,7 +689,7 @@ sequenceDiagram
participant Synapse as Synapse REST API
Note over User,Synapse: === Phase 1: Index Files ===
- User->>Entity: index_files_for_migration
+ User->>Entity: index_files_for_migration_async
activate Entity
Entity->>IndexFn: index_files_for_migration_async(dest_id, source_ids, file_version_strategy, include_table_files)
@@ -718,7 +717,7 @@ sequenceDiagram
alt file_version_strategy = new / latest / all
IndexFn->>Synapse: Get file handle metadata (and versions if needed)
Synapse-->>IndexFn: File handle(s)
- IndexFn->>DB: Insert FILE migration rows (or ALREADY_MIGRATED)
+ IndexFn->>DB: Insert/append FILE migration rows (INDEXED and ALREADY_MIGRATED)
else file_version_strategy = skip
Note over IndexFn: Skip file entities
end
@@ -728,7 +727,7 @@ sequenceDiagram
Synapse-->>IndexFn: Column list
IndexFn->>Synapse: Query rows for FILEHANDLEID columns (+ rowId,rowVersion)
Synapse-->>IndexFn: Row results (fileHandleId values)
- loop For each row + file-handle cell
+ loop For each row + file-handle cell (bounded concurrency)
IndexFn->>Synapse: get_file_handle_for_download(fileHandleId, objectType=TableEntity)
Synapse-->>IndexFn: File handle
IndexFn->>DB: Insert TABLE_ATTACHED_FILE migration row (or ALREADY_MIGRATED)
@@ -746,19 +745,46 @@ sequenceDiagram
deactivate Entity
Note over User,Synapse: === Phase 2: Migrate Files ===
- User->>Entity: migrate_indexed_files(db_path)
+ User->>Entity: migrate_indexed_files / migrate_indexed_files_async (db_path)
activate Entity
Entity->>MigrateFn: Start migration
activate MigrateFn
- MigrateFn->>DB: Read indexed files
+ MigrateFn->>DB: Open DB, ensure schema, load settings
+ MigrateFn->>User: Confirm migration (unless force=True)
+ Note over MigrateFn,DB: If not confirmed, abort and return
+
+ loop While there are indexed items
+ MigrateFn->>DB: Query next batch (respecting pending/completed handles & concurrency)
+
+ loop For each item in batch
+ MigrateFn->>MigrateFn: Skip if key or file handle already pending
+
+ MigrateFn->>DB: Check if destination file handle already exists
+ alt Existing copy found
+ Note over MigrateFn,DB: Reuse existing to_file_handle_id
+ else No existing copy
+ MigrateFn->>Synapse: Copy file to new storage (bounded concurrency)
+ Synapse-->>MigrateFn: New to_file_handle_id
+ end
+
+ alt Item is FILE (entity)
+ alt file_version_strategy = new (version is None)
+ MigrateFn->>Synapse: Create new file version with new file handle
+ else specific version
+ MigrateFn->>Synapse: Update existing version's file handle
+ end
+ else Item is TABLE_ATTACHED_FILE
+ alt create_table_snapshots=True
+ MigrateFn->>Synapse: Create table snapshot
+ end
+ MigrateFn->>Synapse: Update table cell via transactional table update (PartialRowSet/TableUpdateTransaction)
+ end
+
+ MigrateFn->>DB: Update row status to MIGRATED/ERRORED
+ end
- loop For each indexed file
- MigrateFn->>Synapse: Copy file to new storage
- Synapse-->>MigrateFn: Success/Failure
- MigrateFn->>DB: Update status
- end
MigrateFn-->>Entity: MigrationResult (migrated counts)
deactivate MigrateFn
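
[Editor's note] An end-to-end sketch of the two-phase flow drawn above, using the synchronous method names from the class diagram; `force=True` mirrors the confirmation note in Phase 2:

```python
# Two-phase sketch: index, then migrate, with per-item status in SQLite.
import synapseclient
from synapseclient.models import Folder

syn = synapseclient.Synapse()
syn.login()

folder = Folder(id="syn456")

index_result = folder.index_files_for_migration(
    dest_storage_location_id=98765,
    db_path="/tmp/migration_index.db",
)

migrate_result = folder.migrate_indexed_files(
    db_path="/tmp/migration_index.db",
    force=True,  # skip the interactive confirmation shown in Phase 2
)
print(migrate_result)
```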
From 3e1a790306ea808edf530a530cb1c345aedd2d9e Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 10:51:05 -0700
Subject: [PATCH 16/31] remove manifest related changes
---
docs/tutorials/python/manifest_operations.md | 328 ------
synapseclient/models/download_list.py | 224 -----
synapseclient/models/mixins/manifest.py | 950 ------------------
.../models/protocols/manifest_protocol.py | 240 -----
.../models/unit_test_manifest.py | 499 ---------
5 files changed, 2241 deletions(-)
delete mode 100644 docs/tutorials/python/manifest_operations.md
delete mode 100644 synapseclient/models/download_list.py
delete mode 100644 synapseclient/models/mixins/manifest.py
delete mode 100644 synapseclient/models/protocols/manifest_protocol.py
delete mode 100644 tests/unit/synapseclient/models/unit_test_manifest.py
diff --git a/docs/tutorials/python/manifest_operations.md b/docs/tutorials/python/manifest_operations.md
deleted file mode 100644
index 25362a347..000000000
--- a/docs/tutorials/python/manifest_operations.md
+++ /dev/null
@@ -1,328 +0,0 @@
-# Manifest Operations
-
-This tutorial covers how to work with manifest TSV files for bulk file operations in Synapse. Manifest files provide a way to track file metadata, download files with their annotations, and upload files with provenance information.
-
-## Overview
-
-A manifest file is a tab-separated values (TSV) file that contains metadata about files in Synapse. The manifest includes:
-
-- File paths and Synapse IDs
-- Parent container IDs
-- Annotations
-- Provenance information (used/executed references)
-
-## Generating Manifests During Download
-
-When syncing files from Synapse, you can automatically generate a manifest file that captures all file metadata.
-
-### Using sync_from_synapse with Manifest Generation
-
-```python
-from synapseclient.models import Project
-import synapseclient
-
-synapseclient.login()
-
-# Download a project with manifest generation at each directory level
-project = Project(id="syn123456").sync_from_synapse(
- path="/path/to/download",
- generate_manifest="all"
-)
-
-# Or generate a single manifest at the root level only
-project = Project(id="syn123456").sync_from_synapse(
- path="/path/to/download",
- generate_manifest="root"
-)
-```
-
-### Manifest Generation Options
-
-The `generate_manifest` parameter accepts three values:
-
-| Value | Description |
-|-------|-------------|
-| `"suppress"` | (Default) Do not create any manifest files |
-| `"root"` | Create a single manifest at the root download path |
-| `"all"` | Create a manifest in each directory level |
-
-### Generating Manifest Separately
-
-You can also generate a manifest after syncing:
-
-```python
-from synapseclient.models import Project
-import synapseclient
-
-synapseclient.login()
-
-# First sync without manifest
-project = Project(id="syn123456").sync_from_synapse(
- path="/path/to/download"
-)
-
-# Then generate manifest separately
-manifest_path = project.generate_manifest(
- path="/path/to/download",
- manifest_scope="root"
-)
-print(f"Manifest created at: {manifest_path}")
-```
-
-## Manifest File Format
-
-The generated manifest file (`SYNAPSE_METADATA_MANIFEST.tsv`) contains the following columns:
-
-| Column | Description |
-|--------|-------------|
-| `path` | Local file path |
-| `parent` | Synapse ID of the parent container |
-| `name` | File name in Synapse |
-| `id` | Synapse file ID |
-| `synapseStore` | Whether the file is stored in Synapse |
-| `contentType` | MIME type of the file |
-| `used` | Provenance - entities used to create this file |
-| `executed` | Provenance - code/scripts executed |
-| `activityName` | Name of the provenance activity |
-| `activityDescription` | Description of the provenance activity |
-| *custom columns* | Any annotations on the files |
-
-### Example Manifest
-
-```tsv
-path parent name id synapseStore contentType used executed activityName activityDescription study dataType
-/data/file1.csv syn123 file1.csv syn456 True text/csv Data Processing Study1 RNA-seq
-/data/file2.csv syn123 file2.csv syn789 True text/csv syn456 Analysis Processed from file1 Study1 RNA-seq
-```
-
-## Uploading Files from a Manifest
-
-You can upload files to Synapse using a manifest file:
-
-```python
-from synapseclient.models import Project
-import synapseclient
-
-synapseclient.login()
-
-# Upload files from a manifest
-files = Project.from_manifest(
- manifest_path="/path/to/manifest.tsv",
- parent_id="syn123456"
-)
-
-for file in files:
- print(f"Uploaded: {file.name} ({file.id})")
-```
-
-### Dry Run Validation
-
-Before uploading, you can validate the manifest:
-
-```python
-from synapseclient.models import Project
-
-# Validate without uploading
-is_valid, errors = Project.validate_manifest(
- manifest_path="/path/to/manifest.tsv"
-)
-
-if is_valid:
- print("Manifest is valid, ready for upload")
-else:
- for error in errors:
- print(f"Error: {error}")
-```
-
-Or use the `dry_run` option to validate the manifest and see what would be uploaded without making changes:
-
-```python
-# Dry run - validates and returns what would be uploaded, but doesn't upload
-files = Project.from_manifest(
- manifest_path="/path/to/manifest.tsv",
- parent_id="syn123456",
- dry_run=True # Validate only, no actual upload
-)
-print(f"Would upload {len(files)} files")
-```
-
-The `dry_run` parameter is useful for:
-
-- Validating manifest format before committing to an upload
-- Testing your manifest configuration
-- Previewing which files will be affected
-
-## Working with Annotations
-
-Annotations in the manifest are automatically handled:
-
-### On Download
-
-When generating a manifest, all file annotations are included as additional columns:
-
-```python
-project = Project(id="syn123456").sync_from_synapse(
- path="/path/to/download",
- generate_manifest="root"
-)
-# Annotations appear as columns in the manifest
-```
-
-### On Upload
-
-Any columns in the manifest that aren't standard fields become annotations:
-
-```tsv
-path parent study dataType specimenType
-/data/file1.csv syn123 Study1 RNA-seq tissue
-```
-
-```python
-files = Project.from_manifest(
- manifest_path="/path/to/manifest.tsv",
- parent_id="syn123456",
- merge_existing_annotations=True # Merge with existing annotations
-)
-```
-
-## Working with Provenance
-
-### On Download
-
-Provenance information is captured in the `used`, `executed`, `activityName`, and `activityDescription` columns:
-
-```python
-project = Project(id="syn123456").sync_from_synapse(
- path="/path/to/download",
- include_activity=True, # Include provenance
- generate_manifest="root"
-)
-```
-
-### On Upload
-
-You can specify provenance in the manifest:
-
-```tsv
-path parent used executed activityName activityDescription
-/data/output.csv syn123 syn456;syn789 https://github.com/repo/script.py Analysis Generated from input files
-```
-
-- Multiple references are separated by semicolons (`;`)
-- References can be Synapse IDs, URLs, or local file paths
-
-## Synapse Download List Integration
-
-The manifest functionality integrates with Synapse's Download List feature. You can generate a manifest directly from your Synapse download list, which is useful for exporting metadata about files you've queued for download in the Synapse web interface.
-
-### Generating Manifest from Download List
-
-```python
-from synapseclient.models import Project
-import synapseclient
-
-synapseclient.login()
-
-# Generate a manifest from your Synapse download list
-manifest_path = Project.generate_download_list_manifest(
- download_path="/path/to/save/manifest"
-)
-print(f"Manifest downloaded to: {manifest_path}")
-```
-
-### Custom CSV Formatting
-
-You can customize the manifest format:
-
-```python
-from synapseclient.models import Project
-import synapseclient
-
-synapseclient.login()
-
-# Generate a tab-separated manifest
-manifest_path = Project.generate_download_list_manifest(
- download_path="/path/to/save/manifest",
- csv_separator="\t", # Tab-separated
- include_header=True
-)
-```
-
-### Using DownloadListManifestRequest Directly
-
-For more control over the manifest generation process, use the `DownloadListManifestRequest` class directly:
-
-```python
-from synapseclient.models import DownloadListManifestRequest, CsvTableDescriptor
-import synapseclient
-
-synapseclient.login()
-
-# Create a request with custom CSV formatting
-request = DownloadListManifestRequest(
- csv_table_descriptor=CsvTableDescriptor(
- separator="\t",
- quote_character='"',
- is_first_line_header=True
- )
-)
-
-# Send the job and wait for completion
-request.send_job_and_wait()
-
-# Download the generated manifest
-manifest_path = request.download_manifest(download_path="/path/to/download")
-print(f"Manifest file handle: {request.result_file_handle_id}")
-```
-
-## Best Practices
-
-1. **Use `generate_manifest="root"` for simple cases** - Creates a single manifest at the root level, easier to manage.
-
-2. **Use `generate_manifest="all"` for complex hierarchies** - Creates manifests at each directory level, useful for large projects with many subdirectories.
-
-3. **Validate manifests before upload** - Use `validate_manifest()` or `dry_run=True` to catch errors early.
-
-4. **Include provenance information** - Set `include_activity=True` when syncing to capture provenance in the manifest.
-
-5. **Backup your manifest** - The manifest is a valuable record of your data and its metadata.
-
-## Async API
-
-All manifest operations are available as async methods:
-
-```python
-import asyncio
-from synapseclient.models import Project
-import synapseclient
-
-async def main():
- synapseclient.login()
-
- # Async sync with manifest
- project = Project(id="syn123456")
- await project.sync_from_synapse_async(
- path="/path/to/download",
- generate_manifest="root"
- )
-
- # Async manifest generation
- manifest_path = await project.generate_manifest_async(
- path="/path/to/download",
- manifest_scope="root"
- )
-
- # Async upload from manifest
- files = await Project.from_manifest_async(
- manifest_path="/path/to/manifest.tsv",
- parent_id="syn123456"
- )
-
-asyncio.run(main())
-```
-
-## See Also
-
-- [Download Data in Bulk](download_data_in_bulk.md)
-- [Upload Data in Bulk](upload_data_in_bulk.md)
-- [Manifest TSV Format](../../explanations/manifest_tsv.md)
diff --git a/synapseclient/models/download_list.py b/synapseclient/models/download_list.py
deleted file mode 100644
index e1c0eb866..000000000
--- a/synapseclient/models/download_list.py
+++ /dev/null
@@ -1,224 +0,0 @@
-"""Models for interacting with Synapse's Download List functionality.
-
-This module provides classes for generating manifest files from a user's download list
-using the Synapse Asynchronous Job service.
-
-See: https://rest-docs.synapse.org/rest/POST/download/list/manifest/async/start.html
-"""
-
-from dataclasses import dataclass, field
-from typing import Any, Dict, Optional
-
-from typing_extensions import Self
-
-from synapseclient import Synapse
-from synapseclient.core.async_utils import async_to_sync, otel_trace_method
-from synapseclient.core.constants.concrete_types import DOWNLOAD_LIST_MANIFEST_REQUEST
-from synapseclient.core.download import download_by_file_handle
-from synapseclient.core.utils import delete_none_keys
-from synapseclient.models.mixins.asynchronous_job import AsynchronousCommunicator
-from synapseclient.models.protocols.download_list_protocol import (
- DownloadListManifestRequestSynchronousProtocol,
-)
-from synapseclient.models.table_components import CsvTableDescriptor
-
-
-@dataclass
-@async_to_sync
-class DownloadListManifestRequest(
- DownloadListManifestRequestSynchronousProtocol, AsynchronousCommunicator
-):
- """
- A request to generate a manifest file (CSV) of the current user's download list.
-
- This class uses the Synapse Asynchronous Job service to generate a manifest file
- containing metadata about files in the user's download list. The manifest can be
- used to download files or for record-keeping purposes.
-
- See: https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/download/DownloadListManifestRequest.html
-
- Attributes:
- csv_table_descriptor: Optional CSV formatting options for the manifest.
- result_file_handle_id: The file handle ID of the generated manifest (populated after completion).
-
- Example: Generate a manifest from download list
- Generate a CSV manifest from your download list:
-
- from synapseclient.models import DownloadListManifestRequest
- import synapseclient
-
- synapseclient.login()
-
- # Create and send the request
- request = DownloadListManifestRequest()
- request.send_job_and_wait()
-
- print(f"Manifest file handle: {request.result_file_handle_id}")
-
- Example: Generate manifest with custom CSV formatting
- Use custom separator and quote characters:
-
- from synapseclient.models import DownloadListManifestRequest, CsvTableDescriptor
- import synapseclient
-
- synapseclient.login()
-
- request = DownloadListManifestRequest(
- csv_table_descriptor=CsvTableDescriptor(
- separator="\t", # Tab-separated
- is_first_line_header=True
- )
- )
- request.send_job_and_wait()
- """
-
- concrete_type: str = field(
- default=DOWNLOAD_LIST_MANIFEST_REQUEST, repr=False, compare=False
- )
- """The concrete type of this request."""
-
- csv_table_descriptor: Optional[CsvTableDescriptor] = None
- """Optional CSV formatting options for the manifest file."""
-
- result_file_handle_id: Optional[str] = None
- """The file handle ID of the generated manifest file. Populated after the job completes."""
-
- def to_synapse_request(self) -> Dict[str, Any]:
- """
- Convert this request to the format expected by the Synapse REST API.
-
- Returns:
- A dictionary containing the request body for the Synapse API.
- """
- request = {
- "concreteType": self.concrete_type,
- }
- if self.csv_table_descriptor:
- request[
- "csvTableDescriptor"
- ] = self.csv_table_descriptor.to_synapse_request()
- delete_none_keys(request)
- return request
-
- def fill_from_dict(self, synapse_response: Dict[str, Any]) -> Self:
- """
- Populate this object from a Synapse REST API response.
-
- Arguments:
- synapse_response: The response from the REST API.
-
- Returns:
- This object with fields populated from the response.
- """
- self.result_file_handle_id = synapse_response.get("resultFileHandleId", None)
- return self
-
- @otel_trace_method(
- method_to_trace_name=lambda self, **kwargs: "DownloadListManifestRequest_send_job_and_wait"
- )
- async def send_job_and_wait_async(
- self,
- post_exchange_args: Optional[Dict[str, Any]] = None,
- timeout: int = 120,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> Self:
- """Send the job to the Asynchronous Job service and wait for it to complete.
-
- This method sends the manifest generation request to Synapse and waits
- for the job to complete. After completion, the `result_file_handle_id`
- attribute will be populated.
-
- Arguments:
- post_exchange_args: Additional arguments to pass to the request.
- timeout: The number of seconds to wait for the job to complete or progress
- before raising a SynapseTimeoutError. Defaults to 120.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- This instance with `result_file_handle_id` populated.
-
- Raises:
- SynapseTimeoutError: If the job does not complete within the timeout.
- SynapseError: If the job fails.
-
- Example: Generate a manifest
- Generate a manifest from the download list:
-
- from synapseclient.models import DownloadListManifestRequest
- import synapseclient
-
- synapseclient.login()
-
- request = DownloadListManifestRequest()
- request.send_job_and_wait()
- print(f"Manifest file handle: {request.result_file_handle_id}")
- """
- return await super().send_job_and_wait_async(
- post_exchange_args=post_exchange_args,
- timeout=timeout,
- synapse_client=synapse_client,
- )
-
- @otel_trace_method(
- method_to_trace_name=lambda self, **kwargs: "DownloadListManifestRequest_download_manifest"
- )
- async def download_manifest_async(
- self,
- download_path: str,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> str:
- """
- Download the generated manifest file to a local path.
-
- This method should be called after `send_job_and_wait()` has completed
- successfully and `result_file_handle_id` is populated.
-
- Arguments:
- download_path: The local directory path where the manifest will be saved.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- The full path to the downloaded manifest file.
-
- Raises:
- ValueError: If the manifest has not been generated yet (no result_file_handle_id).
-
- Example: Download the manifest after generation
- Generate and download a manifest:
-
- from synapseclient.models import DownloadListManifestRequest
- import synapseclient
-
- synapseclient.login()
-
- request = DownloadListManifestRequest()
- request.send_job_and_wait()
-
- manifest_path = request.download_manifest(download_path="/path/to/download")
- print(f"Manifest downloaded to: {manifest_path}")
- """
- if not self.result_file_handle_id:
- raise ValueError(
- "Manifest has not been generated yet. "
- "Call send_job_and_wait() before downloading."
- )
-
- # Download the file handle using the download module
- # For download list manifests, the synapse_id parameter is set to the file handle ID
- # because these manifests are not associated with a specific entity. The download
- # service handles this case by using the file handle directly.
- downloaded_path = await download_by_file_handle(
- file_handle_id=self.result_file_handle_id,
- synapse_id=self.result_file_handle_id,
- entity_type="FileEntity",
- destination=download_path,
- synapse_client=synapse_client,
- )
-
- return downloaded_path
diff --git a/synapseclient/models/mixins/manifest.py b/synapseclient/models/mixins/manifest.py
deleted file mode 100644
index 785a9c7b9..000000000
--- a/synapseclient/models/mixins/manifest.py
+++ /dev/null
@@ -1,950 +0,0 @@
-"""Mixin for objects that can generate and read manifest TSV files."""
-
-import csv
-import datetime
-import io
-import os
-import re
-import sys
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
-
-from synapseclient import Synapse
-from synapseclient.core import utils
-from synapseclient.core.async_utils import async_to_sync, otel_trace_method
-from synapseclient.core.utils import is_synapse_id_str, is_url, topolgical_sort
-from synapseclient.models.protocols.manifest_protocol import (
- ManifestGeneratableSynchronousProtocol,
-)
-
-if TYPE_CHECKING:
- from synapseclient.models import File
-
-# When new fields are added to the manifest they will also need to be added to
-# file.py#_determine_fields_to_ignore_in_merge
-REQUIRED_FIELDS = ["path", "parent"]
-FILE_CONSTRUCTOR_FIELDS = ["name", "id", "synapseStore", "contentType"]
-STORE_FUNCTION_FIELDS = ["activityName", "activityDescription", "forceVersion"]
-PROVENANCE_FIELDS = ["used", "executed"]
-MANIFEST_FILENAME = "SYNAPSE_METADATA_MANIFEST.tsv"
-DEFAULT_GENERATED_MANIFEST_KEYS = [
- "path",
- "parent",
- "name",
- "id",
- "synapseStore",
- "contentType",
- "used",
- "executed",
- "activityName",
- "activityDescription",
-]
-ARRAY_BRACKET_PATTERN = re.compile(r"^\[.*\]$")
-SINGLE_OPEN_BRACKET_PATTERN = re.compile(r"^\[")
-SINGLE_CLOSING_BRACKET_PATTERN = re.compile(r"\]$")
-# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes
-COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")
-
-
-def _manifest_filename(path: str) -> str:
- """Get the full path to the manifest file.
-
- Arguments:
- path: The directory where the manifest file will be created.
-
- Returns:
- The full path to the manifest file.
- """
- return os.path.join(path, MANIFEST_FILENAME)
-
-
-def _convert_manifest_data_items_to_string_list(
- items: List[Union[str, datetime.datetime, bool, int, float]],
-) -> str:
- """
- Handle converting an individual key that contains a possible list of data into a
- list of strings or objects that can be written to the manifest file.
-
- This has specific logic around how to handle datetime fields.
-
- When working with datetime fields we are printing the ISO 8601 UTC representation of
- the datetime.
-
- When working with non strings we are printing the non-quoted version of the object.
-
- Example: Examples
- Several examples of how this function works.
-
- >>> _convert_manifest_data_items_to_string_list(["a", "b", "c"])
- '[a,b,c]'
- >>> _convert_manifest_data_items_to_string_list(["string,with,commas", "string without commas"])
- '["string,with,commas",string without commas]'
- >>> _convert_manifest_data_items_to_string_list(["string,with,commas"])
- 'string,with,commas'
- >>> _convert_manifest_data_items_to_string_list(
- [datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)])
- '2020-01-01T00:00:00Z'
- >>> _convert_manifest_data_items_to_string_list([True])
- 'True'
- >>> _convert_manifest_data_items_to_string_list([1])
- '1'
- >>> _convert_manifest_data_items_to_string_list([1.0])
- '1.0'
- >>> _convert_manifest_data_items_to_string_list(
- [datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc),
- datetime.datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)])
- '[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]'
-
-
- Args:
- items: The list of items to convert.
-
- Returns:
- The list of items converted to strings.
- """
- items_to_write = []
- for item in items:
- if isinstance(item, datetime.datetime):
- items_to_write.append(
- utils.datetime_to_iso(dt=item, include_milliseconds_if_zero=False)
- )
- else:
- # If a string based annotation has a comma in it
- # this will wrap the string in quotes so it won't be parsed
- # as multiple values. For example this is an annotation with 2 values:
- # [my first annotation, "my, second, annotation"]
- # This is an annotation with 4 value:
- # [my first annotation, my, second, annotation]
- if isinstance(item, str):
- if len(items) > 1 and "," in item:
- items_to_write.append(f'"{item}"')
- else:
- items_to_write.append(item)
- else:
- items_to_write.append(repr(item))
-
- if len(items_to_write) > 1:
- return f'[{",".join(items_to_write)}]'
- elif len(items_to_write) == 1:
- return items_to_write[0]
- else:
- return ""
-
-
-def _convert_manifest_data_row_to_dict(row: dict, keys: List[str]) -> dict:
- """
- Convert a row of data to a dict that can be written to a manifest file.
-
- Args:
- row: The row of data to convert.
- keys: The keys of the manifest. Used to select the rows of data.
-
- Returns:
- The dict representation of the row.
- """
- data_to_write = {}
- for key in keys:
- data_for_key = row.get(key, "")
- if isinstance(data_for_key, list):
- items_to_write = _convert_manifest_data_items_to_string_list(data_for_key)
- data_to_write[key] = items_to_write
- else:
- data_to_write[key] = data_for_key
- return data_to_write
-
-
-def _write_manifest_data(filename: str, keys: List[str], data: List[dict]) -> None:
- """
-    Write a set of keys and a list of data rows to a manifest file. The data is
-    written out as a tab-separated file.
-
-    The content written to the TSV file is not quoted with any characters because
-    the syncToSynapse function does not require strings to be quoted. When quote
-    characters were included, extra double quotes were added to the strings written
-    to the manifest file. This did not cause errors, but it changed the content of
-    the manifest file when no changes were required.
-
- Args:
- filename: The name of the file to write to.
- keys: The keys of the manifest.
- data: The data to write to the manifest. This should be a list of dicts where
- each dict represents a row of data.
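-
-    Example: Writing a small manifest
-        A minimal illustration, using hypothetical paths, of the TSV this produces:
-
-            keys = ["path", "parent"]
-            data = [{"path": "/tmp/a.txt", "parent": "syn123"}]
-            _write_manifest_data("manifest.tsv", keys, data)
-            # manifest.tsv now contains two tab-separated lines:
-            # path\tparent
-            # /tmp/a.txt\tsyn123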
- """
- with io.open(filename, "w", encoding="utf8") if filename else sys.stdout as fp:
- csv_writer = csv.DictWriter(
- fp,
- keys,
- restval="",
- extrasaction="ignore",
- delimiter="\t",
- quotechar=None,
- quoting=csv.QUOTE_NONE,
- )
- csv_writer.writeheader()
- for row in data:
- csv_writer.writerow(rowdict=_convert_manifest_data_row_to_dict(row, keys))
-
-
-def _extract_entity_metadata_for_file(
- all_files: List["File"],
-) -> Tuple[List[str], List[Dict[str, str]]]:
- """
- Extracts metadata from the list of File Entities and returns them in a form
- usable by csv.DictWriter
-
- Arguments:
- all_files: an iterable that provides File entities
-
- Returns:
-        keys: a list of column headers
- data: a list of dicts containing data from each row
- """
- keys = list(DEFAULT_GENERATED_MANIFEST_KEYS)
- annotation_keys = set()
- data = []
- for entity in all_files:
- row = {
- "parent": entity.parent_id,
- "path": entity.path,
- "name": entity.name,
- "id": entity.id,
- "synapseStore": entity.synapse_store,
- "contentType": entity.content_type,
- }
-
- if entity.annotations:
- annotation_keys.update(set(entity.annotations.keys()))
- row.update(
- {
- key: (val if len(val) > 0 else "")
- for key, val in entity.annotations.items()
- }
- )
-
- row_provenance = _get_entity_provenance_dict_for_file(entity=entity)
- row.update(row_provenance)
-
- data.append(row)
- keys.extend(annotation_keys)
- return keys, data
-
-
-def _get_entity_provenance_dict_for_file(entity: "File") -> Dict[str, str]:
- """
- Arguments:
- entity: File entity object
-
- Returns:
- dict: a dict with a subset of the provenance metadata for the entity.
- An empty dict is returned if the metadata does not have a provenance record.
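-
-    Example: Shape of the returned dict
-        Illustrative output for a file whose activity "Analysis" used syn789 and
-        executed nothing:
-
-            {"used": "syn789", "executed": "", "activityName": "Analysis",
-             "activityDescription": ""}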
- """
- if not entity.activity:
- return {}
-
- used_activities = []
- for used_activity in entity.activity.used:
- used_activities.append(used_activity.format_for_manifest())
-
- executed_activities = []
- for executed_activity in entity.activity.executed:
- executed_activities.append(executed_activity.format_for_manifest())
-
- return {
- "used": ";".join(used_activities),
- "executed": ";".join(executed_activities),
- "activityName": entity.activity.name or "",
- "activityDescription": entity.activity.description or "",
- }
-
-
-def _validate_manifest_required_fields(
- manifest_path: str,
-) -> Tuple[bool, List[str]]:
- """
- Validate that a manifest file exists and has the required fields.
-
- Args:
- manifest_path: Path to the manifest file.
-
- Returns:
- Tuple of (is_valid, list_of_error_messages).
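-
-    Example: Validating a manifest
-        A minimal sketch, assuming a hypothetical manifest path:
-
-            is_valid, errors = _validate_manifest_required_fields("manifest.tsv")
-            if not is_valid:
-                for error in errors:
-                    print(error)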
- """
- errors = []
-
- if not os.path.isfile(manifest_path):
- errors.append(f"Manifest file not found: {manifest_path}")
- return (False, errors)
-
- try:
- with io.open(manifest_path, "r", encoding="utf8") as fp:
- reader = csv.DictReader(fp, delimiter="\t")
- headers = reader.fieldnames or []
-
- # Check for required fields
- for field in REQUIRED_FIELDS:
- if field not in headers:
- errors.append(f"Missing required field: {field}")
-
- # Validate each row
- row_num = 1
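-            # The header occupies row 1, so the first data row is reported as row 2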
- for row in reader:
- row_num += 1
- path = row.get("path", "")
- parent = row.get("parent", "")
-
- if not path:
- errors.append(f"Row {row_num}: 'path' is empty")
-
- if not parent:
- errors.append(f"Row {row_num}: 'parent' is empty")
- elif not is_synapse_id_str(parent) and not is_url(parent):
- errors.append(
- f"Row {row_num}: 'parent' is not a valid Synapse ID: {parent}"
- )
-
- # Check if path exists (skip URLs)
- if path and not is_url(path):
- expanded_path = os.path.abspath(
- os.path.expandvars(os.path.expanduser(path))
- )
- if not os.path.isfile(expanded_path):
- errors.append(f"Row {row_num}: File not found: {path}")
-
- except Exception as e:
- errors.append(f"Error reading manifest file: {str(e)}")
-
- return (len(errors) == 0, errors)
-
-
-@async_to_sync
-class ManifestGeneratable(ManifestGeneratableSynchronousProtocol):
- """
- Mixin for objects that can generate and read manifest TSV files.
-
- In order to use this mixin, the class must have the following attributes:
-
- - `id`
- - `name`
- - `_synced_from_synapse`
-
- The class must also inherit from `StorableContainer` mixin which provides:
-
- - `flatten_file_list()`
- - `map_directory_to_all_contained_files()`
- """
-
- id: Optional[str] = None
- name: Optional[str] = None
- _synced_from_synapse: bool = False
-
- @otel_trace_method(
- method_to_trace_name=lambda self, **kwargs: f"{self.__class__.__name__}_generate_manifest: {self.id}"
- )
- async def generate_manifest_async(
- self,
- path: str,
- manifest_scope: str = "all",
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> Optional[str]:
- """
- Generate a manifest TSV file for all files in this container.
-
- This method should be called after `sync_from_synapse()` to generate
- a manifest of all downloaded files with their metadata.
-
- Arguments:
- path: The directory where the manifest file(s) will be written.
- manifest_scope: Controls manifest file generation:
-
- - "all": Create a manifest in each directory level
- - "root": Create a single manifest at the root path only
- - "suppress": Do not create any manifest files
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- The path to the root manifest file if created, or None if suppressed.
-
- Raises:
- ValueError: If the container has not been synced from Synapse.
- ValueError: If manifest_scope is not one of 'all', 'root', 'suppress'.
-
- Example: Generate manifest after sync
- Generate a manifest file after syncing from Synapse:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- project = Project(id="syn123").sync_from_synapse(
- path="/path/to/download"
- )
- manifest_path = project.generate_manifest(
- path="/path/to/download",
- manifest_scope="root"
- )
- print(f"Manifest created at: {manifest_path}")
- """
- if manifest_scope not in ("all", "root", "suppress"):
- raise ValueError(
- 'Value of manifest_scope should be one of ("all", "root", "suppress")'
- )
-
- if manifest_scope == "suppress":
- return None
-
- if not self._synced_from_synapse:
- raise ValueError(
- "Container has not been synced from Synapse. "
- "Call sync_from_synapse() before generating a manifest."
- )
-
- syn = Synapse.get_client(synapse_client=synapse_client)
-
- # Expand the path
- path = os.path.expanduser(path) if path else None
- if not path:
- raise ValueError("A path must be provided to generate a manifest.")
-
- # Get all files from this container
- all_files = self.flatten_file_list()
-
- if not all_files:
- syn.logger.info(
- f"[{self.id}:{self.name}]: No files found in container, "
- "skipping manifest generation."
- )
- return None
-
- root_manifest_path = None
-
- if manifest_scope == "root":
- # Generate a single manifest at the root
- keys, data = _extract_entity_metadata_for_file(all_files=all_files)
- manifest_path = _manifest_filename(path)
- _write_manifest_data(manifest_path, keys, data)
- root_manifest_path = manifest_path
- syn.logger.info(
- f"[{self.id}:{self.name}]: Created manifest at {manifest_path}"
- )
- elif manifest_scope == "all":
- # Generate a manifest at each directory level
- directory_map = self.map_directory_to_all_contained_files(root_path=path)
-
- for directory_path, files_in_directory in directory_map.items():
- if files_in_directory:
- keys, data = _extract_entity_metadata_for_file(
- all_files=files_in_directory
- )
- manifest_path = _manifest_filename(directory_path)
- _write_manifest_data(manifest_path, keys, data)
-
- # Track the root manifest path
- if directory_path == path:
- root_manifest_path = manifest_path
-
- syn.logger.info(
- f"[{self.id}:{self.name}]: Created manifest at {manifest_path}"
- )
-
- return root_manifest_path
-
- @otel_trace_method(
- method_to_trace_name=lambda self, **kwargs: f"{self.__class__.__name__}_get_manifest_data: {self.id}"
- )
- async def get_manifest_data_async(
- self,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> Tuple[List[str], List[Dict[str, str]]]:
- """
- Get manifest data for all files in this container.
-
- This method extracts metadata from all files that have been synced
- to this container. The data can be used to generate a manifest file
- or for other purposes.
-
- Arguments:
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- Tuple of (keys, data) where keys is a list of column headers
- and data is a list of dictionaries, one per file, containing
- the file metadata.
-
- Raises:
- ValueError: If the container has not been synced from Synapse.
-
- Example: Get manifest data
- Get manifest data for all files in a project:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- project = Project(id="syn123").sync_from_synapse(
- path="/path/to/download"
- )
- keys, data = project.get_manifest_data()
- for row in data:
- print(f"File: {row['name']} at {row['path']}")
- """
- if not self._synced_from_synapse:
- raise ValueError(
- "Container has not been synced from Synapse. "
- "Call sync_from_synapse() before getting manifest data."
- )
-
- all_files = self.flatten_file_list()
- return _extract_entity_metadata_for_file(all_files=all_files)
-
- @classmethod
- @otel_trace_method(
- method_to_trace_name=lambda cls, **kwargs: f"{cls.__name__}_from_manifest"
- )
- async def from_manifest_async(
- cls,
- manifest_path: str,
- parent_id: str,
- dry_run: bool = False,
- merge_existing_annotations: bool = True,
- associate_activity_to_new_version: bool = False,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> List["File"]:
- """
- Upload files to Synapse from a manifest TSV file.
-
- This method reads a manifest TSV file and uploads all files defined in it
- to Synapse. The manifest file must contain at minimum the 'path' and 'parent'
- columns.
-
- Arguments:
- manifest_path: Path to the manifest TSV file.
- parent_id: The Synapse ID of the parent container (Project or Folder)
- where files will be uploaded if not specified in the manifest.
- dry_run: If True, validate the manifest but do not upload.
- merge_existing_annotations: If True, merge annotations with existing
- annotations on the file. If False, replace existing annotations.
- associate_activity_to_new_version: If True, copy the activity
- (provenance) from the previous version to the new version.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- List of File objects that were uploaded.
-
- Raises:
- ValueError: If the manifest file does not exist.
- ValueError: If the manifest file is missing required fields.
- IOError: If a file path in the manifest does not exist.
-
- Example: Upload files from a manifest
- Upload files from a manifest TSV file:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- files = Project.from_manifest(
- manifest_path="/path/to/manifest.tsv",
- parent_id="syn123"
- )
- for file in files:
- print(f"Uploaded: {file.name} ({file.id})")
-
- Example: Dry run validation
- Validate a manifest without uploading:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- files = Project.from_manifest(
- manifest_path="/path/to/manifest.tsv",
- parent_id="syn123",
- dry_run=True
- )
- print("Manifest is valid, ready for upload")
- """
- from synapseclient.models import Activity, File
-
- syn = Synapse.get_client(synapse_client=synapse_client)
-
- # Validate the manifest
- is_valid, errors = _validate_manifest_required_fields(manifest_path)
- if not is_valid:
- raise ValueError(
- "Invalid manifest file:\n" + "\n".join(f" - {e}" for e in errors)
- )
-
- # Read the manifest
- rows = []
- with io.open(manifest_path, "r", encoding="utf8") as fp:
- reader = csv.DictReader(fp, delimiter="\t")
- for row in reader:
- rows.append(row)
-
- if dry_run:
- syn.logger.info(
- f"Dry run: {len(rows)} files would be uploaded from manifest"
- )
- return []
-
- # Build dependency graph for provenance ordering
- path_to_row = {}
- upload_order = {}
-
- for row in rows:
- path = row.get("path", "")
- if path and not is_url(path):
- path = os.path.abspath(os.path.expandvars(os.path.expanduser(path)))
- path_to_row[path] = row
-
- # Collect provenance references
- all_refs = []
- used = row.get("used", "")
- if used and used.strip():
- for item in used.split(";"):
- item = item.strip()
- if item:
- if os.path.isfile(
- os.path.abspath(
- os.path.expandvars(os.path.expanduser(item))
- )
- ):
- all_refs.append(
- os.path.abspath(
- os.path.expandvars(os.path.expanduser(item))
- )
- )
-
- executed = row.get("executed", "")
- if executed and executed.strip():
- for item in executed.split(";"):
- item = item.strip()
- if item:
- if os.path.isfile(
- os.path.abspath(
- os.path.expandvars(os.path.expanduser(item))
- )
- ):
- all_refs.append(
- os.path.abspath(
- os.path.expandvars(os.path.expanduser(item))
- )
- )
-
- upload_order[path] = all_refs
-
- # Topologically sort based on provenance dependencies
- sorted_paths = topolgical_sort(upload_order)
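-        # The sort returns (path, dependencies) pairs; keep only the ordered paths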
- sorted_paths = [p[0] for p in sorted_paths]
-
- # Track uploaded files for provenance resolution
- path_to_synapse_id: Dict[str, str] = {}
- uploaded_files: List["File"] = []
-
- for path in sorted_paths:
- row = path_to_row[path]
-
- # Get parent - use manifest value or fall back to provided parent_id
- file_parent = row.get("parent", "").strip() or parent_id
-
- # Build the File object
- file = File(
- path=path,
- parent_id=file_parent,
- name=row.get("name", "").strip() or None,
- id=row.get("id", "").strip() or None,
- synapse_store=(
- row.get("synapseStore", "").strip().lower() != "false"
- if row.get("synapseStore", "").strip()
- else True
- ),
- content_type=row.get("contentType", "").strip() or None,
- merge_existing_annotations=merge_existing_annotations,
- associate_activity_to_new_version=associate_activity_to_new_version,
- )
-
- # Build annotations from extra columns
- annotations = {}
- skip_keys = set(
- REQUIRED_FIELDS
- + FILE_CONSTRUCTOR_FIELDS
- + STORE_FUNCTION_FIELDS
- + PROVENANCE_FIELDS
- )
- for key, value in row.items():
- if key not in skip_keys and value and value.strip():
- annotations[key] = _parse_manifest_value(value.strip())
- if annotations:
- file.annotations = annotations
-
- # Build provenance/activity
- used_items = []
- executed_items = []
-
- used_str = row.get("used", "")
- if used_str and used_str.strip():
- for item in used_str.split(";"):
- item = item.strip()
- if item:
- used_items.append(
- _resolve_provenance_item(item, path_to_synapse_id)
- )
-
- executed_str = row.get("executed", "")
- if executed_str and executed_str.strip():
- for item in executed_str.split(";"):
- item = item.strip()
- if item:
- executed_items.append(
- _resolve_provenance_item(item, path_to_synapse_id)
- )
-
- if used_items or executed_items:
- activity = Activity(
- name=row.get("activityName", "").strip() or None,
- description=row.get("activityDescription", "").strip() or None,
- used=used_items,
- executed=executed_items,
- )
- file.activity = activity
-
- # Upload the file
- file = await file.store_async(synapse_client=syn)
-
- # Track for provenance resolution
- path_to_synapse_id[path] = file.id
- uploaded_files.append(file)
-
- syn.logger.info(f"Uploaded: {file.name} ({file.id})")
-
- return uploaded_files
-
- @staticmethod
- @otel_trace_method(method_to_trace_name=lambda **kwargs: "validate_manifest")
- async def validate_manifest_async(
- manifest_path: str,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> Tuple[bool, List[str]]:
- """
- Validate a manifest TSV file without uploading.
-
- This method validates a manifest file to ensure it is properly formatted
- and all paths exist.
-
- Arguments:
- manifest_path: Path to the manifest TSV file.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- Tuple of (is_valid, list_of_error_messages). If the manifest is valid,
- is_valid will be True and the list will be empty.
-
- Example: Validate a manifest file
- Validate a manifest file before uploading:
-
- from synapseclient.models import Project
-
- is_valid, errors = Project.validate_manifest(
- manifest_path="/path/to/manifest.tsv"
- )
- if is_valid:
- print("Manifest is valid")
- else:
- for error in errors:
- print(f"Error: {error}")
- """
- return _validate_manifest_required_fields(manifest_path)
-
- @staticmethod
- async def generate_download_list_manifest_async(
- download_path: str,
- csv_separator: str = ",",
- include_header: bool = True,
- timeout: int = 120,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> str:
- """
- Generate a manifest file from the current user's download list using the
- Synapse REST API.
-
- This method creates a CSV manifest containing metadata about all files in
- the user's download list. The manifest is generated server-side by Synapse
- and then downloaded to the specified path.
-
- This is interoperable with the Synapse download list feature and provides
- a way to export the download list as a manifest file that can be used for
- bulk operations.
-
- Arguments:
- download_path: The local directory path where the manifest will be saved.
- csv_separator: The delimiter character for the CSV file.
- Defaults to "," for comma-separated values. Use "\t" for tab-separated.
- include_header: Whether to include column headers in the first row.
- Defaults to True.
- timeout: The number of seconds to wait for the job to complete.
- Defaults to 120 seconds.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- The full path to the downloaded manifest file.
-
- Example: Generate manifest from download list
- Generate a manifest from your Synapse download list:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- # Generate manifest from download list
- manifest_path = Project.generate_download_list_manifest(
- download_path="/path/to/download"
- )
- print(f"Manifest downloaded to: {manifest_path}")
-
- Example: Generate tab-separated manifest
- Generate a TSV manifest from your download list:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- manifest_path = Project.generate_download_list_manifest(
- download_path="/path/to/download",
- csv_separator="\t"
- )
-
- See Also:
- - `DownloadListManifestRequest`: The underlying request class for more
- fine-grained control over the manifest generation process.
- """
- from synapseclient.models.download_list import DownloadListManifestRequest
- from synapseclient.models.table_components import CsvTableDescriptor
-
- # Create the request with CSV formatting options
- request = DownloadListManifestRequest(
- csv_table_descriptor=CsvTableDescriptor(
- separator=csv_separator,
- is_first_line_header=include_header,
- )
- )
-
- # Send the job and wait for completion
- await request.send_job_and_wait_async(
- timeout=timeout,
- synapse_client=synapse_client,
- )
-
- # Download the manifest
- manifest_file_path = await request.download_manifest_async(
- download_path=download_path,
- synapse_client=synapse_client,
- )
-
- return manifest_file_path
-
-
-def _resolve_provenance_item(
- item: str,
- path_to_synapse_id: Dict[str, str],
-) -> Any:
- """
- Resolve a provenance item to a UsedEntity or UsedURL.
-
- Args:
- item: The provenance item string (could be a path, Synapse ID, or URL).
- path_to_synapse_id: Mapping of local file paths to their Synapse IDs.
-
- Returns:
- UsedEntity or UsedURL object.
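-
-    Example: Resolving different item types
-        Illustrative mappings, assuming POSIX paths and that "/tmp/a.txt" was
-        uploaded earlier from the same manifest:
-
-            _resolve_provenance_item("/tmp/a.txt", {"/tmp/a.txt": "syn1"})
-            # -> UsedEntity(target_id="syn1")
-            _resolve_provenance_item("https://example.org", {})
-            # -> UsedURL(url="https://example.org")
-            _resolve_provenance_item("syn42", {})
-            # -> UsedEntity(target_id="syn42")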
- """
- from synapseclient.models import UsedEntity, UsedURL
-
- # Check if it's a local file path that was uploaded
- expanded_path = os.path.abspath(os.path.expandvars(os.path.expanduser(item)))
- if expanded_path in path_to_synapse_id:
- return UsedEntity(target_id=path_to_synapse_id[expanded_path])
-
- # Check if it's a URL
- if is_url(item):
- return UsedURL(url=item)
-
- # Check if it's a Synapse ID
- if is_synapse_id_str(item):
- return UsedEntity(target_id=item)
-
-    # Fall back to treating the item as a Synapse ID reference
-    return UsedEntity(target_id=item)
-
-
-def _parse_manifest_value(value: str) -> Any:
- """
- Parse a manifest cell value into an appropriate Python type.
-
- Handles:
- - List syntax: [a,b,c] -> ['a', 'b', 'c']
- - Boolean strings: 'true', 'false' -> True, False
- - Numeric strings: '123' -> 123, '1.5' -> 1.5
- - Everything else: returned as string
-
- Args:
- value: The string value from the manifest.
-
- Returns:
- The parsed value.
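-
-    Example: Parsing manifest values
-        Several examples of how this function works:
-
-        >>> _parse_manifest_value("[a,b,c]")
-        ['a', 'b', 'c']
-        >>> _parse_manifest_value("true")
-        True
-        >>> _parse_manifest_value("123")
-        123
-        >>> _parse_manifest_value("1.5")
-        1.5
-        >>> _parse_manifest_value("hello")
-        'hello'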
- """
- # Check for list syntax
- if ARRAY_BRACKET_PATTERN.match(value):
- # Remove brackets
- inner = value[1:-1]
- # Split on commas outside quotes
- items = COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN.split(inner)
- result = []
- for item in items:
- item = item.strip()
- # Remove surrounding quotes if present
- if item.startswith('"') and item.endswith('"'):
- item = item[1:-1]
- result.append(item)
- return result
-
- # Check for boolean
- if value.lower() == "true":
- return True
- if value.lower() == "false":
- return False
-
- # Check for integer
- try:
- return int(value)
- except ValueError:
- pass
-
- # Check for float
- try:
- return float(value)
- except ValueError:
- pass
-
- # Return as string
- return value
diff --git a/synapseclient/models/protocols/manifest_protocol.py b/synapseclient/models/protocols/manifest_protocol.py
deleted file mode 100644
index 1da447da0..000000000
--- a/synapseclient/models/protocols/manifest_protocol.py
+++ /dev/null
@@ -1,240 +0,0 @@
-"""Protocol for the specific methods of ManifestGeneratable mixin that have
-synchronous counterparts generated at runtime."""
-
-from typing import Dict, List, Optional, Protocol, Tuple
-
-from synapseclient import Synapse
-
-
-class ManifestGeneratableSynchronousProtocol(Protocol):
- """
-    The protocol for asynchronous methods that also have a synchronous
-    counterpart that may be called instead.
- """
-
- def generate_manifest(
- self,
- path: str,
- manifest_scope: str = "all",
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> Optional[str]:
- """Generate a manifest TSV file for all files in this container.
-
- This method should be called after `sync_from_synapse()` to generate
- a manifest of all downloaded files with their metadata.
-
- Arguments:
- path: The directory where the manifest file(s) will be written.
- manifest_scope: Controls manifest file generation:
-
- - "all": Create a manifest in each directory level
- - "root": Create a single manifest at the root path only
- - "suppress": Do not create any manifest files
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- The path to the root manifest file if created, or None if suppressed.
-
- Raises:
- ValueError: If the container has not been synced from Synapse.
- ValueError: If manifest_scope is not one of 'all', 'root', 'suppress'.
-
- Example: Generate manifest after sync
- Generate a manifest file after syncing from Synapse:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- project = Project(id="syn123").sync_from_synapse(
- path="/path/to/download"
- )
- manifest_path = project.generate_manifest(
- path="/path/to/download",
- manifest_scope="root"
- )
- print(f"Manifest created at: {manifest_path}")
- """
- return None
-
- @classmethod
- def from_manifest(
- cls,
- manifest_path: str,
- parent_id: str,
- dry_run: bool = False,
- merge_existing_annotations: bool = True,
- associate_activity_to_new_version: bool = False,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> List:
- """Upload files to Synapse from a manifest TSV file.
-
- This method reads a manifest TSV file and uploads all files defined in it
- to Synapse. The manifest file must contain at minimum the 'path' and 'parent'
- columns.
-
- Arguments:
- manifest_path: Path to the manifest TSV file.
- parent_id: The Synapse ID of the parent container (Project or Folder)
- where files will be uploaded if not specified in the manifest.
- dry_run: If True, validate the manifest but do not upload.
- merge_existing_annotations: If True, merge annotations with existing
- annotations on the file. If False, replace existing annotations.
- associate_activity_to_new_version: If True, copy the activity
- (provenance) from the previous version to the new version.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- List of File objects that were uploaded.
-
- Example: Upload files from a manifest
- Upload files from a manifest TSV file:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- files = Project.from_manifest(
- manifest_path="/path/to/manifest.tsv",
- parent_id="syn123"
- )
- for file in files:
- print(f"Uploaded: {file.name} ({file.id})")
- """
- return []
-
- @staticmethod
- def validate_manifest(
- manifest_path: str,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> Tuple[bool, List[str]]:
- """Validate a manifest TSV file without uploading.
-
- This method validates a manifest file to ensure it is properly formatted
- and all paths exist.
-
- Arguments:
- manifest_path: Path to the manifest TSV file.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- Tuple of (is_valid, list_of_error_messages). If the manifest is valid,
- is_valid will be True and the list will be empty.
-
- Example: Validate a manifest file
- Validate a manifest file before uploading:
-
- from synapseclient.models import Project
-
- is_valid, errors = Project.validate_manifest(
- manifest_path="/path/to/manifest.tsv"
- )
- if is_valid:
- print("Manifest is valid")
- else:
- for error in errors:
- print(f"Error: {error}")
- """
- return (True, [])
-
- def get_manifest_data(
- self,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> Tuple[List[str], List[Dict[str, str]]]:
- """Get manifest data for all files in this container.
-
- This method extracts metadata from all files that have been synced
- to this container. The data can be used to generate a manifest file
- or for other purposes.
-
- Arguments:
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- Tuple of (keys, data) where keys is a list of column headers
- and data is a list of dictionaries, one per file, containing
- the file metadata.
-
- Raises:
- ValueError: If the container has not been synced from Synapse.
-
- Example: Get manifest data
- Get manifest data for all files in a project:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- project = Project(id="syn123").sync_from_synapse(
- path="/path/to/download"
- )
- keys, data = project.get_manifest_data()
- for row in data:
- print(f"File: {row['name']} at {row['path']}")
- """
- return ([], [])
-
- @staticmethod
- def generate_download_list_manifest(
- download_path: str,
- csv_separator: str = ",",
- include_header: bool = True,
- timeout: int = 120,
- *,
- synapse_client: Optional[Synapse] = None,
- ) -> str:
- """Generate a manifest file from the current user's download list.
-
- This method creates a CSV manifest containing metadata about all files in
- the user's download list. The manifest is generated server-side by Synapse
- and then downloaded to the specified path.
-
- This is interoperable with the Synapse download list feature and provides
- a way to export the download list as a manifest file that can be used for
- bulk operations.
-
- Arguments:
- download_path: The local directory path where the manifest will be saved.
- csv_separator: The delimiter character for the CSV file.
- Defaults to "," for comma-separated values. Use "\t" for tab-separated.
- include_header: Whether to include column headers in the first row.
- Defaults to True.
- timeout: The number of seconds to wait for the job to complete.
- Defaults to 120 seconds.
- synapse_client: If not passed in and caching was not disabled by
- `Synapse.allow_client_caching(False)` this will use the last created
- instance from the Synapse class constructor.
-
- Returns:
- The full path to the downloaded manifest file.
-
- Example: Generate manifest from download list
- Generate a manifest from your Synapse download list:
-
- from synapseclient.models import Project
-
- import synapseclient
- synapseclient.login()
-
- # Generate manifest from download list
- manifest_path = Project.generate_download_list_manifest(
- download_path="/path/to/download"
- )
- print(f"Manifest downloaded to: {manifest_path}")
- """
- return ""
diff --git a/tests/unit/synapseclient/models/unit_test_manifest.py b/tests/unit/synapseclient/models/unit_test_manifest.py
deleted file mode 100644
index 4c65ac7c3..000000000
--- a/tests/unit/synapseclient/models/unit_test_manifest.py
+++ /dev/null
@@ -1,499 +0,0 @@
-"""Unit tests for the synapseclient.models.mixins.manifest module."""
-
-import datetime
-import os
-import tempfile
-
-import pytest
-
-from synapseclient.models.mixins.manifest import (
- DEFAULT_GENERATED_MANIFEST_KEYS,
- MANIFEST_FILENAME,
- _convert_manifest_data_items_to_string_list,
- _convert_manifest_data_row_to_dict,
- _extract_entity_metadata_for_file,
- _get_entity_provenance_dict_for_file,
- _manifest_filename,
- _parse_manifest_value,
- _validate_manifest_required_fields,
- _write_manifest_data,
-)
-
-
-class TestManifestConstants:
- """Tests for manifest constants."""
-
- def test_manifest_filename_constant(self):
- """Test the MANIFEST_FILENAME constant."""
- assert MANIFEST_FILENAME == "SYNAPSE_METADATA_MANIFEST.tsv"
-
- def test_default_manifest_keys(self):
- """Test the DEFAULT_GENERATED_MANIFEST_KEYS constant."""
- expected_keys = [
- "path",
- "parent",
- "name",
- "id",
- "synapseStore",
- "contentType",
- "used",
- "executed",
- "activityName",
- "activityDescription",
- ]
- assert DEFAULT_GENERATED_MANIFEST_KEYS == expected_keys
-
-
-class TestManifestFilename:
- """Tests for _manifest_filename function."""
-
- def test_manifest_filename(self):
- """Test generating manifest filename."""
- # GIVEN a path
- path = "/path/to/directory"
-
- # WHEN we generate the manifest filename
- result = _manifest_filename(path)
-
- # THEN it should be the path joined with MANIFEST_FILENAME
- assert result == os.path.join(path, MANIFEST_FILENAME)
-
-
-class TestConvertManifestDataItemsToStringList:
- """Tests for _convert_manifest_data_items_to_string_list function."""
-
- def test_single_string(self):
- """Test converting a single string."""
- # GIVEN a list with a single string
- items = ["hello"]
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN it should return the string directly
- assert result == "hello"
-
- def test_multiple_strings(self):
- """Test converting multiple strings."""
- # GIVEN a list with multiple strings
- items = ["a", "b", "c"]
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN it should return a bracketed list
- assert result == "[a,b,c]"
-
- def test_string_with_comma(self):
- """Test converting a string with comma."""
- # GIVEN a single item with comma (no quotes needed for single item)
- items = ["hello,world"]
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN it should return the string directly
- assert result == "hello,world"
-
- def test_multiple_strings_with_comma(self):
- """Test converting multiple strings where one has a comma."""
- # GIVEN multiple strings where one contains commas
- items = ["string,with,commas", "string without commas"]
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN the comma-containing string should be quoted
- assert result == '["string,with,commas",string without commas]'
-
- def test_datetime(self):
- """Test converting a datetime."""
- # GIVEN a datetime value
- dt = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list([dt])
-
- # THEN it should return ISO format
- assert result == "2020-01-01T00:00:00Z"
-
- def test_multiple_datetimes(self):
- """Test converting multiple datetimes."""
- # GIVEN multiple datetime values
- dt1 = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
- dt2 = datetime.datetime(2021, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list([dt1, dt2])
-
- # THEN it should return a bracketed list of ISO dates
- assert result == "[2020-01-01T00:00:00Z,2021-01-01T00:00:00Z]"
-
- def test_boolean_true(self):
- """Test converting True."""
- # GIVEN a True value
- items = [True]
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN it should return "True"
- assert result == "True"
-
- def test_boolean_false(self):
- """Test converting False."""
- # GIVEN a False value
- items = [False]
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN it should return "False"
- assert result == "False"
-
- def test_integer(self):
- """Test converting an integer."""
- # GIVEN an integer value
- items = [1]
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN it should return the string representation
- assert result == "1"
-
- def test_float(self):
- """Test converting a float."""
- # GIVEN a float value
- items = [1.5]
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN it should return the string representation
- assert result == "1.5"
-
- def test_empty_list(self):
- """Test converting an empty list."""
- # GIVEN an empty list
- items = []
-
- # WHEN we convert to string
- result = _convert_manifest_data_items_to_string_list(items)
-
- # THEN it should return an empty string
- assert result == ""
-
-
-class TestConvertManifestDataRowToDict:
- """Tests for _convert_manifest_data_row_to_dict function."""
-
- def test_simple_row(self):
- """Test converting a simple row."""
- # GIVEN a row with simple values
- row = {"path": "/path/to/file", "name": "file.txt"}
- keys = ["path", "name"]
-
- # WHEN we convert it
- result = _convert_manifest_data_row_to_dict(row, keys)
-
- # THEN it should return the same values
- assert result == {"path": "/path/to/file", "name": "file.txt"}
-
- def test_row_with_list(self):
- """Test converting a row with a list value."""
- # GIVEN a row with a list value
- row = {"annotations": ["a", "b", "c"]}
- keys = ["annotations"]
-
- # WHEN we convert it
- result = _convert_manifest_data_row_to_dict(row, keys)
-
- # THEN the list should be converted to a string
- assert result == {"annotations": "[a,b,c]"}
-
- def test_missing_key(self):
- """Test converting a row with a missing key."""
- # GIVEN a row missing a key
- row = {"path": "/path/to/file"}
- keys = ["path", "name"]
-
- # WHEN we convert it
- result = _convert_manifest_data_row_to_dict(row, keys)
-
- # THEN the missing key should be empty string
- assert result == {"path": "/path/to/file", "name": ""}
-
-
-class TestParseManifestValue:
- """Tests for _parse_manifest_value function."""
-
- def test_simple_string(self):
- """Test parsing a simple string."""
- assert _parse_manifest_value("hello") == "hello"
-
- def test_list_syntax(self):
- """Test parsing list syntax."""
- assert _parse_manifest_value("[a,b,c]") == ["a", "b", "c"]
-
- def test_list_with_quoted_string(self):
- """Test parsing list with quoted string containing comma."""
- result = _parse_manifest_value('["hello,world",other]')
- assert result == ["hello,world", "other"]
-
- def test_boolean_true(self):
- """Test parsing 'true' string."""
- assert _parse_manifest_value("true") is True
- assert _parse_manifest_value("True") is True
- assert _parse_manifest_value("TRUE") is True
-
- def test_boolean_false(self):
- """Test parsing 'false' string."""
- assert _parse_manifest_value("false") is False
- assert _parse_manifest_value("False") is False
- assert _parse_manifest_value("FALSE") is False
-
- def test_integer(self):
- """Test parsing an integer string."""
- assert _parse_manifest_value("123") == 123
-
- def test_float(self):
- """Test parsing a float string."""
- assert _parse_manifest_value("1.5") == 1.5
-
- def test_non_numeric_string(self):
- """Test that non-numeric strings stay as strings."""
- assert _parse_manifest_value("hello123") == "hello123"
-
-
-class TestWriteManifestData:
- """Tests for _write_manifest_data function."""
-
- def test_write_simple_manifest(self):
- """Test writing a simple manifest file."""
- # GIVEN simple data
- keys = ["path", "name", "id"]
- data = [
- {"path": "/path/to/file1.txt", "name": "file1.txt", "id": "syn123"},
- {"path": "/path/to/file2.txt", "name": "file2.txt", "id": "syn456"},
- ]
-
- # WHEN we write it to a temp file
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
- filename = f.name
-
- try:
- _write_manifest_data(filename, keys, data)
-
- # THEN the file should contain the expected content
- with open(filename, "r") as f:
- content = f.read()
-
- lines = content.strip().split("\n")
- assert len(lines) == 3 # header + 2 data rows
- assert lines[0] == "path\tname\tid"
- assert lines[1] == "/path/to/file1.txt\tfile1.txt\tsyn123"
- assert lines[2] == "/path/to/file2.txt\tfile2.txt\tsyn456"
- finally:
- os.unlink(filename)
-
-
-class TestValidateManifestRequiredFields:
- """Tests for _validate_manifest_required_fields function."""
-
- def test_valid_manifest(self):
- """Test validating a valid manifest file."""
- # GIVEN a valid manifest file
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
- f.write("path\tparent\n")
- f.write(f"{f.name}\tsyn123\n")
- filename = f.name
-
- try:
-            # The 'path' column references the manifest file itself,
-            # which already exists, so there is nothing extra to create
-
- # WHEN we validate it
- is_valid, errors = _validate_manifest_required_fields(filename)
-
- # THEN it should be valid
- assert is_valid is True
- assert errors == []
- finally:
- os.unlink(filename)
-
- def test_missing_file(self):
- """Test validating a non-existent manifest file."""
- # WHEN we validate a non-existent file
- is_valid, errors = _validate_manifest_required_fields("/nonexistent/file.tsv")
-
- # THEN it should be invalid
- assert is_valid is False
- assert len(errors) == 1
- assert "not found" in errors[0]
-
- def test_missing_required_field(self):
- """Test validating a manifest missing a required field."""
- # GIVEN a manifest missing the 'parent' field
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
- f.write("path\tname\n")
- f.write("/path/to/file.txt\tfile.txt\n")
- filename = f.name
-
- try:
- # WHEN we validate it
- is_valid, errors = _validate_manifest_required_fields(filename)
-
- # THEN it should be invalid
- assert is_valid is False
- assert any("parent" in e for e in errors)
- finally:
- os.unlink(filename)
-
- def test_empty_path(self):
- """Test validating a manifest with empty path."""
- # GIVEN a manifest with empty path
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
- f.write("path\tparent\n")
- f.write("\tsyn123\n")
- filename = f.name
-
- try:
- # WHEN we validate it
- is_valid, errors = _validate_manifest_required_fields(filename)
-
- # THEN it should be invalid
- assert is_valid is False
- assert any("'path' is empty" in e for e in errors)
- finally:
- os.unlink(filename)
-
- def test_invalid_parent_id(self):
- """Test validating a manifest with invalid parent ID."""
- # GIVEN a manifest with invalid parent ID
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".tsv") as f:
- f.write("path\tparent\n")
- f.write(f"{f.name}\tinvalid_parent\n")
- filename = f.name
-
- try:
- # WHEN we validate it
- is_valid, errors = _validate_manifest_required_fields(filename)
-
- # THEN it should be invalid
- assert is_valid is False
- assert any("not a valid Synapse ID" in e for e in errors)
- finally:
- os.unlink(filename)
-
-
-class TestExtractEntityMetadataForFile:
- """Tests for _extract_entity_metadata_for_file function."""
-
- def test_extract_basic_metadata(self):
- """Test extracting basic file metadata."""
-
- # GIVEN a mock File object
- class MockFile:
- def __init__(self):
- self.parent_id = "syn123"
- self.path = "/path/to/file.txt"
- self.name = "file.txt"
- self.id = "syn456"
- self.synapse_store = True
- self.content_type = "text/plain"
- self.annotations = None
- self.activity = None
-
- file = MockFile()
-
- # WHEN we extract metadata
- keys, data = _extract_entity_metadata_for_file([file])
-
- # THEN we should get the expected data
- assert "path" in keys
- assert "parent" in keys
- assert "name" in keys
- assert "id" in keys
- assert len(data) == 1
- assert data[0]["path"] == "/path/to/file.txt"
- assert data[0]["parent"] == "syn123"
- assert data[0]["name"] == "file.txt"
- assert data[0]["id"] == "syn456"
-
- def test_extract_with_annotations(self):
- """Test extracting metadata with annotations."""
-
- # GIVEN a mock File object with annotations
- class MockFile:
- def __init__(self):
- self.parent_id = "syn123"
- self.path = "/path/to/file.txt"
- self.name = "file.txt"
- self.id = "syn456"
- self.synapse_store = True
- self.content_type = "text/plain"
- self.annotations = {"study": ["Study1"], "dataType": ["RNA-seq"]}
- self.activity = None
-
- file = MockFile()
-
- # WHEN we extract metadata
- keys, data = _extract_entity_metadata_for_file([file])
-
- # THEN annotation keys should be included
- assert "study" in keys
- assert "dataType" in keys
- assert data[0]["study"] == ["Study1"]
- assert data[0]["dataType"] == ["RNA-seq"]
-
-
-class TestGetEntityProvenanceDictForFile:
- """Tests for _get_entity_provenance_dict_for_file function."""
-
- def test_no_activity(self):
- """Test extracting provenance when there is no activity."""
-
- # GIVEN a mock File object with no activity
- class MockFile:
- def __init__(self):
- self.activity = None
-
- file = MockFile()
-
- # WHEN we extract provenance
- result = _get_entity_provenance_dict_for_file(file)
-
- # THEN we should get an empty dict
- assert result == {}
-
- def test_with_activity(self):
- """Test extracting provenance when there is an activity."""
-
- # GIVEN mock objects
- class MockUsedEntity:
- def format_for_manifest(self):
- return "syn789"
-
- class MockActivity:
- def __init__(self):
- self.name = "Analysis"
- self.description = "Processing data"
- self.used = [MockUsedEntity()]
- self.executed = []
-
- class MockFile:
- def __init__(self):
- self.activity = MockActivity()
-
- file = MockFile()
-
- # WHEN we extract provenance
- result = _get_entity_provenance_dict_for_file(file)
-
- # THEN we should get the expected dict
- assert result["activityName"] == "Analysis"
- assert result["activityDescription"] == "Processing data"
- assert result["used"] == "syn789"
- assert result["executed"] == ""
From 5e1159d3c82c7e1bcdd833fe9b0532c104fe132f Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 10:52:29 -0700
Subject: [PATCH 17/31] remove manifest related changes
---
synapseclient/models/__init__.py | 10 ----------
1 file changed, 10 deletions(-)
diff --git a/synapseclient/models/__init__.py b/synapseclient/models/__init__.py
index 9d5bc90b0..8aecd78cb 100644
--- a/synapseclient/models/__init__.py
+++ b/synapseclient/models/__init__.py
@@ -14,7 +14,6 @@
RecordBasedMetadataTaskProperties,
)
from synapseclient.models.dataset import Dataset, DatasetCollection, EntityRef
-from synapseclient.models.download_list import DownloadListManifestRequest
from synapseclient.models.entityview import EntityView, ViewTypeMask
from synapseclient.models.evaluation import Evaluation
from synapseclient.models.file import File, FileHandle
@@ -22,10 +21,6 @@
from synapseclient.models.form import FormData, FormGroup
from synapseclient.models.link import Link
from synapseclient.models.materializedview import MaterializedView
-from synapseclient.models.mixins.manifest import (
- DEFAULT_GENERATED_MANIFEST_KEYS,
- MANIFEST_FILENAME,
-)
from synapseclient.models.mixins.table_components import QueryMixin
from synapseclient.models.project import Project
from synapseclient.models.recordset import RecordSet
@@ -167,11 +162,6 @@
"StorageLocation",
"StorageLocationType",
"UploadType",
- # Manifest constants
- "MANIFEST_FILENAME",
- "DEFAULT_GENERATED_MANIFEST_KEYS",
- # Download List models
- "DownloadListManifestRequest",
]
# Static methods to expose as functions
From 39f57ca3408aca5c6a93ad31ae0a0ab22dddc199 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 10:52:57 -0700
Subject: [PATCH 18/31] remove manifest related changes
---
synapseclient/models/folder.py | 2 --
synapseclient/models/project.py | 2 --
2 files changed, 4 deletions(-)
diff --git a/synapseclient/models/folder.py b/synapseclient/models/folder.py
index c4d4e0718..9a6dff47e 100644
--- a/synapseclient/models/folder.py
+++ b/synapseclient/models/folder.py
@@ -18,7 +18,6 @@
ContainerEntityJSONSchema,
StorableContainer,
)
-from synapseclient.models.mixins.manifest import ManifestGeneratable
from synapseclient.models.mixins.storage_location_mixin import (
StorageLocationConfigurable,
)
@@ -52,7 +51,6 @@ class Folder(
StorableContainer,
ContainerEntityJSONSchema,
StorageLocationConfigurable,
- ManifestGeneratable,
):
"""Folder is a hierarchical container for organizing data in Synapse.
diff --git a/synapseclient/models/project.py b/synapseclient/models/project.py
index 6686c8ac5..d5a4479c2 100644
--- a/synapseclient/models/project.py
+++ b/synapseclient/models/project.py
@@ -18,7 +18,6 @@
ContainerEntityJSONSchema,
StorableContainer,
)
-from synapseclient.models.mixins.manifest import ManifestGeneratable
from synapseclient.models.mixins.storage_location_mixin import (
StorageLocationConfigurable,
)
@@ -51,7 +50,6 @@ class Project(
StorableContainer,
ContainerEntityJSONSchema,
StorageLocationConfigurable,
- ManifestGeneratable,
):
"""A Project is a top-level container for organizing data in Synapse.
From e04ab86d8fae8bce848d24e3de6873b307827aa6 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 11:02:28 -0700
Subject: [PATCH 19/31] add EnumCoercionMixin
---
synapseclient/models/mixins/__init__.py | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/synapseclient/models/mixins/__init__.py b/synapseclient/models/mixins/__init__.py
index 491ea9616..443c34810 100644
--- a/synapseclient/models/mixins/__init__.py
+++ b/synapseclient/models/mixins/__init__.py
@@ -2,6 +2,7 @@
from synapseclient.models.mixins.access_control import AccessControllable
from synapseclient.models.mixins.asynchronous_job import AsynchronousCommunicator
+from synapseclient.models.mixins.enum_coercion import EnumCoercionMixin
from synapseclient.models.mixins.form import (
FormChangeRequest,
FormData,
@@ -20,11 +21,6 @@
JSONSchemaValidationStatistics,
ValidationException,
)
-from synapseclient.models.mixins.manifest import (
- DEFAULT_GENERATED_MANIFEST_KEYS,
- MANIFEST_FILENAME,
- ManifestGeneratable,
-)
from synapseclient.models.mixins.storable_container import StorableContainer
from synapseclient.models.mixins.storage_location_mixin import (
StorageLocationConfigurable,
@@ -32,6 +28,7 @@
__all__ = [
"AccessControllable",
+ "EnumCoercionMixin",
"StorableContainer",
"StorageLocationConfigurable",
"AsynchronousCommunicator",
@@ -49,7 +46,4 @@
"FormChangeRequest",
"FormSubmissionStatus",
"StateEnum",
- "ManifestGeneratable",
- "MANIFEST_FILENAME",
- "DEFAULT_GENERATED_MANIFEST_KEYS",
]
From 1bc03c5381ff74f13a362b760d74099278f91462 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 17:33:49 -0700
Subject: [PATCH 20/31] update parameter name
---
synapseclient/models/mixins/storage_location_mixin.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/synapseclient/models/mixins/storage_location_mixin.py b/synapseclient/models/mixins/storage_location_mixin.py
index db3c509a8..a10308ca9 100644
--- a/synapseclient/models/mixins/storage_location_mixin.py
+++ b/synapseclient/models/mixins/storage_location_mixin.py
@@ -108,7 +108,7 @@ async def main():
if existing_setting is not None:
existing_setting["locations"] = locations
await update_project_setting(
- body=existing_setting,
+ request=existing_setting,
synapse_client=synapse_client,
)
return await get_project_setting(
@@ -124,7 +124,7 @@ async def main():
"projectId": self.id,
}
return await create_project_setting(
- body=project_destination,
+ request=project_destination,
synapse_client=synapse_client,
)
@@ -363,7 +363,7 @@ async def main():
raise ValueError("The entity must have an id set.")
return await _index_files_for_migration_async(
- entity_id=self.id,
+ self,
dest_storage_location_id=str(dest_storage_location_id),
db_path=db_path,
source_storage_location_ids=(
From 5b045dd2f7588548c6afa470d774cf88291414d3 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 17:41:04 -0700
Subject: [PATCH 21/31] remove unused imports
---
synapseclient/models/services/migration.py | 1823 +++++++++-----------
1 file changed, 846 insertions(+), 977 deletions(-)
diff --git a/synapseclient/models/services/migration.py b/synapseclient/models/services/migration.py
index 0186e8b77..7cb495b8e 100644
--- a/synapseclient/models/services/migration.py
+++ b/synapseclient/models/services/migration.py
@@ -1,8 +1,7 @@
"""
-Async migration service for migrating files between storage locations.
+Asynchronous service for indexing and migrating entities between storage locations.
-This module provides native async implementations of the migration functionality,
-replacing the threading-based approach in synapseutils.migrate_functions.
+This module provides native async implementations of the indexing and migration functionality.
"""
import asyncio
@@ -27,13 +26,12 @@
from synapseclient.api.entity_services import get_children
from synapseclient.api.file_services import get_file_handle_for_download_async
-from synapseclient.api.table_services import create_table_snapshot, get_columns
+from synapseclient.api.table_services import get_columns
from synapseclient.core import utils
from synapseclient.core.constants import concrete_types
-from synapseclient.core.upload.multipart_upload import (
- MAX_NUMBER_OF_PARTS,
- multipart_copy,
-)
+from synapseclient.core.exceptions import SynapseError
+from synapseclient.core.upload.multipart_upload import MAX_NUMBER_OF_PARTS
+from synapseclient.core.upload.multipart_upload_async import multipart_copy_async
from synapseclient.models.table_components import (
AppendableRowSetRequest,
PartialRow,
@@ -52,20 +50,58 @@
)
if TYPE_CHECKING:
- from synapseclient import Synapse
+ from synapseclient.models import Table
+ from synapseclient.models.services import query_async
+
+import sqlite3
+
+from synapseclient import Synapse
+from synapseclient.api import get_entity_type, rest_get_paginated_async
+from synapseclient.entity import Entity
+from synapseclient.operations import FileOptions, get_async
# Default part size for multipart copy (100 MB)
+# We use a much larger default part size for part copies than we would for part uploads.
+# With part copies the data transfer stays within AWS, so we don't need to concern
+# ourselves with upload failures of the actual bytes.
+# This value aligns with what some AWS client libraries use, e.g.
+# https://github.com/aws/aws-sdk-java/blob/57ed2e4bd57e08f316bf5c6c71f6fd82a27fa240/aws-java-sdk-s3/src/main/java/com/amazonaws/services/s3/transfer/TransferManagerConfiguration.java#L46
DEFAULT_PART_SIZE = 100 * utils.MB
-# Batch size for database operations
+# Batch size used to chunk database operations.
BATCH_SIZE = 500
+# Maximum number of concurrent file copies.
+MAX_CONCURRENT_FILE_COPIES = max(int(Synapse().max_threads / 2), 1)
+
logger = logging.getLogger(__name__)
# =============================================================================
-# Temp Directory Helpers
+# Indexing Helper Functions
# =============================================================================
+async def _verify_storage_location_ownership_async(
+ storage_location_id: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+) -> None:
+ """Verify the user owns the destination storage location.
+    Only the creator of the storage location can retrieve it by its id.
+
+ Arguments:
+ storage_location_id: The storage location ID to verify.
+        synapse_client: If not passed in and caching was not disabled by
+            `Synapse.allow_client_caching(False)` this will use the last created
+            instance from the Synapse class constructor.
+
+ Raises:
+ ValueError: If the user does not own the storage location.
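+
+    Example: Verifying ownership before a migration
+        A minimal sketch, assuming a hypothetical storage location id and an
+        authenticated `syn` client:
+
+            await _verify_storage_location_ownership_async(
+                storage_location_id="12345",
+                synapse_client=syn,
+            )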
+ """
+    try:
+        syn = Synapse.get_client(synapse_client=synapse_client)
+        await syn.rest_get_async(f"/storageLocation/{storage_location_id}")
+ except SynapseError:
+ raise ValueError(
+ f"Unable to verify ownership of storage location {storage_location_id}. "
+ f"You must be the creator of the destination storage location."
+ )
def _get_default_db_path(entity_id: str) -> str:
@@ -81,17 +117,29 @@ def _get_default_db_path(entity_id: str) -> str:
return os.path.join(temp_dir, f"migration_{entity_id}.db")
-# =============================================================================
-# Column Name Helpers (replaces legacy synapseclient.table functions)
-# =============================================================================
+async def _get_version_numbers_async(
+ entity_id: str,
+ synapse_client: "Synapse",
+) -> AsyncGenerator[int, None]:
+ """Get all version numbers for an entity.
+
+ Arguments:
+ entity_id: The entity ID.
+        synapse_client: If not passed in and caching was not disabled by
+            `Synapse.allow_client_caching(False)` this will use the last created
+            instance from the Synapse class constructor.
+
+ Yields:
+ Version numbers.
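+
+    Example: Iterating over an entity's versions
+        A minimal sketch, assuming a hypothetical entity id and an authenticated
+        `syn` client:
+
+            async for version in _get_version_numbers_async("syn123", syn):
+                print(version)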
+ """
+ async for version_info in rest_get_paginated_async(
+ f"/entity/{entity_id}/version", synapse_client=synapse_client
+ ):
+ yield version_info["versionNumber"]
def _escape_column_name(column: Union[str, collections.abc.Mapping]) -> str:
"""Escape a column name for use in a Synapse table query statement.
-
Arguments:
column: A string column name or a dictionary with a 'name' key.
-
Returns:
Escaped column name wrapped in double quotes.
"""
@@ -104,29 +152,56 @@ def _escape_column_name(column: Union[str, collections.abc.Mapping]) -> str:
def _join_column_names(columns: List[Any]) -> str:
"""Join column names into a comma-delimited list for table queries.
-
Arguments:
columns: A list of column names or column objects with 'name' keys.
-
Returns:
Comma-separated string of escaped column names.
"""
return ",".join(_escape_column_name(c) for c in columns)
+def _check_indexed(cursor: sqlite3.Cursor, entity_id: str) -> bool:
+ """Check if an entity has already been indexed.
+ If so, it can skip reindexing it.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ entity_id: The entity ID to check.
+
+ Returns:
+ True if the entity is already indexed.
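+
+    Example: Skipping an already indexed entity
+        A minimal sketch against a cursor for an open migration database:
+
+            if not _check_indexed(cursor, "syn123"):
+                ...  # index the entity here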
+ """
+ indexed_row = cursor.execute(
+ "select 1 from migrations where id = ?", (entity_id,)
+ ).fetchone()
+
+ if indexed_row:
+ logger.debug("%s already indexed, skipping", entity_id)
+ return True
+
+ logger.debug("%s not yet indexed, indexing now", entity_id)
+ return False
+
+
# =============================================================================
-# Database Helper Functions (Synchronous - wrapped with asyncio.to_thread)
+# Database Helper Functions
# =============================================================================
+def _ensure_schema(cursor: sqlite3.Cursor) -> None:
+ """Ensure the SQLite database has the required schema.
-
-def _ensure_schema(cursor) -> None:
- """Ensure the SQLite database has the required schema."""
- # Settings table - stores JSON configuration
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ """
+ # migration_settings table
+ # A table to store parameters used to create the index.
cursor.execute(
"CREATE TABLE IF NOT EXISTS migration_settings (settings TEXT NOT NULL)"
)
- # Main migrations table
+ # Migrations table
+    # The representation of migratable file handles is flat, covering both file entities
+    # and table attached files, so not all columns apply to both. The row_id and col_id
+    # columns are only used by table attached files.
cursor.execute(
"""
CREATE TABLE IF NOT EXISTS migrations (
@@ -147,15 +222,19 @@ def _ensure_schema(cursor) -> None:
"""
)
- # Indexes for common queries
+ # Index the status column for faster status-based lookups
cursor.execute("CREATE INDEX IF NOT EXISTS ix_status ON migrations(status)")
+ # Index the from_file_handle_id and to_file_handle_id columns for faster file handle-based lookups
+ # This is used to see if there is already a migrated copy of a file handle before doing a copy
cursor.execute(
"CREATE INDEX IF NOT EXISTS ix_file_handle_ids "
"ON migrations(from_file_handle_id, to_file_handle_id)"
)
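+
+    # Illustrative (hypothetical) rows for the flat migrations table:
+    #   a file entity version:      ('syn123', FILE, version=3, parent='syn100', ...)
+    #   a table attached file cell: ('syn456', TABLE_ATTACHED_FILE, row_id=7, col_id=9, ...)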
-def _initialize_database(
+def _prepare_migration_db(
+ conn: sqlite3.Connection,
+ cursor: sqlite3.Cursor,
db_path: str,
root_id: str,
dest_storage_location_id: str,
@@ -163,9 +242,12 @@ def _initialize_database(
file_version_strategy: str,
include_table_files: bool,
) -> None:
- """Initialize the migration database with schema and settings.
+ """Prepare the migration database by checking the migration settings for the given parameters.
+    This is a guardrail: it binds a given SQLite index to the specific entity and migration options it was created with, enabling safe resumption and preventing mismatched reuse.
Arguments:
+ conn: The connection to the SQLite database.
+ cursor: The cursor to the SQLite database.
db_path: Path to the SQLite database file.
root_id: The root entity ID being migrated.
dest_storage_location_id: Destination storage location ID.
@@ -173,307 +255,177 @@ def _initialize_database(
file_version_strategy: Strategy for handling file versions.
include_table_files: Whether to include table-attached files.
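+
+    Example (illustrative sketch; the IDs are hypothetical):
+
+            _prepare_migration_db(conn, cursor, "index.db", "syn123", "999", [], "new", False)
+            # A second call with identical settings resumes against the existing index;
+            # calling with root_id="syn999" instead raises a ValueError.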
"""
- import sqlite3
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- _ensure_schema(cursor)
-
- # Check if settings already exist
- existing = cursor.execute("SELECT settings FROM migration_settings").fetchone()
+ current_settings = MigrationSettings(
+ root_id=root_id,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ )
+ existing_settings = _retrieve_index_settings(cursor)
- settings = MigrationSettings(
- root_id=root_id,
- dest_storage_location_id=dest_storage_location_id,
- source_storage_location_ids=source_storage_location_ids,
- file_version_strategy=file_version_strategy,
- include_table_files=include_table_files,
+ if existing_settings:
+ current_settings.verify_migration_settings(existing_settings, db_path)
+ else:
+ cursor.execute(
+ "INSERT INTO migration_settings (settings) VALUES (?)",
+ (json.dumps(current_settings.to_dict()),),
)
- if existing:
- # Verify settings match
- existing_settings = json.loads(existing[0])
- if existing_settings.get("root_id") != root_id:
- raise ValueError(
- f"Root entity ID mismatch: database has {existing_settings.get('root_id')}, "
- f"but {root_id} was provided"
- )
- if (
- existing_settings.get("dest_storage_location_id")
- != dest_storage_location_id
- ):
- raise ValueError(
- f"Destination storage location mismatch: database has "
- f"{existing_settings.get('dest_storage_location_id')}, "
- f"but {dest_storage_location_id} was provided"
- )
- else:
- # Insert new settings
- settings_json = json.dumps(
- {
- "root_id": settings.root_id,
- "dest_storage_location_id": settings.dest_storage_location_id,
- "source_storage_location_ids": settings.source_storage_location_ids,
- "file_version_strategy": settings.file_version_strategy,
- "include_table_files": settings.include_table_files,
- }
- )
- cursor.execute(
- "INSERT INTO migration_settings (settings) VALUES (?)",
- (settings_json,),
- )
-
- conn.commit()
-
-
-def _retrieve_index_settings(db_path: str) -> Optional[Dict[str, Any]]:
- """Retrieve index settings from the database.
-
- Arguments:
- db_path: Path to the SQLite database file.
-
- Returns:
- Dictionary of settings or None if not found.
- """
- import sqlite3
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- _ensure_schema(cursor)
-
- row = cursor.execute("SELECT settings FROM migration_settings").fetchone()
- if row:
- return json.loads(row[0])
- return None
+ conn.commit()
-def _check_indexed(db_path: str, entity_id: str) -> bool:
- """Check if an entity has already been indexed.
+def _retrieve_index_settings(cursor: sqlite3.Cursor) -> Optional[MigrationSettings]:
+ """Retrieve index settings from the database as a MigrationSettings instance.
Arguments:
- db_path: Path to the SQLite database file.
- entity_id: The entity ID to check.
+ cursor: The cursor object from the connection to the SQLite database.
Returns:
- True if the entity is already indexed, False otherwise.
- """
- import sqlite3
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- row = cursor.execute(
- "SELECT 1 FROM migrations WHERE id = ? LIMIT 1",
- (entity_id,),
- ).fetchone()
- return row is not None
-
-
-def _mark_container_indexed(
- db_path: str,
- entity_id: str,
- parent_id: Optional[str],
- migration_type: MigrationType,
-) -> None:
- """Mark a container (Project or Folder) as indexed.
-
- Arguments:
- db_path: Path to the SQLite database file.
- entity_id: The entity ID.
- parent_id: The parent entity ID.
- migration_type: The type of container.
+ MigrationSettings if a row exists, None otherwise.
"""
- import sqlite3
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- cursor.execute(
- """
- INSERT OR IGNORE INTO migrations (id, type, parent_id, status)
- VALUES (?, ?, ?, ?)
- """,
- (
- entity_id,
- migration_type.value,
- parent_id,
- MigrationStatus.INDEXED.value,
- ),
- )
- conn.commit()
+ row = cursor.execute("SELECT settings FROM migration_settings").fetchone()
+ if row:
+ return MigrationSettings.from_dict(json.loads(row[0]))
+ return None
def _insert_file_migration(
- db_path: str,
- entity_id: str,
- version: Optional[int],
- parent_id: Optional[str],
- from_storage_location_id: int,
- from_file_handle_id: str,
- file_size: int,
- status: MigrationStatus,
+ cursor: sqlite3.Cursor,
+ insert_values: List[
+ Tuple[str, str, Optional[int], Optional[str], int, str, int, MigrationStatus]
+ ],
) -> None:
- """Insert a file migration entry.
+ """Insert a file migration entry to the migrations database.
Arguments:
- db_path: Path to the SQLite database file.
- entity_id: The file entity ID.
- version: The file version (None for new version).
- parent_id: The parent entity ID.
- from_storage_location_id: Source storage location ID.
- from_file_handle_id: Source file handle ID.
- file_size: File size in bytes.
- status: Migration status.
+ cursor: The cursor object from the connection to the SQLite database.
+ insert_values: List of tuples containing the file migration data.
"""
- import sqlite3
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- cursor.execute(
- """
- INSERT OR IGNORE INTO migrations (
- id, type, version, parent_id,
- from_storage_location_id, from_file_handle_id,
- file_size, status
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
- """,
- (
- entity_id,
- MigrationType.FILE.value,
+ cursor.executemany(
+ """
+        INSERT INTO migrations (
+ id,
+ type,
version,
parent_id,
from_storage_location_id,
from_file_handle_id,
file_size,
- status.value,
- ),
- )
- conn.commit()
+ status
+        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+ """,
+ insert_values,
+ )
def _insert_table_file_migration(
- db_path: str,
- entity_id: str,
- row_id: int,
- col_id: int,
- row_version: int,
- parent_id: Optional[str],
- from_storage_location_id: int,
- from_file_handle_id: str,
- file_size: int,
- status: MigrationStatus,
+ cursor: sqlite3.Cursor,
+ insert_values: List[
+        Tuple[str, str, int, int, int, Optional[str], int, str, int, MigrationStatus]
+ ],
) -> None:
"""Insert a table-attached file migration entry.
Arguments:
- db_path: Path to the SQLite database file.
- entity_id: The table entity ID.
- row_id: The table row ID.
- col_id: The table column ID.
- row_version: The row version.
- parent_id: The parent entity ID.
- from_storage_location_id: Source storage location ID.
- from_file_handle_id: Source file handle ID.
- file_size: File size in bytes.
- status: Migration status.
+ cursor: The cursor object from the connection to the SQLite database.
+ insert_values: List of tuples containing the table-attached file migration data.
"""
- import sqlite3
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- cursor.execute(
- """
+ cursor.executemany(
+ """
INSERT OR IGNORE INTO migrations (
id, type, row_id, col_id, version, parent_id,
from_storage_location_id, from_file_handle_id,
file_size, status
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""",
- (
- entity_id,
- MigrationType.TABLE_ATTACHED_FILE.value,
- row_id,
- col_id,
- row_version,
- parent_id,
- from_storage_location_id,
- from_file_handle_id,
- file_size,
- status.value,
- ),
- )
- conn.commit()
+ insert_values,
+ )
-def _record_indexing_error(
- db_path: str,
+def _mark_container_indexed(
+ cursor: sqlite3.Cursor,
entity_id: str,
+ migration_type: MigrationType,
parent_id: Optional[str],
- exception: Exception,
) -> None:
- """Record an indexing error in the database.
+ """Mark a container (Project or Folder) as indexed.
Arguments:
- db_path: Path to the SQLite database file.
- entity_id: The entity ID that failed.
- parent_id: The parent entity ID.
- exception: The exception that occurred.
+ cursor: The cursor object from the connection to the SQLite database.
+ entity_id: The Synapse ID of the container entity.
+        migration_type: The MigrationType value of the container.
+ parent_id: The Synapse ID of the parent entity.
"""
- import sqlite3
-
- tb_str = "".join(
- traceback.format_exception(type(exception), exception, exception.__traceback__)
+ cursor.execute(
+ "INSERT OR IGNORE INTO migrations (id, type, parent_id, status) VALUES (?, ?, ?, ?)",
+ [entity_id, migration_type, parent_id, MigrationStatus.INDEXED.value],
)
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- cursor.execute(
- """
- INSERT OR IGNORE INTO migrations (
- id, type, parent_id, status, exception
- ) VALUES (?, ?, ?, ?, ?)
- """,
- (
- entity_id,
- MigrationType.FILE.value, # Default type for errors
+
+def _record_indexing_error(
+ cursor: sqlite3.Cursor,
+ entity_id: str,
+ migration_type: MigrationType,
+ parent_id: Optional[str],
+ tb_str: str,
+) -> None:
+ """Record an indexing error in the database.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ entity_id: The Synapse ID of the entity that failed.
+        migration_type: The MigrationType value of the entity.
+ parent_id: The Synapse ID of the parent entity.
+ tb_str: The traceback string.
+ """
+ cursor.execute(
+ """
+        INSERT INTO migrations (
+ id,
+ type,
parent_id,
- MigrationStatus.ERRORED.value,
- tb_str,
- ),
- )
- conn.commit()
+ status,
+ exception
+        ) VALUES (?, ?, ?, ?, ?)
+ """,
+ (
+ entity_id,
+ migration_type,
+ parent_id,
+ MigrationStatus.ERRORED.value,
+ tb_str,
+ ),
+ )
-def _check_file_handle_exists(db_path: str, from_file_handle_id: str) -> Optional[str]:
+# =============================================================================
+# Migration Helper Functions
+# =============================================================================
+def _check_file_handle_exists(
+ cursor: sqlite3.Cursor, from_file_handle_id: str
+) -> Optional[str]:
"""Check if a file handle has already been copied.
Arguments:
- db_path: Path to the SQLite database file.
+ cursor: The cursor object from the connection to the SQLite database.
from_file_handle_id: The source file handle ID.
Returns:
The destination file handle ID if found, None otherwise.
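+
+    Example (illustrative sketch with hypothetical IDs):
+
+            to_file_handle_id = _check_file_handle_exists(cursor, "111")
+            if to_file_handle_id:
+                # Reuse the copy made for another entity that shares this handle
+                ...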
"""
- import sqlite3
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- row = cursor.execute(
- """
- SELECT to_file_handle_id FROM migrations
- WHERE from_file_handle_id = ? AND to_file_handle_id IS NOT NULL
- """,
- (from_file_handle_id,),
- ).fetchone()
- return row[0] if row else None
+ row = cursor.execute(
+ "SELECT to_file_handle_id FROM migrations WHERE from_file_handle_id = ? AND to_file_handle_id IS NOT NULL",
+ (from_file_handle_id,),
+ ).fetchone()
+ return row[0] if row else None
def _query_migration_batch(
- db_path: str,
- last_id: str,
- last_version: int,
- last_row_id: int,
- last_col_id: int,
- pending_file_handles: Set[str],
- completed_file_handles: Set[str],
+ cursor: sqlite3.Cursor,
+ last_key: MigrationKey,
+ pending_file_handle_ids: Set[str],
+ completed_file_handle_ids: Set[str],
limit: int,
) -> List[Dict[str, Any]]:
"""Query the next batch of items to migrate.
@@ -483,48 +435,35 @@ def _query_migration_batch(
- Backtracking to pick up files with completed file handles that were skipped
Arguments:
- db_path: Path to the SQLite database file.
- last_id: Last processed entity ID.
- last_version: Last processed version.
- last_row_id: Last processed row ID.
- last_col_id: Last processed column ID.
- pending_file_handles: Set of file handles currently being processed.
+ cursor: The cursor object from the connection to the SQLite database.
+ last_key: The last processed MigrationKey.
+ pending_file_handle_ids: Set of file handle IDs currently being processed.
-        completed_file_handles: Set of file handles already completed.
+        completed_file_handle_ids: Set of file handle IDs already completed.
limit: Maximum number of items to return.
Returns:
List of migration entries as dictionaries.
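+
+    Example (illustrative): with `last_key = MigrationKey(id="syn5", ...)` the
+    query pages forward in (id, type, row_id, col_id, version) order while also
+    backtracking to earlier rows whose source file handle was already copied:
+
+            batch = _query_migration_batch(cursor, last_key, pending, completed, limit=500)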
"""
- import sqlite3
-
- if limit <= 0:
- return []
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
-
- file_type = MigrationType.FILE.value
- table_type = MigrationType.TABLE_ATTACHED_FILE.value
- indexed_status = MigrationStatus.INDEXED.value
-
- # Build the IN clauses for file handles
- # We use string formatting for the IN clause since sqlite3 doesn't support array parameters
- pending_in = (
- "('" + "','".join(pending_file_handles) + "')"
- if pending_file_handles
- else "('')"
- )
- completed_in = (
- "('" + "','".join(completed_file_handles) + "')"
- if completed_file_handles
- else "('')"
- )
-
- # Match the original synapseutils query structure exactly
- # This handles:
- # 1. Forward progress: entities after the current position
- # 2. Backtracking: entities before current position that share completed file handles
- query = f"""
+ query_kwargs = {
+ "indexed_status": MigrationStatus.INDEXED.value,
+ "id": last_key.id,
+ "file_type": MigrationType.FILE.value,
+ "table_type": MigrationType.TABLE_ATTACHED_FILE.value,
+ "version": last_key.version,
+ "row_id": last_key.row_id,
+ "col_id": last_key.col_id,
+ "limit": limit,
+ }
+
+ # Build the IN clauses for file handles
+ pending = "('" + "','".join(pending_file_handle_ids) + "')"
+ completed = "('" + "','".join(completed_file_handle_ids) + "')"
+
+ # Query the next batch of items to migrate.
+ # 1. Forward progress: entities after the current position
+ # 2. Backtracking: entities before current position that share completed file handles
+ results = cursor.execute(
+ f"""
SELECT
id,
type,
@@ -541,11 +480,11 @@ def _query_migration_batch(
((id > :id AND type IN (:file_type, :table_type))
OR (id = :id AND type = :file_type AND version IS NOT NULL AND version > :version)
OR (id = :id AND type = :table_type AND (row_id > :row_id OR (row_id = :row_id AND col_id > :col_id))))
- AND from_file_handle_id NOT IN {pending_in}
+ AND from_file_handle_id NOT IN {pending}
) OR
(
id <= :id
- AND from_file_handle_id IN {completed_in}
+ AND from_file_handle_id IN {completed}
)
)
ORDER BY
@@ -555,170 +494,116 @@ def _query_migration_batch(
col_id,
version
LIMIT :limit
- """
-
- params = {
- "indexed_status": indexed_status,
- "id": last_id,
- "file_type": file_type,
- "table_type": table_type,
- "version": last_version,
- "row_id": last_row_id,
- "col_id": last_col_id,
- "limit": limit,
- }
-
- results = cursor.execute(query, params)
+ """, # noqa
+ query_kwargs,
+ )
- batch = []
- for row in results:
- batch.append(
- {
- "id": row[0],
- "type": MigrationType(row[1]),
- "version": row[2],
- "row_id": row[3],
- "col_id": row[4],
- "from_file_handle_id": row[5],
- "file_size": row[6],
- }
- )
- return batch
+ batch = []
+ for row in results:
+ batch.append(
+ {
+ "id": row[0],
+ "type": row[1],
+ "version": row[2],
+ "row_id": row[3],
+ "col_id": row[4],
+ "from_file_handle_id": row[5],
+ "file_size": row[6],
+ }
+ )
+ return batch
-def _update_migration_success(
- db_path: str,
+def _update_migration_database(
+ conn: sqlite3.Connection,
+ cursor: sqlite3.Cursor,
key: MigrationKey,
to_file_handle_id: str,
+ status: MigrationStatus,
+ exception: Optional[Exception] = None,
) -> None:
- """Update a migration entry as successful.
+ """Update a migration database record as successful or errored.
Arguments:
- db_path: Path to the SQLite database file.
+ conn: The connection to the SQLite database.
+ cursor: The cursor object from the connection to the SQLite database.
key: The migration key.
to_file_handle_id: The destination file handle ID.
+ status: The migration status.
+ exception: The exception that occurred.
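+
+    Example (illustrative): for a table attached file key with `version=None`,
+    `row_id=5`, and `col_id=7`, the generated statement ends with
+    `AND version IS NULL AND row_id = ? AND col_id = ?`.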
"""
- import sqlite3
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
+ tb_str = (
+ "".join(
+ traceback.format_exception(
+ type(exception), exception, exception.__traceback__
+ )
+ )
+ if exception
+ else None
+ )
- update_sql = """
- UPDATE migrations SET status = ?, to_file_handle_id = ?
- WHERE id = ? AND type = ?
- """
- params = [
- MigrationStatus.MIGRATED.value,
- to_file_handle_id,
- key.id,
- key.type.value,
- ]
-
- if key.version is not None:
- update_sql += " AND version = ?"
- params.append(key.version)
+ update_sql = """
+ UPDATE migrations SET
+ status = ?,
+ to_file_handle_id = ?,
+ exception = ?
+ WHERE
+ id = ?
+ AND type = ?
+ """
+ update_args = [status, to_file_handle_id, tb_str, key.id, key.type.value]
+ for arg in ("version", "row_id", "col_id"):
+ arg_value = getattr(key, arg)
+ if arg_value is not None:
+ update_sql += "and {} = ?\n".format(arg)
+ update_args.append(arg_value)
else:
- update_sql += " AND version IS NULL"
-
- if key.row_id is not None:
- update_sql += " AND row_id = ?"
- params.append(key.row_id)
-
- if key.col_id is not None:
- update_sql += " AND col_id = ?"
- params.append(key.col_id)
+ update_sql += "and {} is null\n".format(arg)
- cursor.execute(update_sql, tuple(params))
- conn.commit()
+ cursor.execute(update_sql, tuple(update_args))
+ conn.commit()
-def _update_migration_error(
- db_path: str,
- key: MigrationKey,
- exception: Exception,
-) -> None:
- """Update a migration entry with an error.
+def _confirm_migration(
+ cursor: sqlite3.Cursor, dest_storage_location_id: str, force: bool = False
+) -> bool:
+ """Confirm migration with user if in interactive mode.
Arguments:
- db_path: Path to the SQLite database file.
- key: The migration key.
- exception: The exception that occurred.
- """
- import sqlite3
-
- tb_str = "".join(
- traceback.format_exception(type(exception), exception, exception.__traceback__)
- )
-
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
-
- update_sql = """
- UPDATE migrations SET status = ?, exception = ?
- WHERE id = ? AND type = ?
- """
- params = [MigrationStatus.ERRORED.value, tb_str, key.id, key.type.value]
-
- if key.version is not None:
- update_sql += " AND version = ?"
- params.append(key.version)
- else:
- update_sql += " AND version IS NULL"
-
- if key.row_id is not None:
- update_sql += " AND row_id = ?"
- params.append(key.row_id)
-
- if key.col_id is not None:
- update_sql += " AND col_id = ?"
- params.append(key.col_id)
-
- cursor.execute(update_sql, tuple(params))
- conn.commit()
-
-
-def _confirm_migration(
- db_path: str, dest_storage_location_id: str, force: bool
-) -> bool:
- """Confirm migration with user if in interactive mode.
-
- Arguments:
- db_path: Path to the SQLite database file.
- dest_storage_location_id: Destination storage location ID.
- force: Whether to skip confirmation.
+ cursor: The cursor object from the connection to the SQLite database.
+ dest_storage_location_id: Destination storage location ID.
+        force: If running in an interactive shell, migration requires an interactive confirmation.
+ This can be bypassed by using the force=True option. Defaults to False.
Returns:
True if migration should proceed, False otherwise.
"""
- import sqlite3
if force:
return True
- with sqlite3.connect(db_path) as conn:
- cursor = conn.cursor()
- count = cursor.execute(
- "SELECT count(*) FROM migrations WHERE status = ?",
- (MigrationStatus.INDEXED.value,),
- ).fetchone()[0]
-
- if count == 0:
- logger.info("No items for migration.")
- return False
-
- if sys.stdout.isatty():
- user_input = input(
- f"{count} items for migration to {dest_storage_location_id}. Proceed? (y/n)? "
- )
- return user_input.strip().lower() == "y"
- else:
- logger.info(
- "%s items for migration. "
- "force option not used, and console input not available to confirm migration, aborting. "
- "Use the force option or run from an interactive shell to proceed with migration.",
- count,
- )
- return False
+ count = cursor.execute(
+ "SELECT count(*) FROM migrations WHERE status = ?",
+ (MigrationStatus.INDEXED.value,),
+ ).fetchone()[0]
+
+ if count == 0:
+ logger.info("No items for migration.")
+ return False
+
+ if sys.stdout.isatty():
+ user_input = input(
+ f"{count} items for migration to {dest_storage_location_id}. Proceed? (y/n)? "
+ )
+ return user_input.strip().lower() == "y"
+ else:
+ logger.info(
+ "%s items for migration. "
+ "force option not used, and console input not available to confirm migration, aborting. "
+ "Use the force option or run from an interactive shell to proceed with migration.",
+ count,
+ )
+ return False
def _get_part_size(file_size: int) -> int:
@@ -737,120 +622,103 @@ def _get_part_size(file_size: int) -> int:
return max(DEFAULT_PART_SIZE, min_part_size)
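+
+
+# Illustrative arithmetic (assuming an 8 MiB default part size and the 10,000
+# part limit of multipart copies): a 100 GiB file needs parts of at least
+# ~10.24 MiB, so _get_part_size returns that larger minimum rather than the default.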
-# =============================================================================
-# Storage Location Validation
-# =============================================================================
-
-
-async def _verify_storage_location_ownership_async(
- storage_location_id: str,
- *,
- synapse_client: "Synapse",
-) -> None:
- """Verify the user owns the destination storage location.
-
- Arguments:
- storage_location_id: The storage location ID to verify.
- synapse_client: The Synapse client.
-
- Raises:
- ValueError: If the user does not own the storage location.
- """
- try:
- await synapse_client.rest_get_async(f"/storageLocation/{storage_location_id}")
- except Exception as ex:
- raise ValueError(
- f"Unable to verify ownership of storage location {storage_location_id}. "
- f"You must be the creator of the destination storage location. Error: {ex}"
- ) from ex
-
-
-def _include_file_in_migration(
+def _get_file_migration_status(
file_handle: Dict[str, Any],
source_storage_location_ids: List[str],
dest_storage_location_id: str,
) -> Optional[MigrationStatus]:
- """Determine if a file should be included in migration.
-
- Only S3 file handles can be migrated. External URLs and other file handle types
- are skipped.
-
- Arguments:
- file_handle: The file handle metadata.
- source_storage_location_ids: List of source storage locations to filter.
+ """
+ Determine whether a file should be included in the migrations database
+ and return its migration status.
+
+ Only S3 file handles are considered for migration. Other handle types
+ (e.g., external URLs) are ignored.
+
+ A file is included according to the following rules:
+ - If the file is already stored in the destination location, it is included
+ and marked as ALREADY_MIGRATED.
+ - If `source_storage_location_ids` is provided, the file's current storage
+ location must be in that list to be included.
+ - If `source_storage_location_ids` is empty, all files not already at the
+ destination are included.
+
+    Arguments:
+ file_handle: File handle metadata.
+ source_storage_location_ids: Storage location IDs that qualify as
+ migration sources. If empty, all source locations are considered.
dest_storage_location_id: Destination storage location ID.
Returns:
- MigrationStatus if file should be included, None otherwise.
+ MigrationStatus enum (ALREADY_MIGRATED, INDEXED) if the file should be included in the migrations database, or
+ None if the file should not be included in the migrations database.
"""
# Only S3 file handles can be migrated
- if file_handle.get("concreteType") != concrete_types.S3_FILE_HANDLE:
+ if file_handle.concrete_type != concrete_types.S3_FILE_HANDLE:
return None
- from_storage_location_id = str(file_handle.get("storageLocationId", 1))
+ current_storage_location_id = str(file_handle.storage_location_id)
+
+ if current_storage_location_id == dest_storage_location_id:
+ return MigrationStatus.ALREADY_MIGRATED.value
- # Check if file matches the migration criteria:
- # - If source_storage_location_ids is specified, from_storage_location must be in it
- # OR already at the destination
- # - If not specified, include all files not already at destination
if source_storage_location_ids:
- if (
- from_storage_location_id not in source_storage_location_ids
- and from_storage_location_id != dest_storage_location_id
- ):
+ if current_storage_location_id not in source_storage_location_ids:
return None
- # Already at destination - mark as already migrated
- if from_storage_location_id == dest_storage_location_id:
- return MigrationStatus.ALREADY_MIGRATED
-
- return MigrationStatus.INDEXED
+ return MigrationStatus.INDEXED.value
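+
+
+# Illustrative decisions for _get_file_migration_status (hypothetical IDs,
+# destination storage location "999"):
+#   S3 handle already stored in "999"        -> ALREADY_MIGRATED
+#   S3 handle in "111", sources=["111"]      -> INDEXED
+#   S3 handle in "222", sources=["111"]      -> None (excluded)
+#   non-S3 handle (e.g. an external URL)     -> None (excluded)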
# =============================================================================
-# Public API Functions
+# Indexing Functions
# =============================================================================
-
-
async def index_files_for_migration_async(
- entity_id: str,
+ entity: Entity,
dest_storage_location_id: str,
db_path: Optional[str] = None,
*,
- source_storage_location_ids: Optional[List[str]] = None,
+    source_storage_location_ids: Optional[List[str]] = None,
file_version_strategy: str = "new",
include_table_files: bool = False,
continue_on_error: bool = False,
- synapse_client: Optional["Synapse"] = None,
+ synapse_client: Optional[Synapse] = None,
) -> MigrationResult:
"""Index files for migration to a new storage location.
- This is the first step in migrating files to a new storage location.
+    This is the first step in migrating files to a new storage location. This function does not modify the given entity itself; it only updates the migrations and migration_settings tables in the SQLite database.
After indexing, use `migrate_indexed_files_async` to perform the actual migration.
Arguments:
- entity_id: The Synapse entity ID to migrate (Project, Folder, File, or Table).
+ entity: The Synapse entity to migrate (Project, Folder, File, or Table). If it is a container (a Project or Folder), its contents will be recursively indexed.
dest_storage_location_id: The destination storage location ID.
- db_path: Path to create SQLite database. If None, uses temp directory.
- source_storage_location_ids: Optional list of source storage locations to filter.
- file_version_strategy: Strategy for file versions: "new", "all", "latest", "skip".
- include_table_files: Whether to include files attached to tables.
- continue_on_error: Whether to continue on individual errors.
- synapse_client: Optional Synapse client instance.
+ db_path: A path on disk where the SQLite index database will be created. Must be on a volume with enough space for metadata of all indexed contents. If not provided, a temporary directory will be created and the path will be returned in the MigrationResult object.
+        source_storage_location_ids: Optional list of source storage location IDs that will be migrated. If provided, files outside of the listed storage locations will not be indexed for migration. If not provided, all files not already in the destination storage location will be indexed for migration.
+ file_version_strategy: Strategy to migrate file versions: "new", "all", "latest", "skip".
+            - `new`: creates a new version of each file entity in the new storage location, leaving existing versions unchanged
+            - `all`: migrates all existing versions in place to the new storage location
+            - `latest`: migrates only the latest version in place to the new storage location
+            - `skip`: skips migrating file entities; use this, for example, to migrate table attached files in a container while leaving file entities unchanged
+
+        include_table_files: Whether to include files attached to tables. If False (the default), only
+            file entities in the container will be migrated and tables will be left untouched.
+        continue_on_error: Whether errors encountered while indexing an entity are raised
+            immediately or just recorded in the index so that index creation can continue.
+            Defaults to False.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
Returns:
- MigrationResult object for inspecting the index.
- """
- from synapseclient import Synapse
+ A MigrationResult object that can be used to inspect the contents of the index or output the index to a CSV for manual inspection.
+
+    Raises:
+ ValueError: If the file_version_strategy is invalid or if skipping both file entities and table attached files.
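+
+    Example: A minimal, illustrative sketch (assumes `my_project` is a Project
+        instance and `"12345"` is the ID of a storage location you created):
+
+            result = await index_files_for_migration_async(
+                entity=my_project,
+                dest_storage_location_id="12345",
+                file_version_strategy="new",
+            )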
+ """
client = Synapse.get_client(synapse_client=synapse_client)
# Validate parameters
- valid_strategies = {"new", "all", "latest", "skip"}
- if file_version_strategy not in valid_strategies:
+ valid_file_version_strategy = {"new", "all", "latest", "skip"}
+ if file_version_strategy not in valid_file_version_strategy:
raise ValueError(
f"Invalid file_version_strategy: {file_version_strategy}, "
- f"must be one of {valid_strategies}"
+ f"must be one of {valid_file_version_strategy}"
)
if file_version_strategy == "skip" and not include_table_files:
@@ -858,39 +726,38 @@ async def index_files_for_migration_async(
"Skipping both file entities and table attached files, nothing to migrate"
)
- # Convert to strings
- dest_storage_location_id = str(dest_storage_location_id)
- source_storage_location_ids = [str(s) for s in (source_storage_location_ids or [])]
-
# Verify ownership
await _verify_storage_location_ownership_async(
storage_location_id=dest_storage_location_id,
synapse_client=client,
)
+    entity_id = utils.id_of(entity)
+
+    # Normalize the optional filter so downstream helpers receive a concrete list
+    source_storage_location_ids = source_storage_location_ids or []
+
# Create database path if not provided
if db_path is None:
db_path = _get_default_db_path(entity_id)
# Initialize database
- await asyncio.to_thread(
- _initialize_database,
- db_path,
- entity_id,
- dest_storage_location_id,
- source_storage_location_ids,
- file_version_strategy,
- include_table_files,
- )
-
- # Get entity and start indexing
- entity = await client.get_async(entity_id, downloadFile=False)
-
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ _ensure_schema(cursor)
+ _prepare_migration_db(
+ conn=conn,
+ cursor=cursor,
+ db_path=db_path,
+ root_id=entity_id,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ )
try:
await _index_entity_async(
+ conn=conn,
+ cursor=cursor,
entity=entity,
parent_id=None,
- db_path=db_path,
dest_storage_location_id=dest_storage_location_id,
source_storage_location_ids=source_storage_location_ids,
file_version_strategy=file_version_strategy,
@@ -900,76 +767,10 @@ async def index_files_for_migration_async(
)
except IndexingError as ex:
logger.exception(
- "Aborted due to failure to index entity %s of type %s. "
- "Use continue_on_error=True to skip individual failures.",
- ex.entity_id,
- ex.concrete_type,
- )
- raise ex
-
- return MigrationResult(db_path=db_path, synapse_client=client)
-
-
-async def migrate_indexed_files_async(
- db_path: str,
- *,
- create_table_snapshots: bool = True,
- continue_on_error: bool = False,
- force: bool = False,
- max_concurrent_copies: Optional[int] = None,
- synapse_client: Optional["Synapse"] = None,
-) -> Optional[MigrationResult]:
- """Migrate files that have been indexed.
-
- This is the second step in migrating files to a new storage location.
- Files must first be indexed using `index_files_for_migration_async`.
-
- Arguments:
- db_path: Path to SQLite database created by index_files_for_migration_async.
- create_table_snapshots: Whether to create table snapshots before migrating.
- continue_on_error: Whether to continue on individual migration errors.
- force: Whether to skip interactive confirmation.
- max_concurrent_copies: Maximum concurrent file copy operations.
- synapse_client: Optional Synapse client instance.
-
- Returns:
- MigrationResult object or None if migration was aborted.
- """
- from synapseclient import Synapse
-
- client = Synapse.get_client(synapse_client=synapse_client)
-
- # Retrieve settings
- settings = await asyncio.to_thread(_retrieve_index_settings, db_path)
- if settings is None:
- raise ValueError(
- f"Unable to retrieve existing index settings from '{db_path}'. "
- "Either this path does not represent a previously created migration index "
- "or the file is corrupt."
+ f"Aborted due to failure to index entity {ex.entity_id} of type {ex.concrete_type}. "
+ "Use continue_on_error=True to skip individual failures."
)
-
- dest_storage_location_id = settings["dest_storage_location_id"]
-
- # Confirm migration
- confirmed = await asyncio.to_thread(
- _confirm_migration, db_path, dest_storage_location_id, force
- )
- if not confirmed:
- logger.info("Migration aborted.")
- return None
-
- # Determine concurrency
- max_concurrent = max_concurrent_copies or max(client.max_threads // 2, 1)
-
- # Execute migration
- await _execute_migration_async(
- db_path=db_path,
- dest_storage_location_id=dest_storage_location_id,
- create_table_snapshots=create_table_snapshots,
- continue_on_error=continue_on_error,
- max_concurrent=max_concurrent,
- synapse_client=client,
- )
+ raise ex.__cause__
return MigrationResult(db_path=db_path, synapse_client=client)
@@ -977,12 +778,11 @@ async def migrate_indexed_files_async(
# =============================================================================
# Indexing Implementation
# =============================================================================
-
-
async def _index_entity_async(
- entity: Any,
+ conn: sqlite3.Connection,
+ cursor: sqlite3.Cursor,
+ entity: Entity,
parent_id: Optional[str],
- db_path: str,
dest_storage_location_id: str,
source_storage_location_ids: List[str],
file_version_strategy: str,
@@ -991,84 +791,88 @@ async def _index_entity_async(
*,
synapse_client: "Synapse",
) -> None:
- """Recursively index an entity and its children.
+ """Recursively index an entity and its children into migrations database.
Arguments:
+ conn: The connection to the SQLite database.
+ cursor: The cursor object from the connection to the SQLite database.
entity: The Synapse entity object.
- parent_id: The parent entity ID.
- db_path: Path to the SQLite database.
+ parent_id: The parent entity Synapse ID.
dest_storage_location_id: Destination storage location ID.
- source_storage_location_ids: List of source storage locations to filter.
+ source_storage_location_ids: List of source storage locations.
file_version_strategy: Strategy for file versions.
include_table_files: Whether to include table-attached files.
continue_on_error: Whether to continue on errors.
synapse_client: The Synapse client.
"""
entity_id = utils.id_of(entity)
- concrete_type = utils.concrete_type_of(entity)
+    retrieved_entity = await get_entity_type(
+        entity_id=entity_id, synapse_client=synapse_client
+    )
+ concrete_type = retrieved_entity.type
# Check if already indexed
- is_indexed = await asyncio.to_thread(_check_indexed, db_path, entity_id)
- if is_indexed:
- return
-
+ is_indexed = _check_indexed(cursor, entity_id)
try:
- if concrete_type == concrete_types.FILE_ENTITY:
- if file_version_strategy != "skip":
- await _index_file_entity_async(
- entity_id=entity_id,
- parent_id=parent_id,
- db_path=db_path,
- dest_storage_location_id=dest_storage_location_id,
- source_storage_location_ids=source_storage_location_ids,
- file_version_strategy=file_version_strategy,
- synapse_client=synapse_client,
- )
+ if not is_indexed:
+ if concrete_type == concrete_types.FILE_ENTITY:
+ if file_version_strategy != "skip":
+ await _index_file_entity_async(
+ cursor=cursor,
+ entity=entity,
+ parent_id=parent_id,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ synapse_client=synapse_client,
+ )
- elif concrete_type == concrete_types.TABLE_ENTITY:
- if include_table_files:
- await _index_table_entity_async(
+ elif concrete_type == concrete_types.TABLE_ENTITY:
+ if include_table_files:
+ await _index_table_entity_async(
+ cursor=cursor,
+ entity_id=entity_id,
+ parent_id=parent_id,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ synapse_client=synapse_client,
+ )
+
+ elif concrete_type in (
+ concrete_types.FOLDER_ENTITY,
+ concrete_types.PROJECT_ENTITY,
+ ):
+ await _index_container_async(
+ conn=conn,
+ cursor=cursor,
entity_id=entity_id,
parent_id=parent_id,
- db_path=db_path,
dest_storage_location_id=dest_storage_location_id,
source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ continue_on_error=continue_on_error,
synapse_client=synapse_client,
)
-
- elif concrete_type in (
- concrete_types.FOLDER_ENTITY,
- concrete_types.PROJECT_ENTITY,
- ):
- await _index_container_async(
- entity_id=entity_id,
- parent_id=parent_id,
- db_path=db_path,
- concrete_type=concrete_type,
- dest_storage_location_id=dest_storage_location_id,
- source_storage_location_ids=source_storage_location_ids,
- file_version_strategy=file_version_strategy,
- include_table_files=include_table_files,
- continue_on_error=continue_on_error,
- synapse_client=synapse_client,
- )
+ conn.commit()
except IndexingError:
+ # this is a recursive function, we don't need to log the error at every level so just
+ # pass up exceptions of this type that wrap the underlying exception and indicate
+ # that they were already logged
raise
except Exception as ex:
if continue_on_error:
- logger.warning("Error indexing entity %s: %s", entity_id, ex)
- await asyncio.to_thread(
- _record_indexing_error, db_path, entity_id, parent_id, ex
- )
+ logger.warning(f"Error indexing entity {entity_id}: {ex}")
+ tb_str = "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))
+ migration_type = MigrationType.from_concrete_type(concrete_type).value
+ _record_indexing_error(cursor, entity_id, migration_type, parent_id, tb_str)
else:
raise IndexingError(entity_id, concrete_type) from ex
async def _index_file_entity_async(
- entity_id: str,
+ cursor: sqlite3.Cursor,
+ entity: Entity,
parent_id: Optional[str],
- db_path: str,
dest_storage_location_id: str,
source_storage_location_ids: List[str],
file_version_strategy: str,
@@ -1078,87 +882,104 @@ async def _index_file_entity_async(
"""Index a file entity for migration.
Arguments:
- entity_id: The file entity ID.
- parent_id: The parent entity ID.
- db_path: Path to the SQLite database.
+ cursor: The cursor object from the connection to the SQLite database.
+ entity: The Synapse entity object, a File.
+ parent_id: The parent entity Synapse ID.
dest_storage_location_id: Destination storage location ID.
- source_storage_location_ids: List of source storage locations to filter.
+ source_storage_location_ids: List of source storage locations.
file_version_strategy: Strategy for file versions.
synapse_client: The Synapse client.
"""
+ entity_id = utils.id_of(entity)
logger.info("Indexing file entity %s", entity_id)
entity_versions: List[Tuple[Any, Optional[int]]] = []
if file_version_strategy == "new":
- entity = await synapse_client.get_async(entity_id, downloadFile=False)
entity_versions.append((entity, None))
elif file_version_strategy == "all":
- # Get all versions
async for version in _get_version_numbers_async(entity_id, synapse_client):
- entity = await synapse_client.get_async(
- entity_id, version=version, downloadFile=False
+ entity = await get_async(
+ synapse_id=entity_id,
+ file_options=FileOptions(download_file=False),
+ synapse_client=synapse_client,
)
entity_versions.append((entity, version))
elif file_version_strategy == "latest":
- entity = await synapse_client.get_async(entity_id, downloadFile=False)
- entity_versions.append((entity, entity.versionNumber))
+ entity_versions.append((entity, entity.version_number))
+ insert_values = []
for entity, version in entity_versions:
- file_handle = entity._file_handle
- status = _include_file_in_migration(
- file_handle, source_storage_location_ids, dest_storage_location_id
+ status = _get_file_migration_status(
+ entity.file_handle, source_storage_location_ids, dest_storage_location_id
)
if status:
- await asyncio.to_thread(
- _insert_file_migration,
- db_path,
- entity_id,
- version,
- parent_id,
- file_handle["storageLocationId"],
- entity.dataFileHandleId,
- file_handle["contentSize"],
- status,
+ insert_values.append(
+ (
+ entity_id,
+ MigrationType.FILE.value,
+ version,
+ parent_id,
+ entity.file_handle.storage_location_id,
+ entity.data_file_handle_id,
+ entity.file_handle.content_size,
+ status,
+ )
)
+ if insert_values:
+ _insert_file_migration(cursor, insert_values)
-async def _get_version_numbers_async(
+async def _get_table_file_handle_rows_async(
entity_id: str,
+ *,
synapse_client: "Synapse",
-) -> AsyncGenerator[int, None]:
- """Get all version numbers for an entity.
+) -> AsyncGenerator[Tuple[int, int, Dict[str, Any]], None]:
+ """Get the table file handle rows for a given entity.
Arguments:
- entity_id: The entity ID.
- synapse_client: The Synapse client.
+ entity_id: The table entity ID.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
- Yields:
- Version numbers.
+    Yields:
+        Tuples of (row ID, row version, file handles keyed by column ID).
"""
- offset = 0
- limit = 100
+ # Get file handle columns using the async API
+ columns = await get_columns(table_id=entity_id, synapse_client=synapse_client)
+ file_handle_columns = [c for c in columns if c.column_type == "FILEHANDLEID"]
- while True:
- response = await synapse_client.rest_get_async(
- f"/entity/{entity_id}/version?offset={offset}&limit={limit}"
+ if file_handle_columns:
+        # A table may define multiple FILEHANDLEID columns; select all of them
+        file_column_select = _join_column_names(file_handle_columns)
+        results = await query_async(
+            query=f"select {file_column_select} from {entity_id}",
+            include_row_id_and_row_version=True,
+            synapse_client=synapse_client,
)
- results = response.get("results", [])
+ for row in results:
+ file_handles = {}
- for version_info in results:
- yield version_info["versionNumber"]
+ # first two cols are row id and row version, rest are file handle ids from our query
+ row_id, row_version = row[:2]
- if len(results) < limit:
- break
- offset += limit
+ file_handle_ids = row[2:]
+ for i, file_handle_id in enumerate(file_handle_ids):
+ if file_handle_id:
+ col_id = file_handle_columns[i]["id"]
+ file_handle = await get_file_handle_for_download_async(
+ file_handle_id, entity_id, objectType="TableEntity"
+ )["fileHandle"]
+ file_handles[col_id] = file_handle
+
+ yield row_id, row_version, file_handles
async def _index_table_entity_async(
+ cursor: sqlite3.Cursor,
entity_id: str,
parent_id: Optional[str],
- db_path: str,
dest_storage_location_id: str,
source_storage_location_ids: List[str],
*,
@@ -1167,74 +988,49 @@ async def _index_table_entity_async(
"""Index a table entity's file attachments for migration.
Arguments:
- entity_id: The table entity ID.
- parent_id: The parent entity ID.
- db_path: Path to the SQLite database.
+ cursor: The cursor object from the connection to the SQLite database.
+ entity_id: The Synapse ID of the table entity.
+ parent_id: The parent entity Synapse ID.
dest_storage_location_id: Destination storage location ID.
source_storage_location_ids: List of source storage locations to filter.
- synapse_client: The Synapse client.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
"""
logger.info("Indexing table entity %s", entity_id)
-
- # Get file handle columns using the async API
- columns = await get_columns(table_id=entity_id, synapse_client=synapse_client)
- file_handle_columns = [c for c in columns if c.column_type == "FILEHANDLEID"]
-
- if not file_handle_columns:
- return
-
- # Query table for file handles using local helper
- file_column_select = _join_column_names(file_handle_columns)
-
- # tableQuery is still a synchronous method on the Synapse client
- results = await asyncio.to_thread(
- synapse_client.tableQuery,
- f"SELECT {file_column_select} FROM {entity_id}",
- )
-
- for row in results:
- row_id, row_version = row[:2]
- file_handle_ids = row[2:]
-
- for i, file_handle_id in enumerate(file_handle_ids):
- if not file_handle_id:
- continue
-
- col_id = file_handle_columns[i].id
-
- # Get file handle metadata using the async API
- fh_response = await get_file_handle_for_download_async(
- file_handle_id=str(file_handle_id),
- synapse_id=entity_id,
- entity_type="TableEntity",
- synapse_client=synapse_client,
- )
- file_handle = fh_response["fileHandle"]
-
- status = _include_file_in_migration(
+ insert_values = []
+    async for row_id, row_version, file_handles in _get_table_file_handle_rows_async(
+ entity_id=entity_id, synapse_client=synapse_client
+ ):
+ for col_id, file_handle in file_handles.items():
+ status = _get_file_migration_status(
file_handle, source_storage_location_ids, dest_storage_location_id
)
if status:
- await asyncio.to_thread(
- _insert_table_file_migration,
- db_path,
- entity_id,
- row_id,
- int(col_id),
- row_version,
- parent_id,
- file_handle["storageLocationId"],
- file_handle_id,
- file_handle["contentSize"],
- status,
+ insert_values.append(
+ (
+ entity_id,
+ MigrationType.TABLE_ATTACHED_FILE.value,
+                    row_id,
+                    col_id,
+                    row_version,
+                    parent_id,
+ file_handle.storage_location_id,
+ file_handle.id,
+ file_handle.content_size,
+ status,
+ )
)
+        if len(insert_values) >= BATCH_SIZE:
+ _insert_table_file_migration(cursor, insert_values)
+ insert_values.clear()
+ if insert_values:
+ _insert_table_file_migration(cursor, insert_values)
async def _index_container_async(
+ conn: sqlite3.Connection,
+ cursor: sqlite3.Cursor,
entity_id: str,
parent_id: Optional[str],
- db_path: str,
- concrete_type: str,
dest_storage_location_id: str,
source_storage_location_ids: List[str],
file_version_strategy: str,
@@ -1246,18 +1042,22 @@ async def _index_container_async(
"""Index a container (Project or Folder) and its children.
Arguments:
- entity_id: The container entity ID.
- parent_id: The parent entity ID.
- db_path: Path to the SQLite database.
- concrete_type: The concrete type of the container.
+ conn: The connection to the SQLite database.
+ cursor: The cursor object from the connection to the SQLite database.
+ entity_id: The Synapse ID of the entity, a Project or Folder.
+ parent_id: The Synapse ID of the parent entity.
dest_storage_location_id: Destination storage location ID.
source_storage_location_ids: List of source storage locations to filter.
file_version_strategy: Strategy for file versions.
include_table_files: Whether to include table-attached files.
continue_on_error: Whether to continue on errors.
- synapse_client: The Synapse client.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
"""
- logger.info("Indexing container %s", entity_id)
+    retrieved_entity = await get_entity_type(
+        entity_id=entity_id, synapse_client=synapse_client
+    )
+ concrete_type = retrieved_entity.type
+ logger.info(
+ f'Indexing {concrete_type[concrete_type.rindex(".") + 1 :]} {entity_id}'
+ )
# Determine included types
include_types = []
@@ -1275,18 +1075,19 @@ async def _index_container_async(
):
children.append(child)
- # Use bounded concurrency for indexing children
- semaphore = asyncio.Semaphore(10)
+ semaphore = asyncio.Semaphore(MAX_CONCURRENT_FILE_COPIES)
async def index_child(child: Dict[str, Any]) -> None:
async with semaphore:
- child_entity = await synapse_client.get_async(
- child["id"], downloadFile=False
+            child_entity = await get_async(
+                synapse_id=child["id"],
+                file_options=FileOptions(download_file=False),
+                synapse_client=synapse_client,
)
+
await _index_entity_async(
+ conn=conn,
+ cursor=cursor,
entity=child_entity,
parent_id=entity_id,
- db_path=db_path,
dest_storage_location_id=dest_storage_location_id,
source_storage_location_ids=source_storage_location_ids,
file_version_strategy=file_version_strategy,
@@ -1302,185 +1103,16 @@ async def index_child(child: Dict[str, Any]) -> None:
# Mark container as indexed
migration_type = (
- MigrationType.PROJECT
+ MigrationType.PROJECT.value
if concrete_type == concrete_types.PROJECT_ENTITY
- else MigrationType.FOLDER
- )
- await asyncio.to_thread(
- _mark_container_indexed, db_path, entity_id, parent_id, migration_type
+ else MigrationType.FOLDER.value
)
+ _mark_container_indexed(cursor, entity_id, migration_type, parent_id)
# =============================================================================
-# Migration Execution
+# Migration Functions
# =============================================================================
-
-
-async def _execute_migration_async(
- db_path: str,
- dest_storage_location_id: str,
- create_table_snapshots: bool,
- continue_on_error: bool,
- max_concurrent: int,
- *,
- synapse_client: "Synapse",
-) -> None:
- """Execute the actual file migration.
-
- Arguments:
- db_path: Path to the SQLite database.
- dest_storage_location_id: Destination storage location ID.
- create_table_snapshots: Whether to create table snapshots.
- continue_on_error: Whether to continue on errors.
- max_concurrent: Maximum concurrent operations.
- synapse_client: The Synapse client.
- """
- pending_file_handles: Set[str] = set()
- completed_file_handles: Set[str] = set()
- pending_keys: Set[MigrationKey] = set()
- table_snapshots_created: Set[str] = set()
-
- semaphore = asyncio.Semaphore(max_concurrent)
- active_tasks: Set[asyncio.Task] = set()
-
- last_id = ""
- last_version = -1
- last_row_id = -1
- last_col_id = -1
-
- while True:
- # Query next batch
- batch = await asyncio.to_thread(
- _query_migration_batch,
- db_path,
- last_id,
- last_version,
- last_row_id,
- last_col_id,
- pending_file_handles,
- completed_file_handles,
- min(BATCH_SIZE, max_concurrent - len(active_tasks)),
- )
-
- if not batch and not active_tasks:
- break
-
- # Process batch items
- for item in batch:
- key = MigrationKey(
- id=item["id"],
- type=item["type"],
- version=item["version"],
- row_id=item["row_id"],
- col_id=item["col_id"],
- )
-
- if key in pending_keys:
- continue
-
- pending_keys.add(key)
- from_file_handle_id = item["from_file_handle_id"]
-
- # Check for existing copy
- to_file_handle_id = await asyncio.to_thread(
- _check_file_handle_exists, db_path, from_file_handle_id
- )
-
- if not to_file_handle_id:
- pending_file_handles.add(from_file_handle_id)
-
- # Create table snapshot if needed using the async API
- if (
- item["type"] == MigrationType.TABLE_ATTACHED_FILE
- and create_table_snapshots
- and item["id"] not in table_snapshots_created
- ):
- await create_table_snapshot(
- table_id=item["id"],
- synapse_client=synapse_client,
- )
- table_snapshots_created.add(item["id"])
-
- # Create migration task
- task = asyncio.create_task(
- _migrate_item_async(
- key=key,
- from_file_handle_id=from_file_handle_id,
- to_file_handle_id=to_file_handle_id,
- file_size=item["file_size"] or 0,
- dest_storage_location_id=dest_storage_location_id,
- semaphore=semaphore,
- synapse_client=synapse_client,
- )
- )
- active_tasks.add(task)
-
- # Update tracking for next batch
- last_id = item["id"]
- last_version = item["version"] if item["version"] is not None else -1
- last_row_id = item["row_id"] if item["row_id"] is not None else -1
- last_col_id = item["col_id"] if item["col_id"] is not None else -1
-
- # Wait for tasks if at capacity or end of batch
- if active_tasks and (
- len(active_tasks) >= max_concurrent or len(batch) < BATCH_SIZE
- ):
- done, active_tasks = await asyncio.wait(
- active_tasks,
- return_when=asyncio.FIRST_COMPLETED,
- )
-
- for completed_task in done:
- try:
- result = completed_task.result()
- key = result["key"]
- from_fh_id = result["from_file_handle_id"]
- to_fh_id = result["to_file_handle_id"]
-
- # Update database
- await asyncio.to_thread(
- _update_migration_success, db_path, key, to_fh_id
- )
-
- completed_file_handles.add(from_fh_id)
- pending_file_handles.discard(from_fh_id)
- pending_keys.discard(key)
-
- except Exception as ex:
- if hasattr(ex, "key"):
- key = ex.key
- await asyncio.to_thread(
- _update_migration_error, db_path, key, ex.__cause__ or ex
- )
- pending_keys.discard(key)
-
- if not continue_on_error:
- # Cancel remaining tasks
- for task in active_tasks:
- task.cancel()
- raise
-
- # Wait for any remaining tasks
- if active_tasks:
- done, _ = await asyncio.wait(active_tasks)
- for completed_task in done:
- try:
- result = completed_task.result()
- await asyncio.to_thread(
- _update_migration_success,
- db_path,
- result["key"],
- result["to_file_handle_id"],
- )
- except Exception as ex:
- if hasattr(ex, "key"):
- await asyncio.to_thread(
- _update_migration_error, db_path, ex.key, ex.__cause__ or ex
- )
- if not continue_on_error:
- raise
-
-
async def _migrate_item_async(
key: MigrationKey,
from_file_handle_id: str,
@@ -1495,19 +1127,19 @@ async def _migrate_item_async(
Arguments:
key: The migration key.
- from_file_handle_id: Source file handle ID.
- to_file_handle_id: Destination file handle ID (if already copied).
+ from_file_handle_id: The source file handle ID.
+ to_file_handle_id: The destination file handle ID (if already copied).
file_size: File size in bytes.
- dest_storage_location_id: Destination storage location ID.
- semaphore: Concurrency semaphore.
- synapse_client: The Synapse client.
+ dest_storage_location_id: The destination storage location ID.
+ semaphore: The concurrency semaphore.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
Returns:
- Dictionary with key, from_file_handle_id, to_file_handle_id.
+ Dictionary with the key, from_file_handle_id, and to_file_handle_id.
"""
async with semaphore:
try:
- # Copy file handle if needed
+ # copy to a new file handle if we haven't already
if not to_file_handle_id:
source_association = {
"fileHandleId": from_file_handle_id,
@@ -1519,12 +1151,10 @@ async def _migrate_item_async(
),
}
- # Use thread for multipart_copy (it uses threading internally)
- to_file_handle_id = await asyncio.to_thread(
- multipart_copy,
+ to_file_handle_id = await multipart_copy_async(
synapse_client,
source_association,
- dest_storage_location_id,
+ storage_location_id=dest_storage_location_id,
part_size=_get_part_size(file_size),
)
@@ -1537,7 +1167,7 @@ async def _migrate_item_async(
synapse_client=synapse_client,
)
else:
- await _update_file_version_async(
+ await _migrate_file_version_async(
entity_id=key.id,
version=key.version,
from_file_handle_id=from_file_handle_id,
@@ -1545,10 +1175,8 @@ async def _migrate_item_async(
synapse_client=synapse_client,
)
elif key.type == MigrationType.TABLE_ATTACHED_FILE:
- await _update_table_file_async(
- entity_id=key.id,
- row_id=key.row_id,
- col_id=key.col_id,
+ await _migrate_table_attached_file_async(
+ key=key,
to_file_handle_id=to_file_handle_id,
synapse_client=synapse_client,
)
@@ -1560,9 +1188,9 @@ async def _migrate_item_async(
}
except Exception as ex:
- error = MigrationError(key, from_file_handle_id, to_file_handle_id)
- error.__cause__ = ex
- raise error
+ raise MigrationError(
+ key, from_file_handle_id, to_file_handle_id, cause=ex
+ ) from ex
async def _create_new_file_version_async(
@@ -1578,12 +1206,19 @@ async def _create_new_file_version_async(
to_file_handle_id: The new file handle ID.
synapse_client: The Synapse client.
"""
- entity = await synapse_client.get_async(entity_id, downloadFile=False)
+ client = Synapse.get_client(synapse_client=synapse_client)
+ client.logger.info("Creating new version for file entity %s", entity_id)
+
+ entity = await get_async(
+ synapse_id=entity_id,
+ file_options=FileOptions(download_file=False),
+ synapse_client=synapse_client,
+ )
entity.dataFileHandleId = to_file_handle_id
- await synapse_client.store_async(entity)
+ await entity.store_async()
-async def _update_file_version_async(
+async def _migrate_file_version_async(
entity_id: str,
version: int,
from_file_handle_id: str,
@@ -1591,7 +1226,7 @@ async def _update_file_version_async(
*,
synapse_client: "Synapse",
) -> None:
- """Update an existing file version's file handle.
+ """Migrate/update an existing file version with a new file handle.
Arguments:
entity_id: The file entity ID.
@@ -1600,7 +1235,12 @@ async def _update_file_version_async(
to_file_handle_id: The new file handle ID.
synapse_client: The Synapse client.
"""
- await synapse_client.rest_put_async(
+ client = Synapse.get_client(synapse_client=synapse_client)
+ client.logger.info(
+ "Updating file handle for file entity %s version %s", entity_id, version
+ )
+
+ await client.rest_put_async(
f"/entity/{entity_id}/version/{version}/filehandle",
body=json.dumps(
{
@@ -1611,40 +1251,269 @@ async def _update_file_version_async(
)
-async def _update_table_file_async(
- entity_id: str,
- row_id: int,
- col_id: int,
+async def _migrate_table_attached_file_async(
+ key: MigrationKey,
to_file_handle_id: str,
*,
synapse_client: "Synapse",
) -> None:
- """Update a table cell with a new file handle.
+ """Migrate/update a table attached file with a new file handle.
Arguments:
- entity_id: The table entity ID.
- row_id: The row ID.
- col_id: The column ID.
+ key: The migration key.
to_file_handle_id: The new file handle ID.
- synapse_client: The Synapse client.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
"""
- # Create the partial row update using new OOP models
partial_row = PartialRow(
- row_id=str(row_id),
- values=[{"key": str(col_id), "value": to_file_handle_id}],
+ row_id=str(key.row_id),
+ values=[{str(key.col_id): to_file_handle_id}],
)
partial_row_set = PartialRowSet(
- table_id=entity_id,
+ table_id=key.id,
rows=[partial_row],
)
appendable_request = AppendableRowSetRequest(
- entity_id=entity_id,
+ entity_id=key.id,
to_append=partial_row_set,
)
-
- # Execute the update using TableUpdateTransaction
transaction = TableUpdateTransaction(
- entity_id=entity_id,
+ entity_id=key.id,
changes=[appendable_request],
)
await transaction.send_job_and_wait_async(synapse_client=synapse_client)
+
+
+async def track_migration_results_async(
+ conn: sqlite3.Connection,
+ cursor: sqlite3.Cursor,
+ active_tasks: Set[asyncio.Task],
+ pending_file_handles: Set[str],
+ completed_file_handles: Set[str],
+ pending_keys: Set[MigrationKey],
+    return_when: str,
+ continue_on_error: bool,
+) -> None:
+ """Track the results of the migration tasks.
+
+ Arguments:
+        conn: The connection to the SQLite database.
+        cursor: The cursor object from the connection to the SQLite database.
+        active_tasks: The set of active migration tasks to wait on.
+        pending_file_handles: The set of file handle IDs whose copies are still in flight.
+        completed_file_handles: The set of file handle IDs whose copies have completed.
+        pending_keys: The set of migration keys currently being processed.
+        return_when: The condition passed to `asyncio.wait`, e.g.
+            `asyncio.FIRST_COMPLETED` or `asyncio.ALL_COMPLETED`.
+        continue_on_error: Whether to continue on individual migration errors.
+
+ Returns:
+ None
+ """
+    done, still_pending = await asyncio.wait(
+        active_tasks,
+        return_when=return_when,
+    )
+    # asyncio.wait returns new sets, so mutate the caller's set in place to
+    # ensure the scheduling loop sees completed tasks removed.
+    active_tasks.clear()
+    active_tasks.update(still_pending)
+ for completed_task in done:
+ to_file_handle_id = None
+ ex = None
+ try:
+ result = completed_task.result()
+ key = result["key"]
+ from_file_handle_id = result["from_file_handle_id"]
+ to_file_handle_id = result["to_file_handle_id"]
+ status = MigrationStatus.MIGRATED.value
+ completed_file_handles.add(from_file_handle_id)
+
+ except MigrationError as migration_error:
+ key = migration_error.key
+ from_file_handle_id = migration_error.from_file_handle_id
+ ex = migration_error.__cause__
+ status = MigrationStatus.ERRORED.value
+ completed_file_handles.add(from_file_handle_id)
+
+ _update_migration_database(conn, cursor, key, to_file_handle_id, status, ex)
+ pending_keys.discard(key)
+ pending_file_handles.discard(from_file_handle_id)
+
+        if not continue_on_error and ex:
+            # Cancel any tasks still in flight before surfacing the error.
+            for task in active_tasks:
+                task.cancel()
+            raise ex from None
+
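
The helper above centralizes the wait-and-drain pattern used by the scheduler loop. Below is a minimal, self-contained sketch of that same pattern with toy tasks (no Synapse calls; all names here are illustrative only):

```python
import asyncio

async def work(i: int) -> int:
    # Stand-in for a single file migration task.
    await asyncio.sleep(0.01 * i)
    return i

async def main() -> None:
    active = {asyncio.create_task(work(i)) for i in range(5)}
    results = []
    while active:
        # Wake as soon as any one task finishes, as the migration loop does.
        done, pending = await asyncio.wait(active, return_when=asyncio.FIRST_COMPLETED)
        # Mutate the tracking set in place, mirroring the helper above.
        active.clear()
        active.update(pending)
        for task in done:
            results.append(task.result())
    print(sorted(results))  # [0, 1, 2, 3, 4]

asyncio.run(main())
```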
+
+# =============================================================================
+# Migration Implementation
+# =============================================================================
+async def migrate_indexed_files_async(
+ db_path: str,
+ *,
+ create_table_snapshots: bool = True,
+ continue_on_error: bool = False,
+ force: bool = False,
+ synapse_client: Optional["Synapse"] = None,
+) -> Optional[MigrationResult]:
+ """Migrate files that have been indexed.
+
+ This is the second step in migrating files to a new storage location.
+ Files must first be indexed using `index_files_for_migration_async`.
+
+ Arguments:
+ db_path: Path to SQLite database created by index_files_for_migration_async.
+ create_table_snapshots: Whether to create table snapshots before migrating. Defaults to True.
+ continue_on_error: Whether to continue on individual migration errors. Defaults to False.
+        force: If running in an interactive shell, migration requires an interactive
+            confirmation. This can be bypassed by passing `force=True`. Defaults to False.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
+
+ Returns:
+ MigrationResult object or None if migration was aborted.
+ """
+ client = Synapse.get_client(synapse_client=synapse_client)
+
+ # Retrieve settings
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ _ensure_schema(cursor)
+ existing_settings = _retrieve_index_settings(cursor)
+ if existing_settings is None:
+ raise ValueError(
+ f"Unable to retrieve existing index settings from '{db_path}'. "
+ "Either this path does not represent a previously created migration index "
+ "or the file is corrupt."
+ )
+ dest_storage_location_id = existing_settings.dest_storage_location_id
+
+ # Confirm migration
+ confirmed = _confirm_migration(cursor, dest_storage_location_id, force)
+ if not confirmed:
+ logger.info("Migration aborted.")
+ return None
+
+ # Execute migration
+ await _execute_migration_async(
+ conn=conn,
+ cursor=cursor,
+ dest_storage_location_id=dest_storage_location_id,
+ create_table_snapshots=create_table_snapshots,
+ continue_on_error=continue_on_error,
+ synapse_client=client,
+ )
+ return MigrationResult(db_path=db_path, synapse_client=client)
+
+
+async def _execute_migration_async(
+ conn: sqlite3.Connection,
+ cursor: sqlite3.Cursor,
+ dest_storage_location_id: str,
+ create_table_snapshots: bool,
+ continue_on_error: bool,
+ *,
+ synapse_client: "Synapse",
+) -> None:
+ """Execute the actual file migration.
+
+ Arguments:
+ conn: The connection to the SQLite database.
+ cursor: The cursor object from the connection to the SQLite database.
+ dest_storage_location_id: Destination storage location ID.
+ create_table_snapshots: Whether to create table snapshots.
+ continue_on_error: Whether to continue on errors.
+ synapse_client: The Synapse client.
+ """
+ pending_file_handles: Set[str] = set()
+ completed_file_handles: Set[str] = set()
+ pending_keys: Set[MigrationKey] = set()
+
+ semaphore = asyncio.Semaphore(MAX_CONCURRENT_FILE_COPIES)
+ active_tasks: Set[asyncio.Task] = set()
+
+ # Initialize last key to an empty key so the first iteration can proceed.
+ key = MigrationKey(id="", type=None, row_id=-1, col_id=-1, version=-1)
+ while True:
+ # Query next batch
+ batch = _query_migration_batch(
+ cursor,
+ key,
+ pending_file_handles,
+ completed_file_handles,
+ min(BATCH_SIZE, MAX_CONCURRENT_FILE_COPIES - len(active_tasks)),
+ )
+ row_count = 0
+ for item in batch:
+ row_count += 1
+ last_key = key
+ key = MigrationKey(
+ id=item["id"],
+ type=MigrationType(item["type"]),
+ version=item["version"],
+ row_id=item["row_id"],
+ col_id=item["col_id"],
+ )
+ from_file_handle_id = item["from_file_handle_id"]
+ if key in pending_keys or from_file_handle_id in pending_file_handles:
+                # Skip this record if it is already being migrated, or if it shares
+                # a file handle with a record that is being migrated. A record that
+                # shares a file handle will be picked up later, once that file
+                # handle's copy has completed.
+ continue
+
+ pending_keys.add(key)
+
+ # Check for existing copy
+ to_file_handle_id = _check_file_handle_exists(cursor, from_file_handle_id)
+
+ if not to_file_handle_id:
+ pending_file_handles.add(from_file_handle_id)
+
+ # Create table snapshot if needed using the async API
+ if (
+                key.type == MigrationType.TABLE_ATTACHED_FILE
+ and create_table_snapshots
+ and last_key.id != key.id
+ ):
+ await Table(id=key.id).snapshot_async(synapse_client=synapse_client)
+
+ # Create migration task
+ task = asyncio.create_task(
+ _migrate_item_async(
+ key=key,
+ from_file_handle_id=from_file_handle_id,
+ to_file_handle_id=to_file_handle_id,
+ file_size=item["file_size"] or 0,
+ dest_storage_location_id=dest_storage_location_id,
+ semaphore=semaphore,
+ synapse_client=synapse_client,
+ )
+ )
+ active_tasks.add(task)
+
+        if row_count == 0 and not pending_file_handles:
+            # We've run out of migratable SQLite rows and have nothing else to
+            # submit, so break out and wait for all remaining tasks to conclude.
+            break
+ break
+
+ # Wait for tasks if at capacity or end of batch
+ if len(active_tasks) >= MAX_CONCURRENT_FILE_COPIES or len(batch) < BATCH_SIZE:
+ await track_migration_results_async(
+ conn,
+ cursor,
+ active_tasks,
+ pending_file_handles,
+ completed_file_handles,
+ pending_keys,
+ asyncio.FIRST_COMPLETED,
+ continue_on_error,
+ )
+
+ # Wait for any remaining tasks
+ if active_tasks:
+ await track_migration_results_async(
+ conn,
+ cursor,
+ active_tasks,
+ pending_file_handles,
+ completed_file_handles,
+ pending_keys,
+ asyncio.ALL_COMPLETED,
+ continue_on_error,
+ )
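
Taken together, here is a hedged sketch of driving the two async entry points from this patch. The import path follows the `synapseclient.models.services` exports shown later in this series; the argument order for `index_files_for_migration_async` mirrors the legacy `synapseutils` helper and is an assumption, as are the entity ID, storage location ID, and paths:

```python
import asyncio

from synapseclient import Synapse
from synapseclient.models.services import (
    index_files_for_migration_async,
    migrate_indexed_files_async,
)

async def main() -> None:
    Synapse().login()  # cached instance is picked up by the calls below

    db_path = "/tmp/migration_index.db"

    # Phase 1: build the SQLite index of files to move.
    await index_files_for_migration_async("syn123", "98765", db_path)

    # Phase 2: perform the migration; force=True skips the interactive prompt.
    result = await migrate_indexed_files_async(db_path, force=True)
    if result:
        result.as_csv("/tmp/migration_report.csv")

asyncio.run(main())
```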
From 9574e2a11cc588847a15fc8230afe56e316b741c Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 21:01:32 -0700
Subject: [PATCH 22/31] remove MigrationEntry
---
synapseclient/models/services/__init__.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/synapseclient/models/services/__init__.py b/synapseclient/models/services/__init__.py
index fea05d199..5ff746bab 100644
--- a/synapseclient/models/services/__init__.py
+++ b/synapseclient/models/services/__init__.py
@@ -3,7 +3,6 @@
migrate_indexed_files_async,
)
from synapseclient.models.services.migration_types import (
- MigrationEntry,
MigrationError,
MigrationKey,
MigrationResult,
@@ -29,7 +28,6 @@
"MigrationStatus",
"MigrationType",
"MigrationKey",
- "MigrationEntry",
"MigrationSettings",
"MigrationError",
]
From fcd865968ff3f46902c868ec41f70601648b2cb7 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 21:16:41 -0700
Subject: [PATCH 23/31] reformat
---
.../models/services/migration_types.py | 128 ++++++++++--------
1 file changed, 74 insertions(+), 54 deletions(-)
diff --git a/synapseclient/models/services/migration_types.py b/synapseclient/models/services/migration_types.py
index a20cc008d..c53e423ab 100644
--- a/synapseclient/models/services/migration_types.py
+++ b/synapseclient/models/services/migration_types.py
@@ -6,7 +6,7 @@
import asyncio
import csv
-from dataclasses import dataclass, field
+from dataclasses import dataclass, fields
from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional
@@ -17,7 +17,7 @@
class MigrationStatus(Enum):
- """Status of a migration entry in the tracking database."""
+ """Internal enum used by the SQLite database to track the state of entities during indexing and migration."""
INDEXED = 1
"""The file has been indexed and is ready to be migrated."""
@@ -26,33 +26,36 @@ class MigrationStatus(Enum):
"""The file has been successfully migrated to the new storage location."""
ALREADY_MIGRATED = 3
- """The file was already at the destination storage location."""
+ """The file was already at the destination storage location and no migration is needed."""
ERRORED = 4
- """An error occurred during indexing or migration."""
+ """An error occurred during indexing or migration for this entity."""
class MigrationType(Enum):
- """Type of entity being tracked in the migration database."""
+ """Type of entity being tracked in the migration database.
+ Container types (projects and folders) are only used during the indexing phase.
+    We record the containers we've indexed so we don't reindex them on a subsequent
+    run using the same db file (or reindex them after an indexing dry run)."""
PROJECT = 1
- """A project container (used for tracking indexed containers)."""
+ """A project entity."""
FOLDER = 2
- """A folder container (used for tracking indexed containers)."""
+ """A folder entity."""
FILE = 3
"""A file entity."""
TABLE_ATTACHED_FILE = 4
- """A file attached to a table column."""
+ """A file handle that is attached to a table column."""
@classmethod
def from_concrete_type(cls, concrete_type: str) -> "MigrationType":
"""Convert a Synapse concrete type string to a MigrationType.
Arguments:
- concrete_type: The concrete type string from Synapse API.
+ concrete_type: The concrete type of the entity.
Returns:
The corresponding MigrationType enum value.
@@ -74,12 +77,12 @@ def from_concrete_type(cls, concrete_type: str) -> "MigrationType":
@dataclass
class MigrationKey:
- """Unique identifier for a migration entry in the tracking database.
+ """Unique identifier for a entry in the migrations database.
Attributes:
id: The Synapse entity ID.
- type: The type of entity being migrated.
- version: The file version number (None for new versions or containers).
+        type: The migration type of the entity being migrated.
+        version: The file version number (None for new versions or containers).
row_id: The table row ID (for table attached files).
col_id: The table column ID (for table attached files).
"""
@@ -105,49 +108,76 @@ def __eq__(self, other: object) -> bool:
)
-@dataclass
-class MigrationEntry:
- """A single migration entry with full details.
-
- Attributes:
- key: The unique identifier for this migration entry.
- parent_id: The parent entity ID.
- from_storage_location_id: The original storage location ID.
- from_file_handle_id: The original file handle ID.
- to_file_handle_id: The new file handle ID after migration.
- file_size: The file size in bytes.
- status: The current migration status.
- exception: Stack trace if an error occurred.
- """
-
- key: MigrationKey
- parent_id: Optional[str] = None
- from_storage_location_id: Optional[int] = None
- from_file_handle_id: Optional[str] = None
- to_file_handle_id: Optional[str] = None
- file_size: Optional[int] = None
- status: MigrationStatus = MigrationStatus.INDEXED
- exception: Optional[str] = None
-
-
@dataclass
class MigrationSettings:
"""Settings for a migration index stored in the database.
Attributes:
root_id: The root entity ID being migrated.
dest_storage_location_id: The destination storage location ID.
- source_storage_location_ids: List of source storage location IDs to filter.
+        source_storage_location_ids: List of storage location IDs that will be migrated.
file_version_strategy: Strategy for handling file versions.
include_table_files: Whether to include files attached to tables.
"""
root_id: str
dest_storage_location_id: str
- source_storage_location_ids: List[str] = field(default_factory=list)
+    source_storage_location_ids: Optional[List[str]] = None
file_version_strategy: str = "new"
include_table_files: bool = False
+ def to_dict(self) -> Dict[str, Any]:
+ """Return a dict suitable for JSON serialization in the database."""
+ return {
+ "root_id": self.root_id,
+ "dest_storage_location_id": self.dest_storage_location_id,
+ "source_storage_location_ids": self.source_storage_location_ids,
+ "file_version_strategy": self.file_version_strategy,
+ "include_table_files": 1 if self.include_table_files else 0,
+ }
+
+ @classmethod
+ def from_dict(cls, d: Dict[str, Any]) -> "MigrationSettings":
+ """Build MigrationSettings from a dict (e.g. from JSON in the database)."""
+ include = d.get("include_table_files", False)
+ if isinstance(include, int):
+ include = bool(include)
+ return cls(
+ root_id=d["root_id"],
+ dest_storage_location_id=d["dest_storage_location_id"],
+ source_storage_location_ids=d.get("source_storage_location_ids") or [],
+ file_version_strategy=d.get("file_version_strategy", "new"),
+ include_table_files=include,
+ )
+
+ def verify_migration_settings(
+ self, existing_settings: "MigrationSettings", db_path: str
+ ) -> None:
+ """Raise ValueError if the migration settings do not match the existing settings"""
+ # compare all fields
+ for field in fields(self):
+ if getattr(self, field.name) != getattr(existing_settings, field.name):
+                # We can't resume indexing with an existing index file that was
+                # created with different settings.
+                raise ValueError(
+                    "Index parameter does not match the setting recorded in the existing index file. "
+                    "To change the index settings, start over by deleting the file or using a different path. "
+                    f"Expected {field.name}={getattr(existing_settings, field.name)}, "
+                    f"found {getattr(self, field.name)} in index file {db_path}."
+                )
+
+
+class IndexingError(Exception):
+ """Error during an indexing operation.
+
+ Attributes:
+ entity_id: The entity ID that failed to index.
+ concrete_type: The concrete type of the entity.
+ """
+
+    def __init__(self, entity_id: str, concrete_type: str):
+        self.entity_id = entity_id
+        self.concrete_type = concrete_type
+        super().__init__(f"Indexing failed for {entity_id} ({concrete_type})")
+
@dataclass
class MigrationResult:
@@ -350,22 +380,12 @@ def __init__(
key: MigrationKey,
from_file_handle_id: str,
to_file_handle_id: Optional[str] = None,
+ cause: Optional[Exception] = None,
):
self.key = key
self.from_file_handle_id = from_file_handle_id
self.to_file_handle_id = to_file_handle_id
- super().__init__(f"Migration failed for {key.id}")
-
-
-class IndexingError(Exception):
- """Error during an indexing operation.
-
- Attributes:
- entity_id: The entity ID that failed to index.
- concrete_type: The concrete type of the entity.
- """
-
- def __init__(self, entity_id: str, concrete_type: str):
- self.entity_id = entity_id
- self.concrete_type = concrete_type
- super().__init__(f"Indexing failed for {entity_id} ({concrete_type})")
+ message = f"Migration failed for {key.id}"
+ if cause is not None:
+ message += f": {cause}"
+ super().__init__(message)
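
For illustration, a small round-trip through the new `MigrationSettings` helpers, plus the mismatch check; all values here are made up:

```python
from synapseclient.models.services import MigrationSettings

settings = MigrationSettings(
    root_id="syn123",
    dest_storage_location_id="98765",
    source_storage_location_ids=["11111"],
)

# Round-trip through the JSON-friendly dict persisted in the index database.
restored = MigrationSettings.from_dict(settings.to_dict())
assert restored == settings

# A settings mismatch raises, preventing resumption of an index built with
# different parameters.
changed = MigrationSettings(
    root_id="syn999",
    dest_storage_location_id="98765",
    source_storage_location_ids=["11111"],
)
try:
    changed.verify_migration_settings(settings, db_path="/tmp/index.db")
except ValueError as err:
    print(err)
```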
From b3571b87c939f7efb33d580de4e3f82de968bec1 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 21:25:37 -0700
Subject: [PATCH 24/31] add enum_coercion mixin
---
synapseclient/models/mixins/enum_coercion.py | 32 ++++++++++++++++++++
1 file changed, 32 insertions(+)
create mode 100644 synapseclient/models/mixins/enum_coercion.py
diff --git a/synapseclient/models/mixins/enum_coercion.py b/synapseclient/models/mixins/enum_coercion.py
new file mode 100644
index 000000000..5eef2f802
--- /dev/null
+++ b/synapseclient/models/mixins/enum_coercion.py
@@ -0,0 +1,32 @@
+"""Mixin for automatic enum coercion in dataclasses."""
+
+from typing import Any, ClassVar, Dict
+
+
+class EnumCoercionMixin:
+ """Mixin for dataclasses that auto-coerces string values to enum types.
+
+ Subclasses declare a class-level ``_ENUM_FIELDS`` dict mapping field names
+ to their enum classes. On every ``__setattr__`` call the mixin checks
+ whether the target field is listed and, if the incoming value is not
+ already the correct enum type, coerces it via the enum constructor.
+
+ Example::
+
+ @dataclass
+ class MyModel(EnumCoercionMixin):
+ _ENUM_FIELDS = {"status": StatusEnum}
+ status: Optional[Union[str, StatusEnum]] = None
+ """
+
+ _ENUM_FIELDS: ClassVar[Dict[str, type]] = {}
+
+ def __setattr__(self, name: str, value: Any) -> None:
+ enum_cls = self._ENUM_FIELDS.get(name)
+ if (
+ value is not None
+ and enum_cls is not None
+ and not isinstance(value, enum_cls)
+ ):
+ value = enum_cls(value)
+ super().__setattr__(name, value)
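
A self-contained toy showing the coercion in action; the `Color` enum and `Paint` dataclass are invented for this example:

```python
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Union

from synapseclient.models.mixins.enum_coercion import EnumCoercionMixin


class Color(Enum):
    RED = "red"
    BLUE = "blue"


@dataclass
class Paint(EnumCoercionMixin):
    _ENUM_FIELDS = {"color": Color}
    color: Optional[Union[str, Color]] = None


paint = Paint(color="red")  # plain string is coerced on assignment
assert paint.color is Color.RED

paint.color = Color.BLUE  # already the right type, passed through unchanged
assert paint.color is Color.BLUE
```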
From 268d9769c508f48662186a8be702296c3c9fe5f5 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Wed, 18 Mar 2026 21:27:59 -0700
Subject: [PATCH 25/31] add EXTERNAL_HTTPS type, type-specific field mapping
and displaying, add EnumCoercionMixin
---
synapseclient/models/storage_location.py | 102 +++++++++++++++++------
1 file changed, 77 insertions(+), 25 deletions(-)
diff --git a/synapseclient/models/storage_location.py b/synapseclient/models/storage_location.py
index a3ebe6f12..caab2cfc3 100644
--- a/synapseclient/models/storage_location.py
+++ b/synapseclient/models/storage_location.py
@@ -10,6 +10,7 @@
get_storage_location_setting,
)
from synapseclient.core.async_utils import async_to_sync, otel_trace_method
+from synapseclient.models.mixins.enum_coercion import EnumCoercionMixin
from synapseclient.models.protocols.storage_location_protocol import (
StorageLocationSynchronousProtocol,
)
@@ -34,6 +35,7 @@ class StorageLocationType(str, Enum):
EXTERNAL_S3 = "ExternalS3StorageLocationSetting"
EXTERNAL_GOOGLE_CLOUD = "ExternalGoogleCloudStorageLocationSetting"
EXTERNAL_SFTP = "ExternalStorageLocationSetting"
+ EXTERNAL_HTTPS = "ExternalStorageLocationSetting"
EXTERNAL_OBJECT_STORE = "ExternalObjectStorageLocationSetting"
PROXY = "ProxyStorageLocationSettings"
@@ -62,6 +64,7 @@ class UploadType(str, Enum):
StorageLocationType.EXTERNAL_S3: UploadType.S3,
StorageLocationType.EXTERNAL_GOOGLE_CLOUD: UploadType.GOOGLE_CLOUD_STORAGE,
StorageLocationType.EXTERNAL_SFTP: UploadType.SFTP,
+ StorageLocationType.EXTERNAL_HTTPS: UploadType.HTTPS,
StorageLocationType.EXTERNAL_OBJECT_STORE: UploadType.S3,
StorageLocationType.PROXY: UploadType.HTTPS,
}
@@ -71,10 +74,47 @@ class UploadType(str, Enum):
storage_type.value: storage_type for storage_type in StorageLocationType
}
+# Mapping from StorageLocationType to its type-specific (field_name, api_key) pairs.
+# Only fields listed here are populated by fill_from_dict for a given type.
+_STORAGE_TYPE_SPECIFIC_FIELDS: Dict[StorageLocationType, Dict[str, str]] = {
+ StorageLocationType.SYNAPSE_S3: {
+ "bucket": "bucket",
+ "base_key": "baseKey",
+ "sts_enabled": "stsEnabled",
+ },
+ StorageLocationType.EXTERNAL_S3: {
+ "bucket": "bucket",
+ "base_key": "baseKey",
+ "sts_enabled": "stsEnabled",
+ "endpoint_url": "endpointUrl",
+ },
+ StorageLocationType.EXTERNAL_GOOGLE_CLOUD: {
+ "bucket": "bucket",
+ "base_key": "baseKey",
+ },
+ StorageLocationType.EXTERNAL_OBJECT_STORE: {
+ "bucket": "bucket",
+ "endpoint_url": "endpointUrl",
+ },
+ StorageLocationType.EXTERNAL_SFTP: {
+ "url": "url",
+ "supports_subfolders": "supportsSubfolders",
+ },
+ StorageLocationType.EXTERNAL_HTTPS: {
+ "url": "url",
+ "supports_subfolders": "supportsSubfolders",
+ },
+ StorageLocationType.PROXY: {
+ "proxy_url": "proxyUrl",
+ "secret_key": "secretKey",
+ "benefactor_id": "benefactorId",
+ },
+}
+
@dataclass()
@async_to_sync
-class StorageLocation(StorageLocationSynchronousProtocol):
+class StorageLocation(EnumCoercionMixin, StorageLocationSynchronousProtocol):
"""A storage location setting describes where files are uploaded to and
downloaded from via Synapse. Storage location settings may be created for
external locations, such as user-owned Amazon S3 buckets, Google Cloud
@@ -177,6 +217,11 @@ class StorageLocation(StorageLocationSynchronousProtocol):
).store()
"""
+ _ENUM_FIELDS = {
+ "storage_type": StorageLocationType,
+ "upload_type": UploadType,
+ }
+
# Core fields - present on all storage locations
storage_location_id: Optional[int] = None
"""(Read Only) The unique ID for this storage location, assigned by the server
@@ -248,6 +293,28 @@ class StorageLocation(StorageLocationSynchronousProtocol):
created_by: Optional[int] = field(default=None, compare=False)
"""(Read Only) The ID of the user that created this storage location setting."""
+ def __repr__(self) -> str:
+ common = {
+ "storage_location_id": self.storage_location_id,
+ "storage_type": self.storage_type,
+ "upload_type": self.upload_type,
+ "banner": self.banner,
+ "description": self.description,
+ "etag": self.etag,
+ "created_on": self.created_on,
+ "created_by": self.created_by,
+ }
+ type_specific = {
+ field_name: getattr(self, field_name)
+ for field_name in _STORAGE_TYPE_SPECIFIC_FIELDS.get(self.storage_type, {})
+ }
+ parts = [
+ f"{k}={v!r}"
+ for k, v in {**common, **type_specific}.items()
+ if v is not None
+ ]
+ return f"StorageLocation({', '.join(parts)})"
+
def fill_from_dict(self, synapse_response: Dict[str, Any]) -> "StorageLocation":
"""Converts a response from the REST API into this dataclass.
@@ -264,13 +331,7 @@ def fill_from_dict(self, synapse_response: Dict[str, Any]) -> "StorageLocation":
self.created_on = synapse_response.get("createdOn", None)
self.created_by = synapse_response.get("createdBy", None)
- # Parse upload type
- upload_type_str = synapse_response.get("uploadType", None)
- if upload_type_str:
- try:
- self.upload_type = UploadType(upload_type_str)
- except ValueError:
- self.upload_type = None
+ self.upload_type = synapse_response.get("uploadType", None)
# Parse storage type from concreteType
concrete_type = synapse_response.get("concreteType", "")
@@ -280,21 +341,12 @@ def fill_from_dict(self, synapse_response: Dict[str, Any]) -> "StorageLocation":
if type_suffix in _CONCRETE_TYPE_TO_STORAGE_TYPE:
self.storage_type = _CONCRETE_TYPE_TO_STORAGE_TYPE[type_suffix]
- # S3/GCS fields
- self.bucket = synapse_response.get("bucket", None)
- self.base_key = synapse_response.get("baseKey", None)
- self.sts_enabled = synapse_response.get("stsEnabled", None)
- self.endpoint_url = synapse_response.get("endpointUrl", None)
-
- # SFTP fields
- self.url = synapse_response.get("url", None)
- self.supports_subfolders = synapse_response.get("supportsSubfolders", None)
-
- # Proxy fields
- self.proxy_url = synapse_response.get("proxyUrl", None)
- self.secret_key = synapse_response.get("secretKey", None)
- self.benefactor_id = synapse_response.get("benefactorId", None)
-
+ # Type-specific fields — only populate attributes relevant to this storage type
+ if self.storage_type:
+ for field_name, api_key in _STORAGE_TYPE_SPECIFIC_FIELDS.get(
+ self.storage_type, {}
+ ).items():
+ setattr(self, field_name, synapse_response.get(api_key, None))
return self
def _to_synapse_request(self) -> Dict[str, Any]:
@@ -414,9 +466,9 @@ async def main():
asyncio.run(main())
"""
- body = self._to_synapse_request()
+ request = self._to_synapse_request()
response = await create_storage_location_setting(
- body=body,
+ request=request,
synapse_client=synapse_client,
)
self.fill_from_dict(response)
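
A hedged sketch of the type-specific parsing added above; the response dict below is fabricated to resemble the REST payloads this dataclass consumes, and it assumes `"S3"` is the string value of `UploadType.S3`:

```python
from synapseclient.models import StorageLocation

fake_response = {
    "storageLocationId": 12345,
    "concreteType": (
        "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting"
    ),
    "uploadType": "S3",
    "bucket": "my-bucket",
    "baseKey": "data",
    "stsEnabled": True,
    # Not in the EXTERNAL_S3 field mapping, so it is ignored.
    "proxyUrl": "https://proxy.example.org",
}

loc = StorageLocation().fill_from_dict(fake_response)
print(loc.storage_type)                           # StorageLocationType.EXTERNAL_S3
print(loc.bucket, loc.base_key, loc.sts_enabled)  # my-bucket data True
print(loc.proxy_url)                              # None: proxy-only field skipped
```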
From d47fdb5eff7d138a804785ca0291a92b4e501cc1 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 20 Mar 2026 13:10:56 -0700
Subject: [PATCH 26/31] Pass the port from the parsed SFTP URL to resolve the
connection issue if server is on a specific port
---
synapseclient/core/remote_file_storage_wrappers.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/synapseclient/core/remote_file_storage_wrappers.py b/synapseclient/core/remote_file_storage_wrappers.py
index 811cfdbd9..9010392e7 100644
--- a/synapseclient/core/remote_file_storage_wrappers.py
+++ b/synapseclient/core/remote_file_storage_wrappers.py
@@ -316,8 +316,9 @@ def progress_callback(*args, **kwargs) -> None:
progress_bar.update(args[0] - progress_bar.n)
parsedURL = SFTPWrapper._parse_for_sftp(url)
+ port_kwargs = {"port": parsedURL.port} if parsedURL.port else {}
with _retry_pysftp_connection(
- parsedURL.hostname, username=username, password=password
+ parsedURL.hostname, username=username, password=password, **port_kwargs
) as sftp:
sftp.makedirs(parsedURL.path)
with sftp.cd(parsedURL.path):
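
The fix relies on the parsed URL exposing an explicit port when one is present. A quick illustration with plain `urllib.parse` as a stand-in for `SFTPWrapper._parse_for_sftp`:

```python
from urllib.parse import urlparse

with_port = urlparse("sftp://sftp.example.org:2222/uploads")
without_port = urlparse("sftp://sftp.example.org/uploads")

print(with_port.port)     # 2222 -> forwarded as port=2222 to the connection
print(without_port.port)  # None -> no port kwarg, library default (22) applies
```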
From 65d39a796f56942f5e3c2225ef0abafb4425e433 Mon Sep 17 00:00:00 2001
From: danlu1
Date: Fri, 20 Mar 2026 13:18:52 -0700
Subject: [PATCH 27/31] reformat storage_location.md
---
.../experimental/async/storage_location.md | 1 -
docs/tutorials/python/storage_location.md | 182 ++++++++---
.../migration_results.png | Bin 0 -> 38122 bytes
.../tutorial_scripts/storage_location.py | 286 ++++++++++++++----
4 files changed, 382 insertions(+), 87 deletions(-)
create mode 100644 docs/tutorials/python/tutorial_screenshots/migration_results.png
diff --git a/docs/reference/experimental/async/storage_location.md b/docs/reference/experimental/async/storage_location.md
index 00e03fc47..cf9630de2 100644
--- a/docs/reference/experimental/async/storage_location.md
+++ b/docs/reference/experimental/async/storage_location.md
@@ -12,7 +12,6 @@ at your own risk.
members:
- store_async
- get_async
- - setup_s3_async
---
diff --git a/docs/tutorials/python/storage_location.md b/docs/tutorials/python/storage_location.md
index 41dd5036c..3e45473c2 100644
--- a/docs/tutorials/python/storage_location.md
+++ b/docs/tutorials/python/storage_location.md
@@ -13,11 +13,14 @@ locations using the new object-oriented models.
## Tutorial Purpose
In this tutorial you will:
-1. Create an external S3 storage location
-2. Set up a folder backed by external S3 storage
-3. Create an STS-enabled storage location for direct S3 access
-4. Use STS credentials with boto3
-5. Retrieve and inspect storage location settings
+1. Create an external S3 storage location and assign it to a folder
+2. Create a Google Cloud Storage location and assign it to a folder
+3. Create an SFTP storage location and assign it to a folder
+4. Create an HTTPS storage location and assign it to a folder
+5. Create an External Object Store location and assign it to a folder
+6. Create a Proxy storage location, register a proxy file handle, and assign it to a folder
+7. Retrieve and inspect storage location settings
+8. Index and migrate files to a new storage location
## Prerequisites
@@ -29,40 +32,76 @@ In this tutorial you will:
`owner.txt` file. See
[Custom Storage Locations](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html).
* (Optional) `boto3` installed for STS credential examples.
+* For SFTP: `pysftp` installed (`pip install "synapseclient[pysftp]"`).
+* For Object Store: AWS credentials configured in your environment.
+* For Proxy: a running proxy server and its shared secret key.
## Understanding Storage Location Types
Synapse supports several types of storage locations:
- **SYNAPSE_S3**: Synapse-managed S3 storage (default)
-- **EXTERNAL_S3**: User-owned Amazon S3 bucket accessed by Synapse
+- **EXTERNAL_S3**: User-owned AWS S3 bucket, accessed by Synapse on
+ your behalf. Synapse transfers the data for uploads and downloads. Requires
+ an `owner.txt` file in the bucket to verify ownership.
- **EXTERNAL_GOOGLE_CLOUD**: User-owned Google Cloud Storage bucket
-- **EXTERNAL_SFTP**: External SFTP server not accessed by Synapse
-- **EXTERNAL_OBJECT_STORE**: S3-like bucket (e.g., OpenStack) not accessed by Synapse
-- **PROXY**: A proxy server that controls access to storage
+- **EXTERNAL_SFTP**: External SFTP server
+- **EXTERNAL_HTTPS**: External HTTPS server (uploading via the client is not
+  currently supported)
+- **EXTERNAL_OBJECT_STORE**: An S3-compatible store (e.g., MinIO, OpenStack
+ Swift) that Synapse does **not** access. The client transfers data directly
+ to the object store using credentials configured in your environment; Synapse
+ only stores the file metadata.
+- **PROXY**: A proxy server that controls access to the underlying storage
-## STS-Enabled Storage
+## Storage Location Settings
-STS (AWS Security Token Service) enabled storage locations allow users to get
-temporary AWS credentials for direct S3 access. This is useful for:
+Each storage type exposes a different set of configuration fields on
+`StorageLocation`. When you retrieve a stored location, only the fields
+relevant to its type are populated:
-- Uploading large files directly to S3
-- Using AWS tools like the AWS CLI or boto3
-- Performing bulk operations on files
+| Type | Key fields |
+|------|-----------|
+| `EXTERNAL_S3` | `bucket`, `base_key` |
+| `EXTERNAL_GOOGLE_CLOUD` | `bucket`, `base_key` |
+| `EXTERNAL_SFTP` / `EXTERNAL_HTTPS` | `url`, `supports_subfolders` |
+| `EXTERNAL_OBJECT_STORE` | `bucket`, `endpoint_url` |
+| `PROXY` | `proxy_url`, `secret_key`, `benefactor_id` |
+
+Common attributes present on every type are `concrete_type`,
+`storage_location_id`, `storage_type`, `upload_type`, `banner`, `description`,
+`etag`, `created_on`, and `created_by`.
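+
+As a quick illustration (the storage location ID is a placeholder, and this
+assumes the synchronous `get` generated from `get_async`; for an `EXTERNAL_S3`
+location only the S3-specific fields come back populated):
+
+```python
+from synapseclient.models import StorageLocation
+
+loc = StorageLocation(storage_location_id=12345).get()
+print(loc.bucket, loc.base_key)  # populated for EXTERNAL_S3
+print(loc.proxy_url)             # None, proxy-only field
+```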
+
+## Data Migration Between Storage Locations
+
+Files in a folder can be migrated from one storage location to another using
+`index_files_for_migration` followed by `migrate_indexed_files`. Migration is
+currently supported only between S3 storage locations (both Synapse-managed
+`SYNAPSE_S3` and external `EXTERNAL_S3`) that reside in the **same AWS
+region**.
+
+Migration is a two-phase process:
+
+1. **Index** — scan the folder and record every file that needs to move into a
+ local SQLite database.
+2. **Migrate** — read the index database and move each file to the destination
+ storage location.
+
+Separating the phases lets you review what will be migrated before committing
+to the move, as the short preview below shows.
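+
+As a minimal preview (the entity ID, storage location ID, and paths are
+placeholders, and the import path is an assumption; the tutorial script
+included in section 10 below is the authoritative version):
+
+```python
+from synapseclient import Synapse
+from synapseclient.models import index_files_for_migration, migrate_indexed_files
+
+syn = Synapse()
+syn.login()
+
+db_path = "/tmp/migration_index.db"
+
+# Phase 1: index. Records what would move without touching any files.
+index_result = index_files_for_migration("syn123", "98765", db_path)
+index_result.as_csv("/tmp/index_review.csv")  # review before migrating
+
+# Phase 2: migrate. Reads the index database and moves the files.
+migration_result = migrate_indexed_files(db_path, force=True)
+if migration_result:
+    migration_result.as_csv("/tmp/migration_report.csv")
+```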
## 1. Set up and get project
```python
-{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=5-12}
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=4-15}
```
## 2. Create an external S3 storage location
Create a storage location backed by your own S3 bucket. The bucket must be
-properly configured with an `owner.txt` file.
+properly configured with an `owner.txt` file. Synapse will transfer data
+directly to and from this bucket on the user's behalf.
```python
-{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=14-27}
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=17-30}
```
@@ -70,45 +109,119 @@ properly configured with an `owner.txt` file.
```
Created storage location: 12345
-Type: StorageLocationType.EXTERNAL_S3
-Bucket: my-synapse-bucket
+storage location type: StorageLocationType.EXTERNAL_S3
```
## 3. Set up a folder with external S3 storage
-The `setup_s3` convenience method handles creating the folder, storage location,
-and project settings in a single call.
+Create a folder and assign it the S3 storage location. All files uploaded into
+this folder will be stored in your S3 bucket.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=32-40}
+```
+
+## 4. Create a Google Cloud Storage location
+
+Create a storage location backed by a Google Cloud Storage bucket and assign it
+to a folder.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=42-62}
+```
+
+## 5. Create an SFTP storage location
+
+SFTP storage locations point to an external SFTP server. Files are not
+transferred through Synapse — Synapse only stores metadata. Requires the
+`pysftp` package.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=64-87}
+```
+
+## 6. Create an HTTPS storage location
+
+`EXTERNAL_HTTPS` uses the same underlying API type as `EXTERNAL_SFTP` but is
+used when the external server is accessed over HTTPS. Note that the Python
+client does NOT support uploading files to HTTPS storage locations directly yet.
```python
-{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=29-38}
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=89-111}
```
-## 4. Create an STS-enabled storage location
+## 7. Create an External Object Store storage location
-STS-enabled storage locations allow you to get temporary AWS credentials for
-direct S3 access.
+Use `EXTERNAL_OBJECT_STORE` for S3-compatible stores that are not directly
+accessed by Synapse. Unlike `EXTERNAL_S3`, the Python client transfers data
+directly to the object store using locally configured AWS credentials —
+Synapse is never involved in the data transfer, only in storing the metadata.
+
+You can add a profile for working with S3 to your `~/.synapseConfig`. Add a
+section matching your endpoint+bucket URL:
+
+```
+[https://s3.us-east-1.amazonaws.com/test-external-object-store]
+profile_name = my-s3-profile
+```
+
+Then ensure `my-s3-profile` exists in `~/.aws/credentials` with valid keys:
+
+```
+[my-s3-profile]
+aws_access_key_id = ...
+aws_secret_access_key = ...
+```
```python
-{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=40-50}
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=113-139}
```
-## 5. Use STS credentials with boto3
+## 8. Create a Proxy storage location
-Once you have an STS-enabled folder, you can get temporary credentials to
-access the underlying S3 bucket directly.
+Proxy storage locations delegate file access to a proxy server that controls
+authentication and access to the underlying storage. Files are registered by
+creating a `ProxyFileHandle` via the REST API. Files can then be stored by
+setting `data_file_handle_id` on a `File` and calling its store function.
```python
-{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=52-72}
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=141-194}
+```
+
+## 9. Retrieve and inspect storage location settings
+
+You can retrieve a storage location by ID. Only fields relevant to the storage
+type are populated.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=196-204}
+```
+
+
+You'll notice the output looks like:
+
+```
+Retrieved storage location ID: 12345
+Storage type: StorageLocationType.EXTERNAL_S3
+Bucket: my-synapse-bucket
+Base key: synapse-data
```
+
+
+## 10. Index and migrate files to a new storage location
-## 6. Retrieve and inspect storage location settings
+> **Warning:** This will migrate files associated with the folder. Run against a
+> test project first and review the index result before migrating production data.
-You can retrieve your storage location settings and inspect their configuration.
+Phase 1 indexes all files that need to move into a local SQLite database. It
+returns a `MigrationResult` object; use `as_csv` to review the indexing status
+in detail.
```python
-{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=74-86}
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=214-221}
```
+Phase 2 reads that database and performs the actual migration. It also returns
+a `MigrationResult` object; use `as_csv` to review the migration status and any
+errors.
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=224-234}
+```
+Currently, the detailed traceback for any failure is saved in the `exception`
+column of the CSV.
+
+
## Source code for this tutorial
@@ -125,6 +238,7 @@ You can retrieve your storage location settings and inspect their configuration.
- [StorageLocation][synapseclient.models.StorageLocation]
- [StorageLocationType][synapseclient.models.StorageLocationType]
- [Folder][synapseclient.models.Folder]
+- [File][synapseclient.models.File]
- [Project][synapseclient.models.Project]
- [syn.login][synapseclient.Synapse.login]
- [Custom Storage Locations Documentation](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html)
diff --git a/docs/tutorials/python/tutorial_screenshots/migration_results.png b/docs/tutorials/python/tutorial_screenshots/migration_results.png
new file mode 100644
index 0000000000000000000000000000000000000000..501a9cbc616b6f9c878b6e82ac5fba562481fe88