diff --git a/.gitignore b/.gitignore
index fa4e7f520..19eb11079 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ jenkins/
.idea/*
docs/build/doctrees/*
docs/build/html/_sources/*
+docs_site/*
build/*
/venv
diff --git a/docs/explanations/storage_location_architecture.md b/docs/explanations/storage_location_architecture.md
new file mode 100644
index 000000000..40c7d2292
--- /dev/null
+++ b/docs/explanations/storage_location_architecture.md
@@ -0,0 +1,819 @@
+# Storage Location Architecture
+
+This document provides an in-depth architectural overview of the StorageLocation
+system in the Synapse Python Client. It explains the design decisions, class
+relationships, and data flows that enable flexible storage configuration.
+
+---
+
+## On This Page
+
+
+
+- **[Domain Model](#domain-model)**
+
+ Core classes, enums, and their relationships
+
+- **[Storage Types](#storage-type-mapping)**
+
+ How storage types map to REST API types and choosing the right one
+
+- **[Entity Inheritance](#entity-inheritance-hierarchy)**
+
+ How Projects and Folders gain storage capabilities
+
+- **[Operation Flows](#operation-flows)**
+
+ Sequence diagrams for store, setup, and STS operations
+
+- **[Settings & API](#project-setting-lifecycle)**
+
+ Project settings lifecycle and REST API architecture
+
+- **[Migration](#migration-flow)**
+
+ Two-phase file migration process
+
+
+
+---
+
+## Overview
+
+The StorageLocation setting enables Synapse users to configure a location where files are uploaded to and downloaded from via Synapse.
+By default, Synapse stores files in its internal S3 storage, but
+users can configure projects and folders to use external storage backends such as
+AWS S3 buckets, Google Cloud Storage, SFTP servers, or a local file server using a proxy server.
+
+### Key Concepts
+- [**StorageLocationSetting**](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/StorageLocationSetting.html): A configuration specifying file storage and download locations.
+- [**ProjectSetting**](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/project/ProjectSetting.html): A configuration applied to projects that allows customization of file storage locations.
+- [**UploadType**](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/file/UploadType.html): An enumeration that defines the types of file upload destinations that Synapse supports.
+- **STS Credentials**: Temporary AWS credentials for direct S3 access.
+- **StorageLocation Migration**: The process of transferring the files associated with Synapse entities between storage locations while preserving the entities’ structure and identifiers.
+
+---
+
+
+
+# Part 1: Data Model
+
+This section covers the core classes, enumerations, and type mappings.
+
+
+
+## Domain Model
+
+The following class diagram shows the core classes and their relationships in the
+StorageLocation system.
+
+```mermaid
+classDiagram
+ direction TB
+
+ class StorageLocation {
+ +int storage_location_id
+ +StorageLocationType storage_type
+ +UploadType upload_type
+ +str bucket
+ +str base_key
+ +bool sts_enabled
+ +str banner
+ +str description
+ +str etag
+ +str created_on
+ +int created_by
+ +str url
+ +bool supports_subfolders
+ +str endpoint_url
+ +str proxy_url
+ +str secret_key
+ +str benefactor_id
+ +store() StorageLocation
+ +get() StorageLocation
+ +fill_from_dict(dict) StorageLocation
+ }
+
+ class StorageLocationType {
+        <<enumeration>>
+ SYNAPSE_S3
+ EXTERNAL_S3
+ EXTERNAL_GOOGLE_CLOUD
+ EXTERNAL_SFTP
+ EXTERNAL_OBJECT_STORE
+ PROXY
+ }
+
+ class UploadType {
+        <<enumeration>>
+ S3
+ GOOGLE_CLOUD_STORAGE
+ SFTP
+ HTTPS
+ NONE
+ }
+
+ class StorageLocationConfigurable {
+        <<mixin>>
+ +set_storage_location(storage_location_id) ProjectSetting
+ +get_project_setting(setting_type) ProjectSetting
+ +delete_project_setting(setting_id)
+ +get_sts_storage_token(permission, output_format) dict
+ +index_files_for_migration(dest_storage_location_id, db_path) MigrationResult
+ +migrate_indexed_files(db_path) MigrationResult
+ }
+
+ class Project {
+ +str id
+ +str name
+ +str description
+ }
+
+ class Folder {
+ +str id
+ +str name
+ +str parent_id
+ }
+
+ class UploadDestinationListSetting {
+        <<dataclass>>
+ concreteType
+ id
+ projectId
+ settingsType
+ etag
+ locations
+ }
+
+ class ProjectSetting {
+        <<dataclass>>
+ concreteType
+ id
+ projectId
+ settingsType
+ etag
+
+ }
+ StorageLocation --> StorageLocationType : storage_type
+ StorageLocation --> UploadType : upload_type
+ StorageLocationConfigurable <|-- Project : implements
+ StorageLocationConfigurable <|-- Folder : implements
+ StorageLocationConfigurable ..> ProjectSetting : returns
+ StorageLocationConfigurable ..> UploadDestinationListSetting : uses
+
+```
+
+
+
+
+### Key Components
+
+| Component | Description |
+|-----------|-------------|
+| [synapseclient.models.StorageLocation] | The model representing a storage location setting in Synapse |
+| [synapseclient.models.StorageLocationType] | Enumeration defining the supported storage backend types |
+| [synapseclient.models.UploadType] | Enumeration defining the upload protocol for each storage type |
+| [synapseclient.models.mixins.StorageLocationConfigurable] | Mixin providing storage management methods to entities |
+| [synapseclient.models.mixins.UploadDestinationListSetting] | Dataclass defining the upload destination list setting containing storage location IDs |
+| [synapseclient.models.mixins.ProjectSetting] | Dataclass defining the base project setting structure |
+
+---
+
+
+
+## Storage Type Mapping <!-- TODO: double-check that EXTERNAL_HTTP works as expected -->
+
+Each `StorageLocationType` maps to a specific REST API `concreteType` and has a
+default `UploadType`. This mapping allows the system to parse
+responses from the API and construct requests.
+
+```mermaid
+flowchart LR
+ subgraph StorageLocationType
+ SYNAPSE_S3["SYNAPSE_S3"]
+ EXTERNAL_S3["EXTERNAL_S3"]
+ EXTERNAL_GOOGLE_CLOUD["EXTERNAL_GOOGLE_CLOUD"]
+ EXTERNAL_SFTP["EXTERNAL_SFTP"]
+ EXTERNAL_OBJECT_STORE["EXTERNAL_OBJECT_STORE"]
+ PROXY["PROXY"]
+ end
+
+ subgraph concreteType
+ S3SLS["S3StorageLocationSetting"]
+ ExtS3SLS["ExternalS3StorageLocationSetting"]
+ ExtGCSSLS["ExternalGoogleCloudStorageLocationSetting"]
+ ExtSLS["ExternalStorageLocationSetting"]
+ ExtObjSLS["ExternalObjectStorageLocationSetting"]
+ ProxySLS["ProxyStorageLocationSettings"]
+ end
+
+ subgraph UploadType
+ S3["S3"]
+ GCS["GOOGLECLOUDSTORAGE"]
+ SFTP["SFTP"]
+ HTTPS["HTTPS"]
+ end
+
+ SYNAPSE_S3 --> S3SLS --> S3
+ EXTERNAL_S3 --> ExtS3SLS --> S3
+ EXTERNAL_GOOGLE_CLOUD --> ExtGCSSLS --> GCS
+ EXTERNAL_SFTP --> ExtSLS --> SFTP
+ EXTERNAL_OBJECT_STORE --> ExtObjSLS --> S3
+ PROXY --> ProxySLS --> HTTPS
+```
+
+
+
+### Storage Type Attributes
+
+Different storage types support different configuration attributes:
+
+| Attribute | Type | S3StorageLocationSetting | ExternalS3StorageLocationSetting | ExternalObjectStorageLocationSetting | ExternalStorageLocationSetting | ExternalGoogleCloudStorageLocationSetting | ProxyStorageLocationSettings |
+|-----------|------|--------------------------|----------------------------------|--------------------------------------|--------------------------------|-------------------------------------------|------------------------------|
+| **Common (all types)** |
+| `concreteType` | string (enum) | ✓ (required) | ✓ (required) | ✓ (required) | ✓ (required) | ✓ (required) | ✓ (required) |
+| `storageLocationId` | integer (int32) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `uploadType` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `banner` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `description` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `etag` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `createdOn` | string | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| `createdBy` | integer (int32) | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
+| **Type-specific** |
+| `baseKey` | string | ✓ | ✓ | — | — | ✓ | — |
+| `stsEnabled` | boolean | ✓ | ✓ | — | — | — | — |
+| `bucket` | string | — | ✓ (required) | ✓ (required) | — | ✓ (required) | — |
+| `endpointUrl` | string | — | ✓ | ✓ (required) | — | — | — |
+| `url` | string | — | — | — | ✓ | — | — |
+| `supportsSubfolders` | boolean | — | — | — | ✓ | — | — |
+| `proxyUrl` | string | — | — | — | — | — | ✓ |
+| `secretKey` | string | — | — | — | — | — | ✓ |
+| `benefactorId` | string | — | — | — | — | — | ✓ |
+
+## Summary by type
+
+| Setting type | Description | Type-specific attributes |
+|--------------|-------------|---------------------------|
+| **S3StorageLocationSetting** | Default Synapse storage on Amazon S3. | `baseKey`, `stsEnabled` |
+| **ExternalS3StorageLocationSetting** | External S3 bucket connected with Synapse (Synapse-accessed). | `bucket` (required), `baseKey`, `stsEnabled`, `endpointUrl` |
+| **ExternalObjectStorageLocationSetting** | S3-compatible object storage **not** accessed by Synapse. | `bucket` (required), `endpointUrl` (required) |
+| **ExternalStorageLocationSetting** | SFTP or HTTPS upload destination. | `url`, `supportsSubfolders` |
+| **ExternalGoogleCloudStorageLocationSetting** | External Google Cloud Storage bucket connected with Synapse. | `bucket` (required), `baseKey` |
+| **ProxyStorageLocationSettings** | HTTPS proxy for all upload/download operations. | `proxyUrl`, `secretKey`, `benefactorId` |
+
+
+
+
+### Choosing a Storage Type
+
+Use this decision tree to select the appropriate storage type for your use case:
+
+```mermaid
+flowchart TB
+ Start{Need custom storage?}
+ Start -->|No| DEFAULT[Use default Synapse storage]
+ Start -->|Yes| Q1{Want Synapse to
manage storage?}
+
+ Q1 -->|Yes| SYNAPSE_S3[Use SYNAPSE_S3]
+ Q1 -->|No| Q2{What storage
backend?}
+
+ Q2 -->|AWS S3| Q3{Synapse accesses
bucket directly?}
+ Q2 -->|Google Cloud| EXTERNAL_GOOGLE_CLOUD[Use EXTERNAL_GOOGLE_CLOUD]
+ Q2 -->|SFTP Server| EXTERNAL_SFTP[Use EXTERNAL_SFTP]
+ Q2 -->|Proxy Server| PROXY[Use PROXY]
+    Q2 -->|S3-compatible store| EXTERNAL_OBJECT_STORE[Use EXTERNAL_OBJECT_STORE]
+
+ Q3 -->|Yes| Q4{Need STS
credentials?}
+ Q3 -->|No| EXTERNAL_OBJECT_STORE
+
+ Q4 -->|Yes| EXTERNAL_S3_STS[Use EXTERNAL_S3
with sts_enabled=True]
+ Q4 -->|No| EXTERNAL_S3[Use EXTERNAL_S3]
+
+ SYNAPSE_S3 --> Benefits1[Benefits:
- Zero configuration
- Managed by Synapse
- STS available]
+ EXTERNAL_S3 --> Benefits2[Benefits:
- Use your own bucket
- Control access & costs
- Optional STS]
+ EXTERNAL_S3_STS --> Benefits2
+ EXTERNAL_GOOGLE_CLOUD --> Benefits3[Benefits:
- GCP native
- Use existing GCS buckets]
+ EXTERNAL_SFTP --> Benefits4[Benefits:
- Legacy systems
- Synapse never touches data]
+ EXTERNAL_OBJECT_STORE --> Benefits5[Benefits:
- OpenStack, MinIO, etc
- Synapse never touches data]
+ PROXY --> Benefits6[Benefits:
- Custom access control
- Data transformation]
+ DEFAULT --> Benefits0[Benefits:
- No configuration needed
- Synapse-managed S3]
+```
+
+---
+
+
+
+## Entity Inheritance Hierarchy
+
+Projects and Folders inherit storage configuration capabilities through the
+`StorageLocationConfigurable` mixin. This pattern allows consistent storage
+management across container entities.
+
+```mermaid
+classDiagram
+ direction TB
+
+    class StorageLocationConfigurable {
+        <<mixin>>
+ +set_storage_location()
+ +get_project_setting()
+ +delete_project_setting()
+ +get_sts_storage_token()
+ +index_files_for_migration()
+ +migrate_indexed_files()
+ }
+
+ class Project {
+ +str id
+ +str name
+ +str description
+ +str etag
+ }
+
+ class Folder {
+ +str id
+ +str name
+ +str parent_id
+ +str etag
+ }
+
+    StorageLocationConfigurable <|-- Project
+    StorageLocationConfigurable <|-- Folder
+```
+
+The mixin pattern allows `Project` and `Folder` to share storage location
+functionality without code duplication. Both classes inherit the same
+methods from `StorageLocationConfigurable`.
+
+---
+
+
+
+
+# Part 2: Operation Flows
+
+This section contains sequence diagrams for key operations.
+
+
+
+## Operation Flows
+
+### Store Operation
+
+The `store()` method creates a new storage location in Synapse. Creating a storage location is idempotent per user. Repeating a creation request with the same properties will return the previously created storage location rather than creating a new one.
+
+```mermaid
+sequenceDiagram
+ participant User
+ participant StorageLocation
+ participant _to_synapse_request as _to_synapse_request()
+ participant API as storage_location_services
+ participant Synapse as Synapse REST API
+
+ User->>StorageLocation: store()
+ activate StorageLocation
+
+ StorageLocation->>_to_synapse_request: Build request body
+ activate _to_synapse_request
+
+ Note over _to_synapse_request: Validate storage_type is set
+ Note over _to_synapse_request: Build concreteType from storage_type
+ Note over _to_synapse_request: Determine uploadType
+ Note over _to_synapse_request: Add type-specific fields
+
+ _to_synapse_request-->>StorageLocation: Request body dict
+ deactivate _to_synapse_request
+
+ StorageLocation->>API: create_storage_location_setting(body)
+ activate API
+
+ API->>Synapse: POST /storageLocation
+ activate Synapse
+
+ Synapse-->>API: Response with storageLocationId
+ deactivate Synapse
+
+ API-->>StorageLocation: Response dict
+ deactivate API
+
+ StorageLocation->>StorageLocation: fill_from_dict(response)
+ Note over StorageLocation: Parse storageLocationId
+ Note over StorageLocation: Parse concreteType → storage_type
+ Note over StorageLocation: Parse uploadType → upload_type
+ Note over StorageLocation: Extract type-specific fields
+
+ StorageLocation-->>User: StorageLocation (populated)
+ deactivate StorageLocation
+```
+
+
+
+### STS Token Retrieval
+
+STS (AWS Security Token Service) enables direct S3 access using temporary credentials.
+
+When a Synapse client is constructed (`Synapse.__init__`), it creates an in-memory token cache:
+
+- `self._sts_token_store = sts_transfer.StsTokenStore()` (see `synapseclient/client.py`)
+
+The store caches STS tokens per entity and permission so repeated access to the same storage location can reuse credentials without a round-trip to the REST API.
+
+```mermaid
+sequenceDiagram
+ participant User
+ participant Entity as Folder/Project
+    participant Mixin as StorageLocationConfigurable
+ participant STS as sts_transfer module
+ participant Client as Synapse Client
+ participant TokenStore as _sts_token_store (StsTokenStore)
+ participant Synapse as Synapse REST API
+
+ Note over Client,TokenStore: Client.__init__ creates self._sts_token_store = sts_transfer.StsTokenStore()
+
+ User->>Entity: get_sts_storage_token(permission, output_format)
+ activate Entity
+
+ Entity->>Mixin: get_sts_storage_token_async()
+ activate Mixin
+
+ Mixin->>Client: Synapse.get_client()
+ Client-->>Mixin: Synapse client instance
+
+ Mixin->>STS: sts_transfer.get_sts_credentials()
+ activate STS
+
+ STS->>Client: syn._sts_token_store.get_token(...)
+ activate Client
+ Client->>TokenStore: get_token(entity_id, permission, min_remaining_life)
+ activate TokenStore
+
+ alt token cached and not expired
+ TokenStore-->>Client: Cached token
+ else cache miss or token expired
+ TokenStore->>Synapse: GET /entity/{id}/sts?permission={permission}
+ activate Synapse
+ Synapse-->>TokenStore: STS credentials response
+ deactivate Synapse
+ TokenStore-->>Client: New token (cached)
+ end
+ deactivate TokenStore
+ Client-->>STS: Token
+ deactivate Client
+
+ Note over STS: Parse credentials
+
+ alt output_format == "boto"
+ Note over STS: Format for boto3 client kwargs
+ STS-->>Mixin: {aws_access_key_id, aws_secret_access_key, aws_session_token}
+ else output_format == "json"
+ Note over STS: Return JSON string
+ STS-->>Mixin: JSON credentials string
+ else output_format == "shell" / "bash"
+ Note over STS: Format as export commands
+ STS-->>Mixin: Shell export commands
+ end
+ deactivate STS
+
+ Mixin-->>Entity: Formatted credentials
+ deactivate Mixin
+
+ Entity-->>User: Credentials
+ deactivate Entity
+```
+
+
+
+#### Credential Output Formats
+
+| Format | Description | Use Case |
+|--------|-------------|----------|
+| `boto` | Dict with `aws_access_key_id`, `aws_secret_access_key`, `aws_session_token` | Pass directly to `boto3.client('s3', **creds)` |
+| `json` | JSON string | Store or pass to external tools |
+| `shell` / `bash` | `export AWS_ACCESS_KEY_ID=...` format | Execute in shell |
+| `cmd` | Windows SET commands | Windows command prompt |
+| `powershell` | PowerShell variable assignments | PowerShell scripts |
+
+---
+
+
+
+
+# Part 3: Settings & Infrastructure
+
+This section covers project settings, API architecture, and the async/sync pattern.
+
+
+
+## Project Setting Lifecycle
+
+Project settings control which storage location(s) are used for uploads to an
+entity. The following state diagram shows the lifecycle of a project setting.
+
+```mermaid
+stateDiagram-v2
+ [*] --> NoSetting: Entity created
+
+ NoSetting --> Created: set_storage_location()
+    note right of NoSetting: Inherits from parent or uses Synapse default
+
+ Created --> Updated: set_storage_location() updates existing setting
+ Updated --> Updated: set_storage_location() updates existing setting
+
+ Created --> Deleted: delete_project_setting(project_setting_id)
+ Updated --> Deleted: delete_project_setting(project_setting_id)
+
+ Deleted --> NoSetting: Returns to default (inherits from parent)
+
+ state NoSetting {
+ [*] --> Inherited
+ Inherited: No project setting exists
+ Inherited: Uses parent or Synapse default (ID=1)
+ }
+
+ state Created {
+ [*] --> Active
+ Active: concreteType = UploadDestinationListSetting
+ Active: locations = [storage_location_id]
+ Active: settingsType = "upload"
+ Active: projectId = entity.id
+ Active: Has id and etag
+ }
+
+ state Updated {
+ [*] --> Modified
+ Modified: concreteType = UploadDestinationListSetting
+ Modified: locations = [new_id, ...] (max 10)
+ Modified: settingsType = "upload"
+ Modified: etag updated (OCC)
+ }
+```
+
+
+
+### Setting Types
+
+| Type | Purpose | Status |
+|------|---------|--------|
+| `upload` | Configures upload destination storage location(s) | **Supported** |
+
+Other setting types may be added in the future.
+
+---
+
+
+
+## API Layer Architecture
+
+The storage location services module provides async functions that wrap the
+Synapse REST API endpoints. This layer handles serialization and error handling.
+
+```mermaid
+flowchart TB
+ subgraph "Model Layer"
+ SL[StorageLocation]
+        SLCM[StorageLocationConfigurable Mixin]
+ end
+
+ subgraph "API Layer"
+ create_sls[create_storage_location_setting]
+ get_sls[get_storage_location_setting]
+ get_ps[get_project_setting]
+ create_ps[create_project_setting]
+ update_ps[update_project_setting]
+ delete_ps[delete_project_setting]
+ end
+
+ subgraph "REST Endpoints"
+ POST_SL["POST /storageLocation"]
+ GET_SL["GET /storageLocation/{id}"]
+ GET_PS["GET /projectSettings/{id}/type/{type}"]
+ POST_PS["POST /projectSettings"]
+ PUT_PS["PUT /projectSettings"]
+ DELETE_PS["DELETE /projectSettings/{id}"]
+ end
+
+ SL --> create_sls --> POST_SL
+ SL --> get_sls --> GET_SL
+
+ SLCM --> get_ps --> GET_PS
+ SLCM --> create_ps --> POST_PS
+ SLCM --> update_ps --> PUT_PS
+ SLCM --> delete_ps --> DELETE_PS
+```
+
+
+
+### REST API Reference
+
+| Method | Endpoint | Description |
+|--------|----------|-------------|
+| POST | `/storageLocation` | Create a new storage location setting |
+| GET | `/storageLocation/{id}` | Retrieve a storage location by ID |
+| GET | `/projectSettings/{projectId}/type/{type}` | Get project settings for an entity |
+| POST | `/projectSettings` | Create a new project setting |
+| PUT | `/projectSettings` | Update an existing project setting |
+| DELETE | `/projectSettings/{id}` | Delete a project setting |
+
+---
+
+
+
+## Async/Sync Pattern
+
+The StorageLocation system follows the Python client's `@async_to_sync` pattern,
+providing both async and sync versions of all methods.
+
+```mermaid
+flowchart LR
+ subgraph "User Code"
+ SyncCall["folder.set_storage_location()"]
+ AsyncCall["await folder.set_storage_location_async()"]
+ end
+
+ subgraph "@async_to_sync Decorator"
+ Wrapper["Sync wrapper"]
+ AsyncMethod["Async implementation"]
+ end
+
+ subgraph "Event Loop"
+ RunSync["wrap_async_to_sync()"]
+ AsyncIO["asyncio"]
+ end
+
+ SyncCall --> Wrapper
+ Wrapper --> RunSync
+ RunSync --> AsyncIO
+ AsyncIO --> AsyncMethod
+
+ AsyncCall --> AsyncMethod
+```
+
+
+
+### Method Pairs
+
+| Sync Method | Async Method |
+|-------------|--------------|
+| `StorageLocation.store()` | `StorageLocation.store_async()` |
+| `StorageLocation.get()` | `StorageLocation.get_async()` |
+| `StorageLocation.setup_s3()` | `StorageLocation.setup_s3_async()` |
+| `folder.set_storage_location()` | `folder.set_storage_location_async()` |
+| `folder.get_project_setting()` | `folder.get_project_setting_async()` |
+| `folder.delete_project_setting()` | `folder.delete_project_setting_async()` |
+| `folder.get_sts_storage_token()` | `folder.get_sts_storage_token_async()` |
+| `folder.index_files_for_migration()` | `folder.index_files_for_migration_async()` |
+| `folder.migrate_indexed_files()` | `folder.migrate_indexed_files_async()` |
+
+---
+
+
+
+
+# Part 4: Migration
+
+This section covers the file migration system.
+
+
+
+## Migration Flow
+
+File migration is a two-phase process that first indexes all candidate files and then performs an asynchronous, batched migration that reuses copied file handles where possible, respects concurrency limits, snapshots affected tables when needed, and updates entities and table cells via transactional table operations while recording per-item status in a SQLite database.
+
+```mermaid
+sequenceDiagram
+ participant User
+ participant Entity as Project/Folder
+ participant IndexFn as index_files_for_migration
+ participant DB as SQLite Database
+ participant MigrateFn as migrate_indexed_files
+ participant Synapse as Synapse REST API
+
+ Note over User,Synapse: === Phase 1: Index Files ===
+ User->>Entity: index_files_for_migration_async
+ activate Entity
+
+ Entity->>IndexFn: index_files_for_migration_async(dest_id, source_ids, file_version_strategy, include_table_files)
+ activate IndexFn
+
+ IndexFn->>Synapse: Verify user owns destination storage location
+ Synapse-->>IndexFn: OK / error
+
+ IndexFn->>DB: Create/open DB + ensure schema
+ IndexFn->>DB: Store migration settings (root_id, dest_id, source_ids, file_version_strategy, include_table_files)
+
+ alt Entity is Project/Folder (container)
+ IndexFn->>Synapse: get_children(parent, include_types)
+ Synapse-->>IndexFn: Child references (folders/files/tables)
+
+ loop For each child (bounded concurrency)
+ IndexFn->>Synapse: get_async(child, downloadFile=false)
+ Synapse-->>IndexFn: Child entity
+ IndexFn->>IndexFn: _index_entity_async(child)
+ end
+
+ IndexFn->>DB: Mark container as indexed (PROJECT/FOLDER)
+
+ else Entity is File
+ alt file_version_strategy = new / latest / all
+ IndexFn->>Synapse: Get file handle metadata (and versions if needed)
+ Synapse-->>IndexFn: File handle(s)
+ IndexFn->>DB: Insert/append FILE migration rows (INDEXED and ALREADY_MIGRATED)
+ else file_version_strategy = skip
+ Note over IndexFn: Skip file entities
+ end
+
+ else Entity is Table (include_table_files=true)
+ IndexFn->>Synapse: get_columns(table_id)
+ Synapse-->>IndexFn: Column list
+ IndexFn->>Synapse: Query rows for FILEHANDLEID columns (+ rowId,rowVersion)
+ Synapse-->>IndexFn: Row results (fileHandleId values)
+ loop For each row + file-handle cell (bounded concurrency)
+ IndexFn->>Synapse: get_file_handle_for_download(fileHandleId, objectType=TableEntity)
+ Synapse-->>IndexFn: File handle
+ IndexFn->>DB: Insert TABLE_ATTACHED_FILE migration row (or ALREADY_MIGRATED)
+ end
+ end
+
+ opt continue_on_error=true
+ Note over IndexFn,DB: Indexing errors are recorded in DB instead of aborting
+ end
+
+ IndexFn-->>Entity: MigrationResult (db_path)
+ deactivate IndexFn
+
+ Entity-->>User: MigrationResult
+ deactivate Entity
+
+ Note over User,Synapse: === Phase 2: Migrate Files ===
+ User->>Entity: migrate_indexed_files / migrate_indexed_files_async (db_path)
+ activate Entity
+
+ Entity->>MigrateFn: Start migration
+ activate MigrateFn
+
+ MigrateFn->>DB: Open DB, ensure schema, load settings
+ MigrateFn->>User: Confirm migration (unless force=True)
+ Note over MigrateFn,DB: If not confirmed, abort and return
+
+ loop While there are indexed items
+ MigrateFn->>DB: Query next batch (respecting pending/completed handles & concurrency)
+
+ loop For each item in batch
+ MigrateFn->>MigrateFn: Skip if key or file handle already pending
+
+ MigrateFn->>DB: Check if destination file handle already exists
+ alt Existing copy found
+ Note over MigrateFn,DB: Reuse existing to_file_handle_id
+ else No existing copy
+ MigrateFn->>Synapse: Copy file to new storage (bounded concurrency)
+ Synapse-->>MigrateFn: New to_file_handle_id
+ end
+
+ alt Item is FILE (entity)
+ alt file_version_strategy = new (version is None)
+ MigrateFn->>Synapse: Create new file version with new file handle
+ else specific version
+ MigrateFn->>Synapse: Update existing version's file handle
+ end
+ else Item is TABLE_ATTACHED_FILE
+ alt create_table_snapshots=True
+ MigrateFn->>Synapse: Create table snapshot
+ end
+ MigrateFn->>Synapse: Update table cell via transactional table update (PartialRowSet/TableUpdateTransaction)
+ end
+
+ MigrateFn->>DB: Update row status to MIGRATED/ERRORED
+ end
+
+
+ MigrateFn-->>Entity: MigrationResult (migrated counts)
+ deactivate MigrateFn
+
+ Entity-->>User: MigrationResult
+ deactivate Entity
+```
+
+
+
+### Migration Strategies
+
+| Strategy | Description |
+|----------|-------------|
+| `new` | Create new file versions in destination (default) |
+| `all` | Migrate all versions of each file |
+| `latest` | Only migrate the latest version |
+| `skip` | Do not migrate file entities (e.g. when migrating only table-attached files) |
+
+---
+
+
+
+
+# Learn More
+
+| Resource | Description |
+|----------|-------------|
+| [Storage Location Tutorial](../tutorials/python/storage_location.md) | Step-by-step guide to using storage locations |
+| [StorageLocation API Reference][synapseclient.models.StorageLocation] | Complete API documentation |
+| [StorageLocationConfigurable Mixin][synapseclient.models.mixins.StorageLocationConfigurable] | Mixin methods for Projects and Folders |
+| [Custom Storage Locations (Synapse Docs)](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html) | Official Synapse documentation |
diff --git a/docs/js/mermaid-init.js b/docs/js/mermaid-init.js
new file mode 100644
index 000000000..823cbce57
--- /dev/null
+++ b/docs/js/mermaid-init.js
@@ -0,0 +1,12 @@
+// Initialize Mermaid diagrams
+document.addEventListener("DOMContentLoaded", function() {
+ mermaid.initialize({
+ startOnLoad: true,
+ theme: "default",
+ securityLevel: "loose",
+ flowchart: {
+ useMaxWidth: true,
+ htmlLabels: true
+ }
+ });
+});
diff --git a/docs/reference/experimental/async/folder.md b/docs/reference/experimental/async/folder.md
index 7b29f84ea..fd74e65dd 100644
--- a/docs/reference/experimental/async/folder.md
+++ b/docs/reference/experimental/async/folder.md
@@ -30,3 +30,9 @@ at your own risk.
- get_schema_derived_keys_async
- get_schema_validation_statistics_async
- get_invalid_validation_async
+ - set_storage_location_async
+ - get_project_setting_async
+ - delete_project_setting_async
+ - get_sts_storage_token_async
+ - index_files_for_migration_async
+ - migrate_indexed_files_async
diff --git a/docs/reference/experimental/async/project.md b/docs/reference/experimental/async/project.md
index e3adfa9fc..42803e871 100644
--- a/docs/reference/experimental/async/project.md
+++ b/docs/reference/experimental/async/project.md
@@ -29,3 +29,9 @@ at your own risk.
- get_schema_derived_keys_async
- get_schema_validation_statistics_async
- get_invalid_validation_async
+ - set_storage_location_async
+ - get_project_setting_async
+ - delete_project_setting_async
+ - get_sts_storage_token_async
+ - index_files_for_migration_async
+ - migrate_indexed_files_async
diff --git a/docs/reference/experimental/async/storage_location.md b/docs/reference/experimental/async/storage_location.md
new file mode 100644
index 000000000..cf9630de2
--- /dev/null
+++ b/docs/reference/experimental/async/storage_location.md
@@ -0,0 +1,22 @@
+# StorageLocation
+
+Contained within this file are experimental interfaces for working with the Synapse Python
+Client. Unless otherwise noted these interfaces are subject to change at any time. Use
+at your own risk.
+
+## API Reference
+
+::: synapseclient.models.StorageLocation
+ options:
+ inherited_members: true
+ members:
+ - store_async
+ - get_async
+
+---
+
+::: synapseclient.models.StorageLocationType
+
+---
+
+::: synapseclient.models.UploadType
diff --git a/docs/reference/experimental/mixins/manifest_generatable.md b/docs/reference/experimental/mixins/manifest_generatable.md
new file mode 100644
index 000000000..47aac2a4c
--- /dev/null
+++ b/docs/reference/experimental/mixins/manifest_generatable.md
@@ -0,0 +1,69 @@
+# ManifestGeneratable Mixin
+
+The `ManifestGeneratable` mixin provides manifest TSV file generation and reading capabilities for container entities (Projects and Folders).
+
+## Overview
+
+This mixin enables:
+
+- Generating manifest TSV files after syncing from Synapse
+- Uploading files from manifest TSV files
+- Validating manifest files before upload
+
+## Usage
+
+The mixin is automatically available on `Project` and `Folder` classes:
+
+```python
+from synapseclient.models import Project, Folder
+
+# Project and Folder both have manifest capabilities
+project = Project(id="syn123")
+folder = Folder(id="syn456")
+```
+
+## API Reference
+
+::: synapseclient.models.mixins.manifest.ManifestGeneratable
+ options:
+ show_root_heading: true
+ show_source: false
+ members:
+ - generate_manifest
+ - generate_manifest_async
+ - from_manifest
+ - from_manifest_async
+ - validate_manifest
+ - validate_manifest_async
+ - get_manifest_data
+ - get_manifest_data_async
+
+## Constants
+
+### MANIFEST_FILENAME
+
+The default filename for generated manifests: `SYNAPSE_METADATA_MANIFEST.tsv`
+
+```python
+from synapseclient.models import MANIFEST_FILENAME
+
+print(MANIFEST_FILENAME) # "SYNAPSE_METADATA_MANIFEST.tsv"
+```
+
+### DEFAULT_GENERATED_MANIFEST_KEYS
+
+The default columns included in generated manifest files:
+
+```python
+from synapseclient.models import DEFAULT_GENERATED_MANIFEST_KEYS
+
+print(DEFAULT_GENERATED_MANIFEST_KEYS)
+# ['path', 'parent', 'name', 'id', 'synapseStore', 'contentType',
+# 'used', 'executed', 'activityName', 'activityDescription']
+```
+
+## See Also
+
+- [Manifest Operations Tutorial](../../../tutorials/python/manifest_operations.md)
+- [StorableContainer Mixin](storable_container.md)
+- [Manifest TSV Format](../../../explanations/manifest_tsv.md)
diff --git a/docs/reference/experimental/mixins/storage_location_configurable.md b/docs/reference/experimental/mixins/storage_location_configurable.md
new file mode 100644
index 000000000..3cf29d81a
--- /dev/null
+++ b/docs/reference/experimental/mixins/storage_location_configurable.md
@@ -0,0 +1,54 @@
+# StorageLocationConfigurable
+
+The `StorageLocationConfigurable` mixin provides methods for managing storage locations
+on entities (Projects and Folders).
+
+For architecture diagrams and design documentation, see
+[Storage Location Architecture](../../../explanations/storage_location_architecture.md).
+
+This mixin includes:
+
+- Setting upload storage locations
+- Getting and deleting project settings
+- Obtaining STS credentials for direct S3 access
+- Migrating files to new storage locations
+
+## Methods Overview
+
+| Method | Description |
+|--------|-------------|
+| `set_storage_location` | Set the upload storage location for this entity |
+| `get_project_setting` | Get project settings (upload, external_sync, etc.) |
+| `delete_project_setting` | Delete a project setting |
+| `get_sts_storage_token` | Get STS credentials for direct S3 access |
+| `index_files_for_migration` | Index files for migration to a new storage location |
+| `migrate_indexed_files` | Migrate previously indexed files |
+
+## Usage Example
+
+```python
+from synapseclient.models import Folder, StorageLocation, StorageLocationType
+
+# Create a storage location
+storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket="my-bucket",
+ sts_enabled=True,
+).store()
+
+# Set storage location on a folder
+folder = Folder(id="syn123").get()
+folder.set_storage_location(storage_location_id=storage.storage_location_id)
+
+# Get STS credentials
+credentials = folder.get_sts_storage_token(
+ permission="read_write",
+ output_format="boto",
+)
+```
+
+::: synapseclient.models.mixins.StorageLocationConfigurable
+
+---
+
+::: synapseclient.models.protocols.storage_location_mixin_protocol.StorageLocationConfigurableSynchronousProtocol
diff --git a/docs/reference/experimental/sync/folder.md b/docs/reference/experimental/sync/folder.md
index 43272ea30..c866a727e 100644
--- a/docs/reference/experimental/sync/folder.md
+++ b/docs/reference/experimental/sync/folder.md
@@ -41,3 +41,9 @@ at your own risk.
- get_schema_derived_keys
- get_schema_validation_statistics
- get_invalid_validation
+ - set_storage_location
+ - get_project_setting
+ - delete_project_setting
+ - get_sts_storage_token
+ - index_files_for_migration
+ - migrate_indexed_files
diff --git a/docs/reference/experimental/sync/project.md b/docs/reference/experimental/sync/project.md
index 4e2f35a26..1bb859795 100644
--- a/docs/reference/experimental/sync/project.md
+++ b/docs/reference/experimental/sync/project.md
@@ -40,3 +40,9 @@ at your own risk.
- get_schema_derived_keys
- get_schema_validation_statistics
- get_invalid_validation
+ - set_storage_location
+ - get_project_setting
+ - delete_project_setting
+ - get_sts_storage_token
+ - index_files_for_migration
+ - migrate_indexed_files
diff --git a/docs/reference/experimental/sync/storage_location.md b/docs/reference/experimental/sync/storage_location.md
new file mode 100644
index 000000000..a764c9d7d
--- /dev/null
+++ b/docs/reference/experimental/sync/storage_location.md
@@ -0,0 +1,24 @@
+[](){ #storage-location-reference-sync }
+# StorageLocation
+
+Contained within this file are experimental interfaces for working with the Synapse Python
+Client. Unless otherwise noted these interfaces are subject to change at any time. Use
+at your own risk.
+
+## API Reference
+
+::: synapseclient.models.StorageLocation
+ options:
+ inherited_members: true
+ members:
+ - store
+ - get
+ - setup_s3
+
+---
+
+::: synapseclient.models.StorageLocationType
+
+---
+
+::: synapseclient.models.UploadType
diff --git a/docs/tutorials/python/storage_location.md b/docs/tutorials/python/storage_location.md
new file mode 100644
index 000000000..3e45473c2
--- /dev/null
+++ b/docs/tutorials/python/storage_location.md
@@ -0,0 +1,249 @@
+# Storage Locations in Synapse
+
+Storage locations allow you to configure where files uploaded to Synapse are
+stored. By default, files are stored in Synapse's internal S3 storage, but you
+can configure projects or folders to use your own AWS S3 buckets, Google Cloud
+Storage buckets, or other external storage.
+
+This tutorial demonstrates how to use the Python client to manage storage
+locations using the new object-oriented models.
+
+[Read more about Custom Storage Locations](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html)
+
+## Tutorial Purpose
+In this tutorial you will:
+
+1. Create an external S3 storage location and assign it to a folder
+2. Create a Google Cloud Storage location and assign it to a folder
+3. Create an SFTP storage location and assign it to a folder
+4. Create an HTTPS storage location and assign it to a folder
+5. Create an External Object Store location and assign it to a folder
+6. Create a Proxy storage location, register a proxy file handle, and assign it to a folder
+7. Retrieve and inspect storage location settings
+8. Index and migrate files to a new storage location
+
+## Prerequisites
+
+* Make sure that you have completed the [Installation](../installation.md) and
+ [Authentication](../authentication.md) setup.
+* You must have a [Project](./project.md) created; replace the project name used
+  in this tutorial with your own.
+* An AWS S3 bucket properly configured for use with Synapse, including an
+ `owner.txt` file. See
+ [Custom Storage Locations](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html).
+* (Optional) `boto3` installed for STS credential examples.
+* For SFTP: `pysftp` installed (`pip install "synapseclient[pysftp]"`).
+* For Object Store: AWS credentials configured in your environment.
+* For Proxy: a running proxy server and its shared secret key.
+
+## Understanding Storage Location Types
+
+Synapse supports several types of storage locations:
+
+- **SYNAPSE_S3**: Synapse-managed S3 storage (default)
+- **EXTERNAL_S3**: User-owned AWS S3 bucket, accessed by Synapse on
+ your behalf. Synapse transfers the data for uploads and downloads. Requires
+ an `owner.txt` file in the bucket to verify ownership.
+- **EXTERNAL_GOOGLE_CLOUD**: User-owned Google Cloud Storage bucket
+- **EXTERNAL_SFTP**: External SFTP server
+- **EXTERNAL_HTTPS**: External HTTPS server (uploading via the client is not
+  currently supported)
+- **EXTERNAL_OBJECT_STORE**: An S3-compatible store (e.g., MinIO, OpenStack
+ Swift) that Synapse does **not** access. The client transfers data directly
+ to the object store using credentials configured in your environment; Synapse
+ only stores the file metadata.
+- **PROXY**: A proxy server that controls access to the underlying storage
+
+## Storage Location Settings
+
+Each storage type exposes a different set of configuration fields on
+`StorageLocation`. When you retrieve a stored location, only the fields
+relevant to its type are populated:
+
+| Type | Key fields |
+|------|-----------|
+| `EXTERNAL_S3` | `bucket`, `base_key` |
+| `EXTERNAL_GOOGLE_CLOUD` | `bucket`, `base_key` |
+| `EXTERNAL_SFTP` / `EXTERNAL_HTTPS` | `url`, `supports_subfolders` |
+| `EXTERNAL_OBJECT_STORE` | `bucket`, `endpoint_url` |
+| `PROXY` | `proxy_url`, `secret_key`, `benefactor_id` |
+
+Common attributes across all types are: `concrete_type`, `storage_location_id`, `storage_type`, `upload_type`, `banner`, `description`, `etag`, `created_on`, and `created_by`.
+
+## Data Migration Between Storage Locations
+
+Files in a folder can be migrated from one storage location to another using
+`index_files_for_migration` followed by `migrate_indexed_files`. Migration is
+currently supported only between S3 storage locations (both Synapse-managed
+`SYNAPSE_S3` and external `EXTERNAL_S3`) that reside in the **same AWS
+region**.
+
+Migration is a two-phase process:
+
+1. **Index** — scan the folder and record every file that needs to move into a
+ local SQLite database.
+2. **Migrate** — read the index database and move each file to the destination
+ storage location.
+
+Separating the phases lets you review what will be migrated before committing
+to the move.
+
+## 1. Set up and get project
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=4-15}
+```
+
+## 2. Create an external S3 storage location
+
+Create a storage location backed by your own S3 bucket. The bucket must be
+properly configured with an `owner.txt` file. Synapse will transfer data
+directly to and from this bucket on the user's behalf.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=17-30}
+```
+
+
+ You'll notice the output looks like:
+
+```
+Created storage location: 12345
+storage location type: StorageLocationType.EXTERNAL_S3
+```
+
+
+## 3. Set up a folder with external S3 storage
+
+Create a folder and assign it the S3 storage location. All files uploaded into
+this folder will be stored in your S3 bucket.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=32-40}
+```
+
+## 4. Create a Google Cloud Storage location
+
+Create a storage location backed by a Google Cloud Storage bucket and assign it
+to a folder.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=42-62}
+```
+
+## 5. Create an SFTP storage location
+
+SFTP storage locations point to an external SFTP server. Files are not
+transferred through Synapse — Synapse only stores metadata. Requires the
+`pysftp` package.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=64-87}
+```
+
+## 6. Create an HTTPS storage location
+
+`EXTERNAL_HTTPS` uses the same underlying API type as `EXTERNAL_SFTP` but is
+used when the external server is accessed over HTTPS. Note that the Python
+client does NOT support uploading files to HTTPS storage locations directly yet.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=89-111}
+```
+
+## 7. Create an External Object Store storage location
+
+Use `EXTERNAL_OBJECT_STORE` for S3-compatible stores that are not directly
+accessed by Synapse. Unlike `EXTERNAL_S3`, the Python client transfers data
+directly to the object store using locally configured AWS credentials —
+Synapse is never involved in the data transfer, only in storing the metadata.
+
+You can add a profile for working with S3 in `~/.synapseConfig`.
+
+Add a section whose header matches your endpoint + bucket URL:
+
+[https://s3.us-east-1.amazonaws.com/test-external-object-store]
+profile_name = my-s3-profile
+
+Then ensure `my-s3-profile` exists in `~/.aws/credentials` with valid keys:
+
+[my-s3-profile]
+aws_access_key_id = ...
+aws_secret_access_key = ...
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=113-139}
+```
+
+## 8. Create a Proxy storage location
+
+Proxy storage locations delegate file access to a proxy server that controls
+authentication and access to the underlying storage. Files are registered by
+creating a `ProxyFileHandle` via the REST API. The file can then be added to Synapse by calling `store()` on a `File` with `data_file_handle_id` set.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=141-194}
+```
+
+## 9. Retrieve and inspect storage location settings
+
+You can retrieve a storage location by ID. Only fields relevant to the storage
+type are populated.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=196-204}
+```
+
+
+ You'll notice the output looks like:
+
+```
+Retrieved storage location ID: 12345
+Storage type: StorageLocationType.EXTERNAL_S3
+Bucket: my-synapse-bucket
+Base key: synapse-data
+```
+
+
+## 10. Index and migrate files to a new storage location
+
+> **Warning:** This will migrate files associated with the folder. Run against a
+> test project first and review the index result before migrating production data.
+
+Phase 1 indexes all files that need to move into a local SQLite database. This returns a `MigrationResult` object. You can use its `as_csv` method to check the details of the indexing status.
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=214-221}
+```
+Phase 2 reads that database and performs the actual migration. This also returns a `MigrationResult` object. You can use its `as_csv` method to check the details of the migration status, including any errors.
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!lines=224-234}
+```
+Currently, the detailed traceback is saved in the `exception` column of the CSV.
+
+
+
+## Source code for this tutorial
+
+
+ Click to show me
+
+```python
+{!docs/tutorials/python/tutorial_scripts/storage_location.py!}
+```
+
+
+## References used in this tutorial
+
+- [StorageLocation][synapseclient.models.StorageLocation]
+- [StorageLocationType][synapseclient.models.StorageLocationType]
+- [Folder][synapseclient.models.Folder]
+- [File][synapseclient.models.File]
+- [Project][synapseclient.models.Project]
+- [syn.login][synapseclient.Synapse.login]
+- [Custom Storage Locations Documentation](https://help.synapse.org/docs/Custom-Storage-Locations.2048327803.html)
+
+## See also
+
+- [Storage Location Architecture](../../explanations/storage_location_architecture.md) -
+ In-depth architecture diagrams and design documentation
diff --git a/docs/tutorials/python/tutorial_screenshots/migration_results.png b/docs/tutorials/python/tutorial_screenshots/migration_results.png
new file mode 100644
index 000000000..501a9cbc6
Binary files /dev/null and b/docs/tutorials/python/tutorial_screenshots/migration_results.png differ
diff --git a/docs/tutorials/python/tutorial_scripts/storage_location.py b/docs/tutorials/python/tutorial_scripts/storage_location.py
new file mode 100644
index 000000000..65dd3fc12
--- /dev/null
+++ b/docs/tutorials/python/tutorial_scripts/storage_location.py
@@ -0,0 +1,268 @@
+"""
+Tutorial code for the Storage Location and project settings.
+"""
+import asyncio
+import hashlib
+import json
+import os
+
+import synapseclient
+from synapseclient.models import (
+ File,
+ Folder,
+ Project,
+ StorageLocation,
+ StorageLocationType,
+)
+
+syn = synapseclient.login()
+
+# Step 1: Retrieve the project
+my_project = Project(name="My uniquely named project about Alzheimer's Disease").get()
+
+# Step 2: Create an External S3 Storage Location that is in the same region as the current storage location
+# Replace with your S3 bucket name (must have owner.txt configured)
+MY_BUCKET_NAME = "my-synapse-bucket"
+MY_BASE_KEY = "synapse-data"
+
+external_s3_storage_location = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket=MY_BUCKET_NAME,
+ base_key=MY_BASE_KEY,
+ description="External S3 storage location",
+).store()
+
+print(f"Created storage location: {external_s3_storage_location.storage_location_id}")
+print(f"storage location type: {external_s3_storage_location.storage_type}")
+
+# Step 3. Create a Folder with the new storage location
+external_s3_folder = Folder(name="my-folder-for-external-s3", parent_id=my_project.id)
+external_s3_folder = external_s3_folder.store()
+
+# Set the storage location for the folder
+external_s3_folder.set_storage_location(
+ storage_location_id=external_s3_storage_location.storage_location_id
+)
+external_s3_folder_storage_location = external_s3_folder.get_project_setting()
+# Verify the storage location is set correctly
+assert (
+ external_s3_folder_storage_location["locations"][0]
+ == external_s3_storage_location.storage_location_id
+), "Folder storage location does not match the storage location"
+
+# Step 4: Create a Google Cloud Storage location
+MY_GCS_BUCKET = "my-gcs-bucket"
+MY_GCS_BASE_KEY = "synapse-data"
+gcs_storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_GOOGLE_CLOUD,
+ bucket=MY_GCS_BUCKET,
+ base_key=MY_GCS_BASE_KEY,
+ description="External Google Cloud Storage location",
+).store()
+
+print(f"Created GCS storage location: {gcs_storage.storage_location_id}")
+print(f"storage location type: {gcs_storage.storage_type}")
+
+gcs_folder = Folder(name="my-folder-for-gcs", parent_id=my_project.id)
+gcs_folder = gcs_folder.store()
+
+# Set the storage location for the folder
+gcs_folder.set_storage_location(storage_location_id=gcs_storage.storage_location_id)
+gcs_folder_storage_location = gcs_folder.get_project_setting()
+# Verify the storage location is set correctly
+assert (
+ gcs_folder_storage_location["locations"][0] == gcs_storage.storage_location_id
+), "Folder storage location does not match the storage location"
+
+# Step 5: Create an SFTP storage location
+MY_SFTP_URL = "sftp://your-sftp-server.example.com/upload"
+sftp_storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_SFTP,
+ url=MY_SFTP_URL,
+ supports_subfolders=True,
+ description="External SFTP server",
+).store()
+
+print(f"Created SFTP storage location: {sftp_storage.storage_location_id}")
+print(f"storage location type: {sftp_storage.storage_type}")
+
+sftp_folder = Folder(name="my-folder-for-sftp", parent_id=my_project.id)
+sftp_folder = sftp_folder.store()
+
+# Set the storage location for the folder
+sftp_folder.set_storage_location(storage_location_id=sftp_storage.storage_location_id)
+sftp_folder_storage_location = sftp_folder.get_project_setting()
+# Verify the storage location is set correctly
+assert (
+ sftp_folder_storage_location["locations"][0] == sftp_storage.storage_location_id
+), "Folder storage location does not match the storage location"
+
+# Add a file to the SFTP folder; requires the 'pysftp' package to be installed.
+file = File(path="/path/to/your/file.csv", parent_id=sftp_folder.id)
+file = file.store()
+
+# Step 6: Create an HTTPS storage location
+# EXTERNAL_HTTPS shares the same underlying API type as EXTERNAL_SFTP but is used
+# when the external server is accessed over HTTPS rather than SFTP.
+my_https_folder = Folder(name="my-folder-for-https", parent_id=my_project.id)
+my_https_folder = my_https_folder.store()
+
+my_https_url = "https://my-https-server.example.com"
+
+https_storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_HTTPS,
+ url=my_https_url,
+ description="External HTTPS server",
+).store()
+
+print(f"Created HTTPS storage location: {https_storage.storage_location_id}")
+print(f"storage location type: {https_storage.storage_type}")
+
+my_https_folder.set_storage_location(
+ storage_location_id=https_storage.storage_location_id
+)
+my_https_folder_storage_location = my_https_folder.get_project_setting()
+assert (
+ my_https_folder_storage_location["locations"][0]
+ == https_storage.storage_location_id
+), "Folder storage location does not match the storage location"
+
+# Note: The Python client does not support uploading files directly to HTTPS
+# storage locations. To add files, use the Synapse web UI or REST API directly.
+
+# Step 7: Create an External Object Store storage location
+# Use this for S3-compatible stores (e.g. OpenStack Swift) not accessed by Synapse.
+MY_OBJECT_STORE_BUCKET = "test-external-object-store"
+MY_OBJECT_STORE_ENDPOINT_URL = "https://s3.us-east-1.amazonaws.com"
+
+object_store_storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_OBJECT_STORE,
+ bucket=MY_OBJECT_STORE_BUCKET,
+ endpoint_url=MY_OBJECT_STORE_ENDPOINT_URL,
+ description="External S3-compatible object store",
+).store()
+
+print(f"Created object store location: {object_store_storage.storage_location_id}")
+print(f"storage location type: {object_store_storage.storage_type}")
+
+# create a folder with the object store storage location
+object_store_folder = Folder(name="my-folder-for-object-store", parent_id=my_project.id)
+object_store_folder = object_store_folder.store()
+
+object_store_folder.set_storage_location(
+ storage_location_id=object_store_storage.storage_location_id
+)
+object_store_folder_storage_location = object_store_folder.get_project_setting()
+assert (
+ object_store_folder_storage_location["locations"][0]
+ == object_store_storage.storage_location_id
+), "Folder storage location does not match the storage location"
+
+# Add a file to the object store folder.
+# Requires AWS credentials (access key and secret key) configured in your environment.
+file = File(path="/path/to/your/file.csv", parent_id=object_store_folder.id)
+file = file.store()
+
+# Step 8: Create a Proxy storage location
+# Use this when a proxy server controls access to the underlying storage.
+my_proxy_folder = Folder(name="my-folder-for-proxy", parent_id=my_project.id)
+my_proxy_folder = my_proxy_folder.store()
+MY_PROXY_URL = "https://my-proxy-server.example.com"
+proxy_storage = StorageLocation(
+ storage_type=StorageLocationType.PROXY,
+ proxy_url=MY_PROXY_URL,
+    secret_key="your-proxy-secret-key",  # the shared secret configured on your proxy server
+ benefactor_id=my_project.id,
+ description="Proxy-controlled storage",
+).store()
+
+print(f"Created proxy storage location: {proxy_storage.storage_location_id}")
+print(f" Proxy URL: {proxy_storage.proxy_url}")
+print(f" Benefactor ID: {proxy_storage.benefactor_id}")
+
+my_proxy_folder.set_storage_location(
+ storage_location_id=proxy_storage.storage_location_id
+)
+my_proxy_folder_storage_location = my_proxy_folder.get_project_setting()
+assert (
+ my_proxy_folder_storage_location["locations"][0]
+ == proxy_storage.storage_location_id
+), "Folder storage location does not match the storage location"
+
+# Add a file to the proxy folder; this requires a proxy file handle ID.
+# Create ProxyFileHandle via REST API
+file_path = "/path/to/your/file.csv"
+with open(file_path, "rb") as f:
+ content_md5 = hashlib.md5(f.read(), usedforsecurity=False).hexdigest()
+file_size = os.path.getsize(file_path)
+
+
+async def create_proxy_file_handle():
+ proxy_file_handle = await syn.rest_post_async(
+ "/externalFileHandle/proxy",
+ body=json.dumps(
+ {
+ "concreteType": "org.sagebionetworks.repo.model.file.ProxyFileHandle",
+ "storageLocationId": proxy_storage.storage_location_id,
+ "filePath": "test.csv", # relative path served by your proxy
+ "fileName": "test.csv",
+ "contentType": "text/csv",
+ "contentMd5": content_md5,
+ "contentSize": file_size,
+ }
+ ),
+ endpoint=syn.fileHandleEndpoint,
+ )
+ print(f"File handle ID: {proxy_file_handle['id']}")
+ return proxy_file_handle["id"]
+
+
+proxy_file_handle_id = asyncio.run(create_proxy_file_handle())
+# Associate the ProxyFileHandle with a Synapse File entity
+proxy_file = File(
+ parent_id=my_proxy_folder.id,
+ name="test.csv",
+ data_file_handle_id=proxy_file_handle_id,
+).store()
+print(f"Synapse entity: {proxy_file.id}")
+
+# Step 9: Retrieve and inspect storage location settings
+# Only fields that belong to the storage type are populated after retrieval.
+retrieved_storage = StorageLocation(
+ storage_location_id=external_s3_storage_location.storage_location_id
+).get()
+print(f"Retrieved storage location ID: {retrieved_storage.storage_location_id}")
+print(f"Storage type: {retrieved_storage.storage_type}")
+print(f"Bucket: {retrieved_storage.bucket}")
+print(f"Base key: {retrieved_storage.base_key}")
+
+
+# Step 10: Index and migrate files to the new storage location
+#
+# WARNING: This will actually migrate files associated with the project/folder.
+# Run against a test project first and review the index (MigrationResult) before
+# migrating production data.
+
+# Phase 1: Index files for migration
+my_migration_folder = Folder(
+ name="my-data-migration-folder", parent_id=my_project.id
+).get()
+index_result = my_migration_folder.index_files_for_migration(
+ dest_storage_location_id=external_s3_storage_location.storage_location_id,
+ db_path="/path/to/your/migration.db",
+ include_table_files=False, # Set True if you also want table-attached files
+)
+print(f"Migration index database: {index_result.db_path}")
+print(f"Indexed counts by status: {index_result.counts_by_status}")
+
+# Phase 2: Migrate indexed files
+migrate_result = my_migration_folder.migrate_indexed_files(
+ db_path="/path/to/your/migration.db",
+ continue_on_error=True,
+ force=True, # Skip interactive confirmation for tutorial purposes
+)
+
+if migrate_result is not None:
+ print(f"Migrated counts by status: {migrate_result.counts_by_status}")
+else:
+ print("Migration was aborted (confirmation declined).")
diff --git a/mkdocs.yml b/mkdocs.yml
index eda90ac08..4b52fef08 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -45,8 +45,11 @@ nav:
# - Team: tutorials/python/team.md
- Upload data in bulk: tutorials/python/upload_data_in_bulk.md
- Download data in bulk: tutorials/python/download_data_in_bulk.md
+ - Manifest Operations: tutorials/python/manifest_operations.md
+ - Creating JSON Schema: tutorials/python/schema_operations.md
# - Creating JSON Schema: tutorials/python/schema_operations.md
- Working with JSON Schema: tutorials/python/json_schema.md
+ - Storage Location: tutorials/python/storage_location.md
# - Move Files and Folders: tutorials/python/move_files_and_folders.md
# - Migrate data to other storage locations: tutorials/python/migrate_data_to_other_storage_locations.md
- Working with the Command Line Client: tutorials/command_line_client.md
@@ -112,6 +115,7 @@ nav:
- JSONSchema: reference/experimental/sync/json_schema.md
- Wiki: reference/experimental/sync/wiki.md
- FormGroup and Form: reference/experimental/sync/form.md
+ - StorageLocation: reference/experimental/sync/storage_location.md
- Extensions:
- Curator: reference/extensions/curator.md
- Asynchronous:
@@ -140,15 +144,18 @@ nav:
- JSONSchema: reference/experimental/async/json_schema.md
- Wiki: reference/experimental/async/wiki.md
- FormGroup and Form: reference/experimental/async/form.md
+ - StorageLocation: reference/experimental/async/storage_location.md
- Mixins:
- AccessControllable: reference/experimental/mixins/access_controllable.md
- StorableContainer: reference/experimental/mixins/storable_container.md
+ - ManifestGeneratable: reference/experimental/mixins/manifest_generatable.md
- AsynchronousCommunicator: reference/experimental/mixins/asynchronous_communicator.md
- FailureStrategy: reference/experimental/mixins/failure_strategy.md
- BaseJSONSchema: reference/experimental/mixins/base_json_schema.md
- ContainerEntityJSONSchema: reference/experimental/mixins/container_json_schema.md
- FormData: reference/experimental/mixins/form_data.md
- FormGroup: reference/experimental/mixins/form_group.md
+ - StorageLocationConfigurable: reference/experimental/mixins/storage_location_configurable.md
- Further Reading:
- Home: explanations/home.md
@@ -160,6 +167,7 @@ nav:
- Structuring Your Project: explanations/structuring_your_project.md
- Asyncio Changes in Python 3.14: explanations/asyncio_in_python_3_14.md
- Curator Data model: explanations/curator_data_model.md
+ - Storage Location Architecture: explanations/storage_location_architecture.md
- News:
- news.md
- Contact Us: https://sagebionetworks.jira.com/servicedesk/customer/portal/9/group/16/create/206
@@ -202,6 +210,10 @@ theme:
extra_css:
- css/custom.css
+extra_javascript:
+ - https://unpkg.com/mermaid@10/dist/mermaid.min.js
+ - js/mermaid-init.js
+
plugins:
- search
- mkdocstrings:
diff --git a/synapseclient/api/__init__.py b/synapseclient/api/__init__.py
index 2f9e454ea..36e61117e 100644
--- a/synapseclient/api/__init__.py
+++ b/synapseclient/api/__init__.py
@@ -131,6 +131,14 @@
update_organization_acl,
validate_entity_with_json_schema,
)
+from .storage_location_services import (
+ create_project_setting,
+ create_storage_location_setting,
+ delete_project_setting,
+ get_project_setting,
+ get_storage_location_setting,
+ update_project_setting,
+)
from .table_services import (
ViewEntityType,
ViewTypeMask,
@@ -360,4 +368,11 @@
"create_form_data",
"list_form_data",
"list_form_data_sync",
+ # storage_location_services
+ "create_storage_location_setting",
+ "get_storage_location_setting",
+ "get_project_setting",
+ "create_project_setting",
+ "update_project_setting",
+ "delete_project_setting",
]
diff --git a/synapseclient/api/storage_location_services.py b/synapseclient/api/storage_location_services.py
new file mode 100644
index 000000000..c80070dde
--- /dev/null
+++ b/synapseclient/api/storage_location_services.py
@@ -0,0 +1,178 @@
+"""Services for interacting with storage location settings and project settings in Synapse.
+
+This module provides async REST wrappers for creating, retrieving, and managing
+storage location settings and their associated project settings.
+"""
+
+import json
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+if TYPE_CHECKING:
+ from synapseclient import Synapse
+
+
+async def create_storage_location_setting(
+ request: Dict[str, Any],
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Dict[str, Any]:
+ """Create a new storage location in Synapse that can be linked to a project,
+ allowing users to upload their data to a storage location they own.
+
+ Storage location creation is idempotent per user - if the same user creates
+ a storage location with identical properties, the existing one is returned.
+
+ Arguments:
+        request: The storage location setting request body, matching the REST API's StorageLocationSetting model.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+        The created storage location setting, matching the REST API's StorageLocationSetting model.
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ return await client.rest_post_async(
+ uri="/storageLocation",
+ body=json.dumps(request),
+ )
+
+
+async def get_storage_location_setting(
+ storage_location_id: int,
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Dict[str, Any]:
+ """Retrieve a storage location setting by its ID.
+
+ Only the creator of a StorageLocationSetting can retrieve it by its ID.
+
+ Arguments:
+ storage_location_id: The ID of the storage location setting to retrieve.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+        The retrieved storage location setting, matching the REST API's StorageLocationSetting model.
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ return await client.rest_get_async(
+ uri=f"/storageLocation/{storage_location_id}",
+ )
+
+
+async def get_project_setting(
+ project_id: str,
+ setting_type: str = "upload",
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Optional[Dict[str, Any]]:
+ """Retrieve the project setting of a particular setting type for the project or folder.
+ Only users with READ access on a project can retrieve its project settings.
+
+ Arguments:
+ project_id: The Synapse ID of the project or folder.
+ setting_type: The type of project setting to retrieve. Currently supports 'upload' only.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+        The upload destination list setting, matching the REST API's UploadDestinationListSetting model.
+ If the storage location is Synapse S3, the response will be None.
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ response = await client.rest_get_async(
+ uri=f"/projectSettings/{project_id}/type/{setting_type}",
+ )
+    return (
+        response if response else None
+    )  # if no project setting exists, an empty string is returned as the response
+
+
+async def create_project_setting(
+ request: Dict[str, Any],
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> Dict[str, Any]:
+ """Create a project setting for a project or folder.
+ Only the users with CREATE access to the project or folder can add a project setting.
+    Currently, only the "upload" project setting is supported. This is implemented using the REST API's UploadDestinationListSetting model.
+ A project can have a maximum of 10 storage locations.
+
+ Arguments:
+        request: The project setting request body, matching the REST API's ProjectSetting model.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+        The created project setting, matching the REST API's ProjectSetting model.
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ return await client.rest_post_async(
+ uri="/projectSettings",
+ body=json.dumps(request),
+ )
+
+
+async def update_project_setting(
+ request: Dict[str, Any],
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> None:
+ """Update an existing project setting for a project or folder.
+ Only the users with UPDATE access to the project or folder can update a project setting.
+    Currently, only the "upload" project setting is supported. This is implemented using the REST API's UploadDestinationListSetting model.
+ A project can have a maximum of 10 storage locations.
+
+ Arguments:
+        request: The project setting request body, including the id field, matching the REST API's ProjectSetting model.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ None
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+    await client.rest_put_async(
+ uri="/projectSettings",
+ body=json.dumps(request),
+ )
+
+
+async def delete_project_setting(
+ project_setting_id: str,
+ *,
+ synapse_client: Optional["Synapse"] = None,
+) -> None:
+ """Delete a project setting for a project or folder.
+ Only the users with DELETE access to the project or folder can delete a project setting.
+
+ Arguments:
+ project_setting_id: The ID of the project setting to delete.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ None
+ """
+ from synapseclient import Synapse
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+ await client.rest_delete_async(
+ uri=f"/projectSettings/{project_setting_id}",
+ )
diff --git a/synapseclient/client.py b/synapseclient/client.py
index 2e9c543cb..35d521a27 100644
--- a/synapseclient/client.py
+++ b/synapseclient/client.py
@@ -5512,6 +5512,11 @@ def _createExternalObjectStoreFileHandle(
"/externalFileHandle", json.dumps(file_handle), self.fileHandleEndpoint
)
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `synapseclient.api.post_external_s3_file_handle()` instead.",
+ )
def create_external_s3_file_handle(
self,
bucket_name,
@@ -5650,7 +5655,11 @@ def _getUserCredentials(
# Project/Folder storage location settings #
############################################
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `StorageLocation(...).store()` from synapseclient.models instead.",
+ )
def createStorageLocationSetting(self, storage_type, **kwargs):
"""
Creates an IMMUTABLE storage location based on the specified type.
@@ -5707,7 +5716,12 @@ def createStorageLocationSetting(self, storage_type, **kwargs):
return self.restPOST("/storageLocation", body=json.dumps(kwargs))
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `StorageLocation(storage_location_id=id).get()` from "
+ "synapseclient.models instead.",
+ )
def getMyStorageLocationSetting(self, storage_location_id):
"""
Get a StorageLocationSetting by its id.
@@ -5721,7 +5735,12 @@ def getMyStorageLocationSetting(self, storage_location_id):
"""
return self.restGET("/storageLocation/%s" % storage_location_id)
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `Folder(id=...).set_storage_location(...)` or "
+ "`Project(id=...).set_storage_location(...)` from synapseclient.models instead.",
+ )
def setStorageLocation(self, entity, storage_location_id):
"""
Sets the storage location for a Project or Folder
@@ -5759,7 +5778,12 @@ def setStorageLocation(self, entity, storage_location_id):
"/projectSettings", body=json.dumps(project_destination)
)
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `Folder(id=...).get_project_setting(...)` or "
+ "`Project(id=...).get_project_setting(...)` from synapseclient.models instead.",
+ )
def getProjectSetting(self, project, setting_type):
"""
Gets the ProjectSetting for a project.
@@ -5787,7 +5811,12 @@ def getProjectSetting(self, project, setting_type):
response if response else None
) # if no project setting, a empty string is returned as the response
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `Folder(id=...).get_sts_storage_token(...)` or "
+ "`Project(id=...).get_sts_storage_token(...)` from synapseclient.models instead.",
+ )
def get_sts_storage_token(
self, entity, permission, *, output_format="json", min_remaining_life=None
):
@@ -5820,7 +5849,11 @@ def get_sts_storage_token(
min_remaining_life=min_remaining_life,
)
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `StorageLocation.setup_s3(...)` from synapseclient.models instead.",
+ )
def create_s3_storage_location(
self,
*,
@@ -5862,7 +5895,11 @@ def create_s3_storage_location(
)
)
- # TODO: Deprecate method in https://sagebionetworks.jira.com/browse/SYNPY-1441
+ @deprecated(
+ version="4.12.0",
+ reason="To be removed in 5.0.0. "
+ "Use `StorageLocation.setup_s3_async(...)` from synapseclient.models instead.",
+ )
async def create_s3_storage_location_async(
self,
*,
diff --git a/synapseclient/core/constants/concrete_types.py b/synapseclient/core/constants/concrete_types.py
index fba11dbdb..f34fc3887 100644
--- a/synapseclient/core/constants/concrete_types.py
+++ b/synapseclient/core/constants/concrete_types.py
@@ -9,7 +9,23 @@
EXTERNAL_S3_STORAGE_LOCATION_SETTING = (
"org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting"
)
-# EXTERNAL_GCP_STORAGE_LOCATION_SETTING = 'org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting' # noqa: E501
+EXTERNAL_GCP_STORAGE_LOCATION_SETTING = (
+ "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting"
+)
+EXTERNAL_STORAGE_LOCATION_SETTING = (
+ "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting"
+)
+EXTERNAL_OBJECT_STORAGE_LOCATION_SETTING = (
+ "org.sagebionetworks.repo.model.project.ExternalObjectStorageLocationSetting"
+)
+PROXY_STORAGE_LOCATION_SETTINGS = (
+ "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings"
+)
+
+# Concrete types for ProjectSettings
+UPLOAD_DESTINATION_LIST_SETTING = (
+ "org.sagebionetworks.repo.model.project.UploadDestinationListSetting"
+)
# Concrete types for UploadDestinations
SYNAPSE_S3_UPLOAD_DESTINATION = (
@@ -117,6 +133,14 @@
"org.sagebionetworks.repo.model.curation.metadata.RecordBasedMetadataTaskProperties"
)
+# Download List Types
+DOWNLOAD_LIST_MANIFEST_REQUEST = (
+ "org.sagebionetworks.repo.model.download.DownloadListManifestRequest"
+)
+DOWNLOAD_LIST_MANIFEST_RESPONSE = (
+ "org.sagebionetworks.repo.model.download.DownloadListManifestResponse"
+)
+
# Grid Session Types
CREATE_GRID_REQUEST = "org.sagebionetworks.repo.model.grid.CreateGridRequest"
GRID_RECORD_SET_EXPORT_REQUEST = (
diff --git a/synapseclient/core/remote_file_storage_wrappers.py b/synapseclient/core/remote_file_storage_wrappers.py
index 811cfdbd9..9010392e7 100644
--- a/synapseclient/core/remote_file_storage_wrappers.py
+++ b/synapseclient/core/remote_file_storage_wrappers.py
@@ -316,8 +316,9 @@ def progress_callback(*args, **kwargs) -> None:
progress_bar.update(args[0] - progress_bar.n)
parsedURL = SFTPWrapper._parse_for_sftp(url)
+ port_kwargs = {"port": parsedURL.port} if parsedURL.port else {}
with _retry_pysftp_connection(
- parsedURL.hostname, username=username, password=password
+ parsedURL.hostname, username=username, password=password, **port_kwargs
) as sftp:
sftp.makedirs(parsedURL.path)
with sftp.cd(parsedURL.path):
diff --git a/synapseclient/models/__init__.py b/synapseclient/models/__init__.py
index 7a85b6b83..17d966d0b 100644
--- a/synapseclient/models/__init__.py
+++ b/synapseclient/models/__init__.py
@@ -27,6 +27,11 @@
from synapseclient.models.recordset import RecordSet
from synapseclient.models.schema_organization import JSONSchema, SchemaOrganization
from synapseclient.models.services import FailureStrategy
+from synapseclient.models.storage_location import (
+ StorageLocation,
+ StorageLocationType,
+ UploadType,
+)
from synapseclient.models.submission import Submission
from synapseclient.models.submission_bundle import SubmissionBundle
from synapseclient.models.submission_status import SubmissionStatus
@@ -155,6 +160,10 @@
# Form models
"FormGroup",
"FormData",
+ # Storage Location models
+ "StorageLocation",
+ "StorageLocationType",
+ "UploadType",
]
# Static methods to expose as functions
diff --git a/synapseclient/models/folder.py b/synapseclient/models/folder.py
index a0658f521..9a6dff47e 100644
--- a/synapseclient/models/folder.py
+++ b/synapseclient/models/folder.py
@@ -18,6 +18,9 @@
ContainerEntityJSONSchema,
StorableContainer,
)
+from synapseclient.models.mixins.storage_location_mixin import (
+ StorageLocationConfigurable,
+)
from synapseclient.models.protocols.folder_protocol import FolderSynchronousProtocol
from synapseclient.models.services.search import get_id
from synapseclient.models.services.storable_entity import store_entity
@@ -47,6 +50,7 @@ class Folder(
AccessControllable,
StorableContainer,
ContainerEntityJSONSchema,
+ StorageLocationConfigurable,
):
"""Folder is a hierarchical container for organizing data in Synapse.
diff --git a/synapseclient/models/mixins/__init__.py b/synapseclient/models/mixins/__init__.py
index 62ddcf017..443c34810 100644
--- a/synapseclient/models/mixins/__init__.py
+++ b/synapseclient/models/mixins/__init__.py
@@ -2,6 +2,7 @@
from synapseclient.models.mixins.access_control import AccessControllable
from synapseclient.models.mixins.asynchronous_job import AsynchronousCommunicator
+from synapseclient.models.mixins.enum_coercion import EnumCoercionMixin
from synapseclient.models.mixins.form import (
FormChangeRequest,
FormData,
@@ -21,10 +22,15 @@
ValidationException,
)
from synapseclient.models.mixins.storable_container import StorableContainer
+from synapseclient.models.mixins.storage_location_mixin import (
+ StorageLocationConfigurable,
+)
__all__ = [
"AccessControllable",
+ "EnumCoercionMixin",
"StorableContainer",
+ "StorageLocationConfigurable",
"AsynchronousCommunicator",
"BaseJSONSchema",
"ContainerEntityJSONSchema",
diff --git a/synapseclient/models/mixins/asynchronous_job.py b/synapseclient/models/mixins/asynchronous_job.py
index fd3649bc1..407babe92 100644
--- a/synapseclient/models/mixins/asynchronous_job.py
+++ b/synapseclient/models/mixins/asynchronous_job.py
@@ -14,6 +14,7 @@
AGENT_CHAT_REQUEST,
CREATE_GRID_REQUEST,
CREATE_SCHEMA_REQUEST,
+ DOWNLOAD_LIST_MANIFEST_REQUEST,
GET_VALIDATION_SCHEMA_REQUEST,
GRID_RECORD_SET_EXPORT_REQUEST,
QUERY_BUNDLE_REQUEST,
@@ -29,6 +30,7 @@
ASYNC_JOB_URIS = {
AGENT_CHAT_REQUEST: "/agent/chat/async",
CREATE_GRID_REQUEST: "/grid/session/async",
+ DOWNLOAD_LIST_MANIFEST_REQUEST: "/download/list/manifest/async",
GRID_RECORD_SET_EXPORT_REQUEST: "/grid/export/recordset/async",
TABLE_UPDATE_TRANSACTION_REQUEST: "/entity/{entityId}/table/transaction/async",
GET_VALIDATION_SCHEMA_REQUEST: "/schema/type/validation/async",
diff --git a/synapseclient/models/mixins/enum_coercion.py b/synapseclient/models/mixins/enum_coercion.py
new file mode 100644
index 000000000..5eef2f802
--- /dev/null
+++ b/synapseclient/models/mixins/enum_coercion.py
@@ -0,0 +1,32 @@
+"""Mixin for automatic enum coercion in dataclasses."""
+
+from typing import Any, ClassVar, Dict
+
+
class EnumCoercionMixin:
    """Mixin that transparently converts assigned values to enum members.

    A subclass lists its coercible fields in the class-level ``_ENUM_FIELDS``
    mapping of field name -> enum class. Whenever an attribute named in that
    mapping is assigned a non-None value that is not already an instance of
    the mapped enum, the value is passed through the enum constructor before
    being stored. Attributes not listed (and None values) are stored as-is.

    Example::

        @dataclass
        class MyModel(EnumCoercionMixin):
            _ENUM_FIELDS = {"status": StatusEnum}
            status: Optional[Union[str, StatusEnum]] = None
    """

    # Subclasses override this with their field-name -> enum-class mapping.
    _ENUM_FIELDS: ClassVar[Dict[str, type]] = {}

    def __setattr__(self, name: str, value: Any) -> None:
        target_enum = self._ENUM_FIELDS.get(name)
        needs_coercion = (
            target_enum is not None
            and value is not None
            and not isinstance(value, target_enum)
        )
        super().__setattr__(name, target_enum(value) if needs_coercion else value)
diff --git a/synapseclient/models/mixins/storable_container.py b/synapseclient/models/mixins/storable_container.py
index 25432a6b9..1a2d557f2 100644
--- a/synapseclient/models/mixins/storable_container.py
+++ b/synapseclient/models/mixins/storable_container.py
@@ -159,6 +159,7 @@ async def sync_from_synapse_async(
link_hops: int = 1,
queue: asyncio.Queue = None,
include_types: Optional[List[str]] = None,
+ generate_manifest: str = "suppress",
*,
synapse_client: Optional[Synapse] = None,
) -> Self:
@@ -170,9 +171,8 @@ async def sync_from_synapse_async(
If you only want to retrieve the full tree of metadata about your
container specify `download_file` as False.
- This works similar to [synapseutils.syncFromSynapse][], however, this does not
- currently support the writing of data to a manifest TSV file. This will be a
- future enhancement.
+ This works similar to [synapseutils.syncFromSynapse][] and supports
+ generating a manifest TSV file with file metadata.
Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets,
DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The
@@ -208,6 +208,13 @@ async def sync_from_synapse_async(
`["folder", "file", "table", "entityview", "dockerrepo",
"submissionview", "dataset", "datasetcollection", "materializedview",
"virtualtable"]`.
+ generate_manifest: Controls manifest file generation. Options:
+
+ - "all": Create a manifest in each directory level
+ - "root": Create a single manifest at the root path only
+ - "suppress": (Default) Do not create any manifest files
+
+ A path must be specified for manifest generation.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
@@ -386,7 +393,7 @@ async def my_function():
file_size=1, synapse_client=syn, custom_message=custom_message
):
self._synced_from_synapse = True
- return await self._sync_from_synapse_async(
+ await self._sync_from_synapse_async(
path=path,
recursive=recursive,
download_file=download_file,
@@ -400,6 +407,19 @@ async def my_function():
synapse_client=syn,
)
+ # Generate manifest if requested and path is provided
+ if generate_manifest != "suppress" and path:
+ # The manifest generation is handled by ManifestGeneratable mixin
+ # which provides generate_manifest_async method
+ if hasattr(self, "generate_manifest_async"):
+ await self.generate_manifest_async(
+ path=path,
+ manifest_scope=generate_manifest,
+ synapse_client=syn,
+ )
+
+ return self
+
async def _sync_from_synapse_async(
self: Self,
path: Optional[str] = None,
diff --git a/synapseclient/models/mixins/storage_location_mixin.py b/synapseclient/models/mixins/storage_location_mixin.py
new file mode 100644
index 000000000..a10308ca9
--- /dev/null
+++ b/synapseclient/models/mixins/storage_location_mixin.py
@@ -0,0 +1,450 @@
+"""Mixin for entities that can have their storage location configured."""
+
+import asyncio
+from typing import Any, Dict, List, Optional, Union
+
+from synapseclient import Synapse
+from synapseclient.api.storage_location_services import (
+ create_project_setting,
+ delete_project_setting,
+ get_project_setting,
+ update_project_setting,
+)
+from synapseclient.core.async_utils import async_to_sync, otel_trace_method
+from synapseclient.core.constants import concrete_types
+from synapseclient.models.protocols.storage_location_mixin_protocol import (
+ StorageLocationConfigurableSynchronousProtocol,
+)
+from synapseclient.models.services.migration import (
+ index_files_for_migration_async as _index_files_for_migration_async,
+)
+from synapseclient.models.services.migration import (
+ migrate_indexed_files_async as _migrate_indexed_files_async,
+)
+from synapseclient.models.services.migration_types import MigrationResult
+
+# Default storage location ID used by Synapse
+DEFAULT_STORAGE_LOCATION_ID = 1
+
+
+@async_to_sync
+class StorageLocationConfigurable(StorageLocationConfigurableSynchronousProtocol):
+ """Mixin for objects that can have their storage location configured.
+
+ In order to use this mixin, the class must have an `id` attribute.
+
+ This mixin provides methods for:
+ - Setting and getting the upload storage location for an entity
+ - Getting STS (AWS Security Token Service) credentials for direct S3 access
+ - Migrating files to a new storage location
+ """
+
+ id: Optional[str] = None
+ """The unique immutable ID for this entity."""
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_SetStorageLocation: {self.id}"
+ )
+ async def set_storage_location_async(
+ self,
+ storage_location_id: Optional[Union[int, List[int]]] = None,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Dict[str, Any]:
+ """Set the upload storage location for this entity. This configures where
+ files uploaded to this entity will be stored.
+
+ Arguments:
+ storage_location_id: The storage location ID(s) to set. Can be a single
+ ID, a list of IDs (first is default, max 10), or None to use
+ Synapse default storage.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting dict returned from Synapse.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using this function
+ Set storage location on a folder:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Folder
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder = await Folder(id="syn123").get_async()
+ setting = await folder.set_storage_location_async(
+ storage_location_id=12345
+ )
+ print(setting)
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ if storage_location_id is None:
+ storage_location_id = DEFAULT_STORAGE_LOCATION_ID
+
+ locations = (
+ storage_location_id
+ if isinstance(storage_location_id, list)
+ else [storage_location_id]
+ )
+
+ existing_setting = await get_project_setting(
+ project_id=self.id,
+ setting_type="upload",
+ synapse_client=synapse_client,
+ )
+
+ if existing_setting is not None:
+ existing_setting["locations"] = locations
+ await update_project_setting(
+ request=existing_setting,
+ synapse_client=synapse_client,
+ )
+ return await get_project_setting(
+ project_id=self.id,
+ setting_type="upload",
+ synapse_client=synapse_client,
+ )
+ else:
+ project_destination = {
+ "concreteType": concrete_types.UPLOAD_DESTINATION_LIST_SETTING,
+ "settingsType": "upload",
+ "locations": locations,
+ "projectId": self.id,
+ }
+ return await create_project_setting(
+ request=project_destination,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_GetProjectSetting: {self.id}"
+ )
+ async def get_project_setting_async(
+ self,
+ setting_type: str = "upload",
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional[Dict[str, Any]]:
+ """Get the project setting for this entity.
+
+ Arguments:
+ setting_type: The type of setting to retrieve. One of:
+ 'upload', 'external_sync', 'requester_pays'. Default: 'upload'.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting as a dictionary, or None if no setting exists.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using this function
+ Get the upload settings for a folder:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Folder
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder = await Folder(id="syn123").get_async()
+ setting = await folder.get_project_setting_async(setting_type="upload")
+ if setting:
+ print(f"Storage locations: {setting.get('locations')}")
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ if setting_type not in {"upload", "external_sync", "requester_pays"}:
+ raise ValueError(f"Invalid setting_type: {setting_type}")
+
+ return await get_project_setting(
+ project_id=self.id,
+ setting_type=setting_type,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_DeleteProjectSetting: {self.id}"
+ )
+ async def delete_project_setting_async(
+ self,
+ setting_id: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> None:
+ """Delete a project setting by its setting ID.
+
+ Arguments:
+ setting_id: The ID of the project setting to delete.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ None
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using this function
+ Delete the upload settings for a folder:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Folder
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder = await Folder(id="syn123").get_async()
+ setting = await folder.get_project_setting_async(setting_type="upload")
+ if setting:
+ await folder.delete_project_setting_async(setting_id=setting['id'])
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ await delete_project_setting(
+ setting_id=setting_id,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_GetStsStorageToken: {self.id}"
+ )
+ async def get_sts_storage_token_async(
+ self,
+ permission: str,
+ *,
+ output_format: str = "json",
+ min_remaining_life: Optional[int] = None,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Any:
+ """Get STS (AWS Security Token Service) credentials for direct access to
+ the storage location backing this entity. These credentials can be used
+ with AWS tools like awscli and boto3.
+
+ Arguments:
+ permission: The permission level for the token. Must be 'read_only'
+ or 'read_write'.
+ output_format: The output format for the credentials. Options:
+ 'json' (default), 'boto', 'shell', 'bash', 'cmd', 'powershell'.
+ min_remaining_life: The minimum remaining life (in seconds) for a
+ cached token before a new one is fetched.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The STS credentials in the requested format.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using credentials with boto3
+ Get STS credentials for an STS-enabled folder and use with boto3:
+
+ import asyncio
+ import boto3
+ from synapseclient import Synapse
+ from synapseclient.models import Folder
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ folder = await Folder(id="syn123").get_async()
+ credentials = await folder.get_sts_storage_token_async(
+ permission="read_write",
+ output_format="boto",
+ )
+ s3_client = boto3.client('s3', **credentials)
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ from synapseclient.core import sts_transfer
+
+ client = Synapse.get_client(synapse_client=synapse_client)
+
+ return await asyncio.to_thread(
+ sts_transfer.get_sts_credentials,
+ client,
+ self.id,
+ permission,
+ output_format=output_format,
+ min_remaining_life=min_remaining_life,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_IndexFilesForMigration: {self.id}"
+ )
+ async def index_files_for_migration_async(
+ self,
+ dest_storage_location_id: int,
+ db_path: Optional[str] = None,
+ *,
+ source_storage_location_ids: Optional[List[int]] = None,
+ file_version_strategy: str = "new",
+ include_table_files: bool = False,
+ continue_on_error: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> MigrationResult:
+ """Index files in this entity for migration to a new storage location.
+
+ This is the first step in migrating files to a new storage location.
+ After indexing, use `migrate_indexed_files` to perform the actual migration.
+
+ Arguments:
+ dest_storage_location_id: The destination storage location ID.
+ db_path: Path to the SQLite database file for tracking migration state.
+ If not provided, a temporary directory will be used. The path
+ can be retrieved from the returned MigrationResult.db_path.
+ source_storage_location_ids: Optional list of source storage location IDs
+ to filter which files to migrate. If None, all files are indexed.
+ file_version_strategy: Strategy for handling file versions. Options:
+ 'new' (default) - create new versions, 'all' - migrate all versions,
+ 'latest' - only migrate latest version, 'skip' - skip if file exists.
+ include_table_files: Whether to include files attached to tables.
+ continue_on_error: Whether to continue indexing if an error occurs.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A MigrationResult object containing indexing statistics and the database
+ path (accessible via result.db_path).
+
+ Example: Indexing files for migration
+ Index files in a project for migration:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Project
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ project = await Project(id="syn123").get_async()
+ result = await project.index_files_for_migration_async(
+ dest_storage_location_id=12345,
+ )
+ print(f"Database path: {result.db_path}")
+ print(f"Indexed {result.counts_by_status}")
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ return await _index_files_for_migration_async(
+ self,
+ dest_storage_location_id=str(dest_storage_location_id),
+ db_path=db_path,
+ source_storage_location_ids=(
+ [str(s) for s in source_storage_location_ids]
+ if source_storage_location_ids
+ else None
+ ),
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ continue_on_error=continue_on_error,
+ synapse_client=synapse_client,
+ )
+
+ @otel_trace_method(
+ method_to_trace_name=lambda self, **kwargs: f"Entity_MigrateIndexedFiles: {self.id}"
+ )
+ async def migrate_indexed_files_async(
+ self,
+ db_path: str,
+ *,
+ create_table_snapshots: bool = True,
+ continue_on_error: bool = False,
+ force: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional[MigrationResult]:
+ """Migrate files that have been indexed with `index_files_for_migration`.
+
+ This is the second step in migrating files to a new storage location.
+ Files must first be indexed using `index_files_for_migration`.
+
+ Arguments:
+ db_path: Path to the SQLite database file created by
+ `index_files_for_migration`. You can get this from the
+ MigrationResult.db_path returned by index_files_for_migration.
+ create_table_snapshots: Whether to create table snapshots before
+ migrating table files.
+ continue_on_error: Whether to continue migration if an error occurs.
+ force: Whether to force migration of files that have already been
+ migrated. Also bypasses interactive confirmation.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A MigrationResult object containing migration statistics, or None
+ if the user declined the confirmation prompt.
+
+ Example: Migrating indexed files
+ Migrate previously indexed files:
+
+ import asyncio
+ from synapseclient import Synapse
+ from synapseclient.models import Project
+
+ syn = Synapse()
+ syn.login()
+
+ async def main():
+ project = await Project(id="syn123").get_async()
+
+ # Index first
+ index_result = await project.index_files_for_migration_async(
+ dest_storage_location_id=12345,
+ )
+
+ # Then migrate using the db_path from index result
+ result = await project.migrate_indexed_files_async(
+ db_path=index_result.db_path,
+ force=True, # Skip interactive confirmation
+ )
+ print(f"Migrated {result.counts_by_status}")
+
+ asyncio.run(main())
+ """
+ if not self.id:
+ raise ValueError("The entity must have an id set.")
+
+ return await _migrate_indexed_files_async(
+ db_path=db_path,
+ create_table_snapshots=create_table_snapshots,
+ continue_on_error=continue_on_error,
+ force=force,
+ synapse_client=synapse_client,
+ )
diff --git a/synapseclient/models/project.py b/synapseclient/models/project.py
index a1a6a1c21..d5a4479c2 100644
--- a/synapseclient/models/project.py
+++ b/synapseclient/models/project.py
@@ -18,6 +18,9 @@
ContainerEntityJSONSchema,
StorableContainer,
)
+from synapseclient.models.mixins.storage_location_mixin import (
+ StorageLocationConfigurable,
+)
from synapseclient.models.protocols.project_protocol import ProjectSynchronousProtocol
from synapseclient.models.services.search import get_id
from synapseclient.models.services.storable_entity import store_entity
@@ -46,6 +49,7 @@ class Project(
AccessControllable,
StorableContainer,
ContainerEntityJSONSchema,
+ StorageLocationConfigurable,
):
"""A Project is a top-level container for organizing data in Synapse.
diff --git a/synapseclient/models/protocols/download_list_protocol.py b/synapseclient/models/protocols/download_list_protocol.py
new file mode 100644
index 000000000..7152d4bf1
--- /dev/null
+++ b/synapseclient/models/protocols/download_list_protocol.py
@@ -0,0 +1,97 @@
+"""Protocol for the specific methods of download list classes that have synchronous counterparts
+generated at runtime."""
+
+from typing import Any, Dict, Optional, Protocol
+
+from typing_extensions import Self
+
+from synapseclient import Synapse
+
+
class DownloadListManifestRequestSynchronousProtocol(Protocol):
    """Protocol describing the synchronous counterparts, generated at
    runtime, of the asynchronous download-list manifest methods."""

    def send_job_and_wait(
        self,
        post_exchange_args: Optional[Dict[str, Any]] = None,
        timeout: int = 120,
        *,
        synapse_client: Optional[Synapse] = None,
    ) -> Self:
        """Submit the manifest-generation job to the Asynchronous Job service
        and block until it finishes.

        On success the instance's `result_file_handle_id` attribute is
        populated.

        Arguments:
            post_exchange_args: Additional arguments to pass to the request.
            timeout: Seconds to wait for the job to complete or make progress
                before a SynapseTimeoutError is raised. Defaults to 120.
            synapse_client: If not passed in and caching was not disabled by
                `Synapse.allow_client_caching(False)` this will use the last created
                instance from the Synapse class constructor.

        Returns:
            This instance with `result_file_handle_id` populated.

        Raises:
            SynapseTimeoutError: If the job does not complete within the timeout.
            SynapseError: If the job fails.

        Example: Generate a manifest
            Generate a manifest from the download list:

                from synapseclient.models import DownloadListManifestRequest
                import synapseclient

                synapseclient.login()

                request = DownloadListManifestRequest()
                request.send_job_and_wait()
                print(f"Manifest file handle: {request.result_file_handle_id}")
        """
        return self

    def download_manifest(
        self,
        download_path: str,
        *,
        synapse_client: Optional[Synapse] = None,
    ) -> str:
        """Download the generated manifest file to a local directory.

        Call this only after `send_job_and_wait()` has completed and
        `result_file_handle_id` is populated.

        Arguments:
            download_path: Local directory where the manifest will be saved.
            synapse_client: If not passed in and caching was not disabled by
                `Synapse.allow_client_caching(False)` this will use the last created
                instance from the Synapse class constructor.

        Returns:
            The full path to the downloaded manifest file.

        Raises:
            ValueError: If no manifest has been generated yet (missing
                result_file_handle_id).

        Example: Download the manifest after generation
            Generate and download a manifest:

                from synapseclient.models import DownloadListManifestRequest
                import synapseclient

                synapseclient.login()

                request = DownloadListManifestRequest()
                request.send_job_and_wait()

                manifest_path = request.download_manifest(download_path="/path/to/download")
                print(f"Manifest downloaded to: {manifest_path}")
        """
        return ""
diff --git a/synapseclient/models/protocols/storable_container_protocol.py b/synapseclient/models/protocols/storable_container_protocol.py
index 0352132d1..245836adf 100644
--- a/synapseclient/models/protocols/storable_container_protocol.py
+++ b/synapseclient/models/protocols/storable_container_protocol.py
@@ -29,6 +29,7 @@ def sync_from_synapse(
link_hops: int = 1,
queue: asyncio.Queue = None,
include_types: Optional[List[str]] = None,
+ generate_manifest: str = "suppress",
*,
synapse_client: Optional[Synapse] = None,
) -> Self:
@@ -40,9 +41,8 @@ def sync_from_synapse(
If you only want to retrieve the full tree of metadata about your
container specify `download_file` as False.
- This works similar to [synapseutils.syncFromSynapse][], however, this does not
- currently support the writing of data to a manifest TSV file. This will be a
- future enhancement.
+ This works similar to [synapseutils.syncFromSynapse][] and supports
+ generating a manifest TSV file with file metadata.
Supports syncing Files, Folders, Tables, EntityViews, SubmissionViews, Datasets,
DatasetCollections, MaterializedViews, and VirtualTables from Synapse. The
@@ -74,6 +74,13 @@ def sync_from_synapse(
include_types: Must be a list of entity types (ie. ["folder","file"]) which
can be found
[here](https://rest-docs.synapse.org/rest/org/sagebionetworks/repo/model/EntityType.html)
+ generate_manifest: Controls manifest file generation. Options:
+
+ - "all": Create a manifest in each directory level
+ - "root": Create a single manifest at the root path only
+ - "suppress": (Default) Do not create any manifest files
+
+ A path must be specified for manifest generation.
synapse_client: If not passed in and caching was not disabled by
`Synapse.allow_client_caching(False)` this will use the last created
instance from the Synapse class constructor.
diff --git a/synapseclient/models/protocols/storage_location_mixin_protocol.py b/synapseclient/models/protocols/storage_location_mixin_protocol.py
new file mode 100644
index 000000000..7403972a6
--- /dev/null
+++ b/synapseclient/models/protocols/storage_location_mixin_protocol.py
@@ -0,0 +1,279 @@
+"""Protocol for the specific methods of StorageLocationConfigurable mixin that have
+synchronous counterparts generated at runtime."""
+
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Protocol, Union
+
+from synapseclient import Synapse
+
+if TYPE_CHECKING:
+ from synapseclient.models.services.migration_types import MigrationResult
+
+
+class StorageLocationConfigurableSynchronousProtocol(Protocol):
+ """
+ The protocol for methods that are asynchronous but also
+ have a synchronous counterpart that may also be called.
+ """
+
+ def set_storage_location(
+ self,
+ storage_location_id: Optional[Union[int, List[int]]] = None,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Dict[str, Any]:
+ """Set the upload storage location for this entity. This configures where
+ files uploaded to this entity will be stored.
+
+ Arguments:
+ storage_location_id: The storage location ID(s) to set. Can be a single
+ ID, a list of IDs (first is default, max 10), or None to use
+ Synapse default storage.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting dict returned from Synapse.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Setting storage location on a folder
+ Set storage location on a folder:
+
+ from synapseclient.models import Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ folder = Folder(id="syn123").get()
+ setting = folder.set_storage_location(storage_location_id=12345)
+ print(setting)
+ """
+ return {}
+
+ def get_project_setting(
+ self,
+ setting_type: str = "upload",
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional[Dict[str, Any]]:
+ """Get the project setting for this entity.
+
+ Arguments:
+ setting_type: The type of setting to retrieve. One of:
+ 'upload', 'external_sync', 'requester_pays'. Default: 'upload'.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The project setting as a dictionary, or None if no setting exists.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Getting project settings
+ Get the upload settings for a folder:
+
+ from synapseclient.models import Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ folder = Folder(id="syn123").get()
+ setting = folder.get_project_setting(setting_type="upload")
+ if setting:
+ print(f"Storage locations: {setting.get('locations')}")
+ """
+ return {}
+
+ def delete_project_setting(
+ self,
+ setting_id: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> None:
+ """Delete a project setting by its setting ID.
+
+ Arguments:
+ setting_id: The ID of the project setting to delete.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ None
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Deleting a project setting
+ Delete the upload settings for a folder:
+
+ from synapseclient.models import Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ folder = Folder(id="syn123").get()
+ setting = folder.get_project_setting(setting_type="upload")
+ if setting:
+ folder.delete_project_setting(setting_id=setting['id'])
+ """
+ return None
+
+ def get_sts_storage_token(
+ self,
+ permission: str,
+ *,
+ output_format: str = "json",
+ min_remaining_life: Optional[int] = None,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Any:
+ """Get STS (AWS Security Token Service) credentials for direct access to
+ the storage location backing this entity. These credentials can be used
+ with AWS tools like awscli and boto3.
+
+ Arguments:
+ permission: The permission level for the token. Must be 'read_only'
+ or 'read_write'.
+ output_format: The output format for the credentials. Options:
+ 'json' (default), 'boto', 'shell', 'bash', 'cmd', 'powershell'.
+ min_remaining_life: The minimum remaining life (in seconds) for a
+ cached token before a new one is fetched.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The STS credentials in the requested format.
+
+ Raises:
+ ValueError: If the entity does not have an id set.
+
+ Example: Using credentials with boto3
+ Get STS credentials for an STS-enabled folder and use with boto3:
+
+ import boto3
+ from synapseclient.models import Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ folder = Folder(id="syn123").get()
+ credentials = folder.get_sts_storage_token(
+ permission="read_write",
+ output_format="boto",
+ )
+ s3_client = boto3.client('s3', **credentials)
+ """
+ return {}
+
+ def index_files_for_migration(
+ self,
+ dest_storage_location_id: int,
+ db_path: Optional[str] = None,
+ *,
+ source_storage_location_ids: Optional[List[int]] = None,
+ file_version_strategy: str = "new",
+ include_table_files: bool = False,
+ continue_on_error: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> "MigrationResult":
+ """Index files in this entity for migration to a new storage location.
+
+ This is the first step in migrating files to a new storage location.
+ After indexing, use `migrate_indexed_files` to perform the actual migration.
+
+ Arguments:
+ dest_storage_location_id: The destination storage location ID.
+ db_path: Path to the SQLite database file for tracking migration state.
+ If not provided, a temporary directory will be used. The path
+ can be retrieved from the returned MigrationResult.db_path.
+ source_storage_location_ids: Optional list of source storage location IDs
+ to filter which files to migrate. If None, all files are indexed.
+ file_version_strategy: Strategy for handling file versions. Options:
+ 'new' (default) - create new versions, 'all' - migrate all versions,
+ 'latest' - only migrate latest version, 'skip' - skip if file exists.
+ include_table_files: Whether to include files attached to tables.
+ continue_on_error: Whether to continue indexing if an error occurs.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A MigrationResult object containing indexing statistics and the database
+ path (accessible via result.db_path).
+
+ Example: Indexing files for migration
+ Index files in a project for migration:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ project = Project(id="syn123").get()
+ result = project.index_files_for_migration(
+ dest_storage_location_id=12345,
+ )
+ print(f"Database path: {result.db_path}")
+ print(f"Indexed {result.counts_by_status}")
+ """
+ return None
+
+ def migrate_indexed_files(
+ self,
+ db_path: str,
+ *,
+ create_table_snapshots: bool = True,
+ continue_on_error: bool = False,
+ force: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Optional["MigrationResult"]:
+ """Migrate files that have been indexed with `index_files_for_migration`.
+
+ This is the second step in migrating files to a new storage location.
+ Files must first be indexed using `index_files_for_migration`.
+
+ Arguments:
+ db_path: Path to the SQLite database file created by
+ `index_files_for_migration`. You can get this from the
+ MigrationResult.db_path returned by index_files_for_migration.
+ create_table_snapshots: Whether to create table snapshots before
+ migrating table files.
+ continue_on_error: Whether to continue migration if an error occurs.
+ force: Whether to force migration of files that have already been
+ migrated. Also bypasses interactive confirmation.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A MigrationResult object containing migration statistics, or None
+ if the user declined the confirmation prompt.
+
+ Example: Migrating indexed files
+ Migrate previously indexed files:
+
+ from synapseclient.models import Project
+
+ import synapseclient
+ synapseclient.login()
+
+ project = Project(id="syn123").get()
+
+ # Index first
+ index_result = project.index_files_for_migration(
+ dest_storage_location_id=12345,
+ )
+
+ # Then migrate using the db_path from index result
+ result = project.migrate_indexed_files(
+ db_path=index_result.db_path,
+ force=True, # Skip interactive confirmation
+ )
+ print(f"Migrated {result.counts_by_status}")
+ """
+ return None
diff --git a/synapseclient/models/protocols/storage_location_protocol.py b/synapseclient/models/protocols/storage_location_protocol.py
new file mode 100644
index 000000000..e602daaa6
--- /dev/null
+++ b/synapseclient/models/protocols/storage_location_protocol.py
@@ -0,0 +1,159 @@
+"""Protocol for the specific methods of StorageLocation that have synchronous counterparts
+generated at runtime."""
+
+from typing import TYPE_CHECKING, Optional, Protocol, Tuple
+
+from synapseclient import Synapse
+
+if TYPE_CHECKING:
+ from synapseclient.models import Folder
+ from synapseclient.models.storage_location import StorageLocation
+
+
+class StorageLocationSynchronousProtocol(Protocol):
+ """
+ The protocol for methods that are asynchronous but also
+ have a synchronous counterpart that may also be called.
+ """
+
+ def store(
+ self,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> "StorageLocation":
+ """Create this storage location in Synapse. Storage locations are immutable;
+ this always creates a new one. If a storage location with identical properties
+ already exists for this user, the existing one is returned (idempotent).
+
+ Arguments:
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The StorageLocation object with server-assigned fields populated.
+
+ Raises:
+ ValueError: If `storage_type` is not set.
+
+ Example: Creating an external S3 storage location
+ Create a storage location backed by your own S3 bucket:
+
+ from synapseclient.models import StorageLocation, StorageLocationType
+
+ import synapseclient
+ synapseclient.login()
+
+ storage = StorageLocation(
+ storage_type=StorageLocationType.EXTERNAL_S3,
+ bucket="my-external-synapse-bucket",
+ base_key="path/within/bucket",
+ ).store()
+
+ print(f"Storage location ID: {storage.storage_location_id}")
+ """
+ return self
+
+ def get(
+ self,
+ *,
+ synapse_client: Optional[Synapse] = None,
+ ) -> "StorageLocation":
+ """Retrieve this storage location from Synapse by its ID. Only the creator of
+ a StorageLocationSetting can retrieve it by its id.
+
+ Arguments:
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ The StorageLocation object populated with data from Synapse.
+
+ Raises:
+ ValueError: If `storage_location_id` is not set.
+
+ Example: Retrieving a storage location
+ Retrieve a storage location by ID:
+
+ from synapseclient.models import StorageLocation
+
+ import synapseclient
+ synapseclient.login()
+
+ storage = StorageLocation(storage_location_id=12345).get()
+ print(f"Type: {storage.storage_type}, Bucket: {storage.bucket}")
+ """
+ return self
+
+ @classmethod
+ def setup_s3(
+ cls,
+ *,
+ parent: str,
+ folder_name: Optional[str] = None,
+ folder: Optional["Folder"] = None,
+ bucket_name: Optional[str] = None,
+ base_key: Optional[str] = None,
+ sts_enabled: bool = False,
+ synapse_client: Optional[Synapse] = None,
+ ) -> Tuple["Folder", "StorageLocation"]:
+ """Convenience method to create a folder backed by S3 storage. This will:
+
+ 1. Create or retrieve the folder
+ 2. Create the storage location setting
+ 3. Apply the storage location to the folder via project settings
+
+ Arguments:
+ parent: The parent project or folder ID (e.g., "syn123").
+ folder_name: Name for a new folder. Either `folder_name` or `folder`
+ must be provided.
+ folder: An existing Folder object or Synapse ID. Either `folder_name`
+ or `folder` must be provided.
+ bucket_name: The S3 bucket name. If None, uses Synapse default storage.
+ base_key: The base key (prefix) within the bucket. Optional.
+ sts_enabled: Whether to enable STS credentials for this storage location.
+ Default: False.
+ synapse_client: If not passed in and caching was not disabled by
+ `Synapse.allow_client_caching(False)` this will use the last created
+ instance from the Synapse class constructor.
+
+ Returns:
+ A tuple of (Folder, StorageLocation).
+
+ Raises:
+ ValueError: If neither `folder_name` nor `folder` is provided, or if both
+ are provided.
+
+ Example: Creating an STS-enabled folder with external S3 storage
+ Create a folder with STS-enabled storage:
+
+ from synapseclient.models import StorageLocation
+
+ import synapseclient
+ synapseclient.login()
+
+ folder, storage = StorageLocation.setup_s3(
+ folder_name="my-sts-folder",
+ parent="syn123",
+ bucket_name="my-external-synapse-bucket",
+ base_key="path/within/bucket",
+ sts_enabled=True,
+ )
+ print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}")
+
+ Example: Using an existing folder
+ Apply S3 storage to an existing folder:
+
+ from synapseclient.models import StorageLocation, Folder
+
+ import synapseclient
+ synapseclient.login()
+
+ existing_folder = Folder(id="syn456").get()
+ folder, storage = StorageLocation.setup_s3(
+ folder=existing_folder,
+ bucket_name="my-bucket",
+ )
+ """
+ return None
diff --git a/synapseclient/models/services/__init__.py b/synapseclient/models/services/__init__.py
index d1e7227ca..5ff746bab 100644
--- a/synapseclient/models/services/__init__.py
+++ b/synapseclient/models/services/__init__.py
@@ -1,3 +1,15 @@
+from synapseclient.models.services.migration import (
+ index_files_for_migration_async,
+ migrate_indexed_files_async,
+)
+from synapseclient.models.services.migration_types import (
+ MigrationError,
+ MigrationKey,
+ MigrationResult,
+ MigrationSettings,
+ MigrationStatus,
+ MigrationType,
+)
from synapseclient.models.services.search import get_id
from synapseclient.models.services.storable_entity import store_entity
from synapseclient.models.services.storable_entity_components import (
@@ -5,4 +17,17 @@
store_entity_components,
)
-__all__ = ["store_entity_components", "store_entity", "FailureStrategy", "get_id"]
+__all__ = [
+ "store_entity_components",
+ "store_entity",
+ "FailureStrategy",
+ "get_id",
+ "index_files_for_migration_async",
+ "migrate_indexed_files_async",
+ "MigrationResult",
+ "MigrationStatus",
+ "MigrationType",
+ "MigrationKey",
+ "MigrationSettings",
+ "MigrationError",
+]
diff --git a/synapseclient/models/services/migration.py b/synapseclient/models/services/migration.py
new file mode 100644
index 000000000..e13056821
--- /dev/null
+++ b/synapseclient/models/services/migration.py
@@ -0,0 +1,1518 @@
+"""
+Asynchronous service for indexing and migrating entities between storage locations.
+
+This module provides native async implementations of the indexing and migration functionality
+"""
+
+import asyncio
+import collections.abc
+import json
+import logging
+import os
+import sys
+import tempfile
+import traceback
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ AsyncGenerator,
+ Dict,
+ List,
+ Optional,
+ Set,
+ Tuple,
+ Union,
+)
+
+from synapseclient.api.entity_services import get_children
+from synapseclient.api.file_services import get_file_handle_for_download_async
+from synapseclient.api.table_services import get_columns
+from synapseclient.core import utils
+from synapseclient.core.constants import concrete_types
+from synapseclient.core.exceptions import SynapseError
+from synapseclient.core.upload.multipart_upload import MAX_NUMBER_OF_PARTS
+from synapseclient.core.upload.multipart_upload_async import multipart_copy_async
+from synapseclient.models.table_components import (
+ AppendableRowSetRequest,
+ PartialRow,
+ PartialRowSet,
+ TableUpdateTransaction,
+)
+
+from .migration_types import (
+ IndexingError,
+ MigrationError,
+ MigrationKey,
+ MigrationResult,
+ MigrationSettings,
+ MigrationStatus,
+ MigrationType,
+)
+
+if TYPE_CHECKING:
+ from synapseclient.models import Table, query_async
+import sqlite3
+
+from synapseclient import Synapse
+from synapseclient.api import get_entity_type, rest_get_paginated_async
+from synapseclient.entity import Entity
+from synapseclient.operations import FileOptions, get_async
+
+# Default part size for multipart copy (100 MB)
+# we use a much larger default part size for part copies than we would for part uploads.
+# with part copies the data transfer is within AWS, so we don't need to concern ourselves
+# with upload failures of the actual bytes.
+# this value aligns with what some AWS client libraries use e.g.
+# https://github.com/aws/aws-sdk-java/blob/57ed2e4bd57e08f316bf5c6c71f6fd82a27fa240/aws-java-sdk-s3/src/main/java/com/amazonaws/services/s3/transfer/TransferManagerConfiguration.java#L46
+DEFAULT_PART_SIZE = 100 * utils.MB
+
+# Batch size used to chunk bulk database operations.
+BATCH_SIZE = 500
+
+# Maximum number of concurrent file copies.
+MAX_CONCURRENT_FILE_COPIES = max(int(Synapse().max_threads / 2), 1)
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Indexing Helper Functions
+# =============================================================================
+async def _verify_storage_location_ownership_async(
+ storage_location_id: str,
+ *,
+ synapse_client: Optional[Synapse] = None,
+) -> None:
+ """Verify the user owns the destination storage location.
+ Only the creator of the storage location can retrieve it by its ID.
+
+ Arguments:
+ storage_location_id: The storage location ID to verify.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
+
+ Raises:
+ ValueError: If the user does not own the storage location.
+ """
+ try:
+ await synapse_client.rest_get_async(f"/storageLocation/{storage_location_id}")
+ except SynapseError:
+ raise ValueError(
+ f"Unable to verify ownership of storage location {storage_location_id}. "
+ f"You must be the creator of the destination storage location."
+ )
+
+
+def _get_default_db_path(entity_id: str) -> str:
+ """Generate a default temp database path for migration tracking.
+
+ Arguments:
+ entity_id: The Synapse entity ID being migrated.
+
+ Returns:
+ Path to a SQLite database file in a temp directory.
+ """
+ temp_dir = tempfile.mkdtemp(prefix="synapse_migration_")
+ return os.path.join(temp_dir, f"migration_{entity_id}.db")
+
+
+async def _get_version_numbers_async(
+ entity_id: str,
+ synapse_client: "Synapse",
+) -> AsyncGenerator[int, None]:
+ """Get all version numbers for an entity.
+
+ Arguments:
+ entity_id: The entity ID.
+ synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
+
+ Yields:
+ Version numbers.
+ """
+ async for version_info in rest_get_paginated_async(
+ f"/entity/{entity_id}/version", synapse_client=synapse_client
+ ):
+ yield version_info["versionNumber"]
+
+
+def _escape_column_name(column: Union[str, collections.abc.Mapping]) -> str:
+ """Escape a column name for use in a Synapse table query statement.
+ Arguments:
+ column: A string column name or a dictionary with a 'name' key.
+ Returns:
+ Escaped column name wrapped in double quotes.
+ """
+ col_name = (
+ column["name"] if isinstance(column, collections.abc.Mapping) else str(column)
+ )
+ escaped_name = col_name.replace('"', '""')
+ return f'"{escaped_name}"'
+
+
+def _join_column_names(columns: List[Any]) -> str:
+ """Join column names into a comma-delimited list for table queries.
+ Arguments:
+ columns: A list of column names or column objects with 'name' keys.
+ Returns:
+ Comma-separated string of escaped column names.
+ """
+ return ",".join(_escape_column_name(c) for c in columns)
+
+
+def _check_indexed(cursor: sqlite3.Cursor, entity_id: str) -> bool:
+ """Check if an entity has already been indexed.
+ If so, reindexing can be skipped.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ entity_id: The entity ID to check.
+
+ Returns:
+ True if the entity is already indexed.
+ """
+ indexed_row = cursor.execute(
+ "select 1 from migrations where id = ?", (entity_id,)
+ ).fetchone()
+
+ if indexed_row:
+ logger.debug("%s already indexed, skipping", entity_id)
+ return True
+
+ logger.debug("%s not yet indexed, indexing now", entity_id)
+ return False
+
+
+# =============================================================================
+# Database Helper Functions
+# =============================================================================
+def _ensure_schema(cursor: sqlite3.Cursor) -> None:
+ """Ensure the SQLite database has the required schema.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ """
+ # migration_settings table
+ # A table to store parameters used to create the index.
+ cursor.execute(
+ "CREATE TABLE IF NOT EXISTS migration_settings (settings TEXT NOT NULL)"
+ )
+
+ # Migrations table
+ # The representation of migratable file handles is flat including both file entities
+ # and table attached files, so not all columns are applicable to both. row id and col id
+ # are only used by table attached files.
+ cursor.execute(
+ """
+ CREATE TABLE IF NOT EXISTS migrations (
+ id TEXT NOT NULL,
+ type INTEGER NOT NULL,
+ version INTEGER NULL,
+ row_id INTEGER NULL,
+ col_id INTEGER NULL,
+ parent_id NULL,
+ status INTEGER NOT NULL,
+ exception TEXT NULL,
+ from_storage_location_id NULL,
+ from_file_handle_id TEXT NULL,
+ to_file_handle_id TEXT NULL,
+ file_size INTEGER NULL,
+ PRIMARY KEY (id, type, row_id, col_id, version)
+ )
+ """
+ )
+
+ # Index the status column for faster status-based lookups
+ cursor.execute("CREATE INDEX IF NOT EXISTS ix_status ON migrations(status)")
+ # Index the from_file_handle_id and to_file_handle_id columns for faster file handle-based lookups
+ # This is used to see if there is already a migrated copy of a file handle before doing a copy
+ cursor.execute(
+ "CREATE INDEX IF NOT EXISTS ix_file_handle_ids "
+ "ON migrations(from_file_handle_id, to_file_handle_id)"
+ )
+
+
+def _prepare_migration_db(
+ conn: sqlite3.Connection,
+ cursor: sqlite3.Cursor,
+ db_path: str,
+ root_id: str,
+ dest_storage_location_id: str,
+ source_storage_location_ids: List[str],
+ file_version_strategy: str,
+ include_table_files: bool,
+) -> None:
+ """Prepare the migration database by checking the migration settings for the given parameters.
+ This is a guardrail: it binds a given SQLite index to the specific entity and migration options it was created with, enabling safe resumption and preventing mismatched reuse.
+
+ Arguments:
+ conn: The connection to the SQLite database.
+ cursor: The cursor to the SQLite database.
+ db_path: Path to the SQLite database file.
+ root_id: The root entity ID being migrated.
+ dest_storage_location_id: Destination storage location ID.
+ source_storage_location_ids: List of source storage location IDs to filter.
+ file_version_strategy: Strategy for handling file versions.
+ include_table_files: Whether to include table-attached files.
+ """
+ current_settings = MigrationSettings(
+ root_id=root_id,
+ dest_storage_location_id=dest_storage_location_id,
+ source_storage_location_ids=source_storage_location_ids,
+ file_version_strategy=file_version_strategy,
+ include_table_files=include_table_files,
+ )
+ existing_settings = _retrieve_index_settings(cursor)
+
+ if existing_settings:
+ current_settings.verify_migration_settings(existing_settings, db_path)
+ else:
+ cursor.execute(
+ "INSERT INTO migration_settings (settings) VALUES (?)",
+ (json.dumps(current_settings.to_dict()),),
+ )
+
+ conn.commit()
+
+
+def _retrieve_index_settings(cursor: sqlite3.Cursor) -> Optional[MigrationSettings]:
+ """Retrieve index settings from the database as a MigrationSettings instance.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+
+ Returns:
+ MigrationSettings if a row exists, None otherwise.
+ """
+ row = cursor.execute("SELECT settings FROM migration_settings").fetchone()
+ if row:
+ return MigrationSettings.from_dict(json.loads(row[0]))
+ return None
+
+
+def _insert_file_migration(
+ cursor: sqlite3.Cursor,
+ insert_values: List[
+ Tuple[str, str, Optional[int], Optional[str], int, str, int, MigrationStatus]
+ ],
+) -> None:
+ """Insert a file migration entry to the migrations database.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ insert_values: List of tuples containing the file migration data.
+ """
+ cursor.executemany(
+ """
+ insert into migrations (
+ id,
+ type,
+ version,
+ parent_id,
+ from_storage_location_id,
+ from_file_handle_id,
+ file_size,
+ status
+ ) values (?, ?, ?, ?, ?, ?, ?, ?)
+ """,
+ insert_values,
+ )
+
+
+def _insert_table_file_migration(
+ cursor: sqlite3.Cursor,
+ insert_values: List[
+ Tuple[str, str, Optional[int], Optional[str], int, str, int, MigrationStatus]
+ ],
+) -> None:
+ """Insert a table-attached file migration entry.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ insert_values: List of tuples containing the table-attached file migration data.
+ """
+ cursor.executemany(
+ """
+ INSERT OR IGNORE INTO migrations (
+ id, type, row_id, col_id, version, parent_id,
+ from_storage_location_id, from_file_handle_id,
+ file_size, status
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ """,
+ insert_values,
+ )
+
+
+def _mark_container_indexed(
+ cursor: sqlite3.Cursor,
+ entity_id: str,
+ migration_type: MigrationType,
+ parent_id: Optional[str],
+) -> None:
+ """Mark a container (Project or Folder) as indexed.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ entity_id: The Synapse ID of the container entity.
+ migration_type: The MigrationType of the container.
+ parent_id: The Synapse ID of the parent entity.
+ """
+ cursor.execute(
+ "INSERT OR IGNORE INTO migrations (id, type, parent_id, status) VALUES (?, ?, ?, ?)",
+ [entity_id, migration_type, parent_id, MigrationStatus.INDEXED.value],
+ )
+
+
+def _record_indexing_error(
+ cursor: sqlite3.Cursor,
+ entity_id: str,
+ migration_type: MigrationType,
+ parent_id: Optional[str],
+ tb_str: str,
+) -> None:
+ """Record an indexing error in the database.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ entity_id: The Synapse ID of the entity that failed.
+ migration_type: The MigrationType of the entity.
+ parent_id: The Synapse ID of the parent entity.
+ tb_str: The traceback string.
+ """
+ cursor.execute(
+ """
+ insert into migrations (
+ id,
+ type,
+ parent_id,
+ status,
+ exception
+ ) values (?, ?, ?, ?, ?)
+ """,
+ (
+ entity_id,
+ migration_type,
+ parent_id,
+ MigrationStatus.ERRORED.value,
+ tb_str,
+ ),
+ )
+
+
+# =============================================================================
+# Migration Helper Functions
+# =============================================================================
+def _check_file_handle_exists(
+ cursor: sqlite3.Cursor, from_file_handle_id: str
+) -> Optional[str]:
+ """Check if a file handle has already been copied.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ from_file_handle_id: The source file handle ID.
+
+ Returns:
+ The destination file handle ID if found, None otherwise.
+ """
+ row = cursor.execute(
+ "SELECT to_file_handle_id FROM migrations WHERE from_file_handle_id = ? AND to_file_handle_id IS NOT NULL",
+ (from_file_handle_id,),
+ ).fetchone()
+ return row[0] if row else None
+
+
+def _query_migration_batch(
+ cursor: sqlite3.Cursor,
+ last_key: MigrationKey,
+ pending_file_handle_ids: Set[str],
+ completed_file_handle_ids: Set[str],
+ limit: int,
+) -> List[Dict[str, Any]]:
+ """Query the next batch of items to migrate.
+
+ This matches the original synapseutils query logic:
+ - Forward progress through entities ordered by id, type, row_id, col_id, version
+ - Backtracking to pick up files with completed file handles that were skipped
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ last_key: The last processed MigrationKey.
+ pending_file_handle_ids: Set of file handle IDs currently being processed.
+ completed_file_handle_ids: Set of file handle IDs that have already been migrated.
+ limit: Maximum number of items to return.
+
+ Returns:
+ List of migration entries as dictionaries.
+ """
+ query_kwargs = {
+ "indexed_status": MigrationStatus.INDEXED.value,
+ "id": last_key.id,
+ "file_type": MigrationType.FILE.value,
+ "table_type": MigrationType.TABLE_ATTACHED_FILE.value,
+ "version": last_key.version,
+ "row_id": last_key.row_id,
+ "col_id": last_key.col_id,
+ "limit": limit,
+ }
+
+ # Build the IN clauses for file handles
+ pending = "('" + "','".join(pending_file_handle_ids) + "')"
+ completed = "('" + "','".join(completed_file_handle_ids) + "')"
+
+ # Query the next batch of items to migrate.
+ # 1. Forward progress: entities after the current position
+ # 2. Backtracking: entities before current position that share completed file handles
+ results = cursor.execute(
+ f"""
+ SELECT
+ id,
+ type,
+ version,
+ row_id,
+ col_id,
+ from_file_handle_id,
+ file_size
+ FROM migrations
+ WHERE
+ status = :indexed_status
+ AND (
+ (
+ ((id > :id AND type IN (:file_type, :table_type))
+ OR (id = :id AND type = :file_type AND version IS NOT NULL AND version > :version)
+ OR (id = :id AND type = :table_type AND (row_id > :row_id OR (row_id = :row_id AND col_id > :col_id))))
+ AND from_file_handle_id NOT IN {pending}
+ ) OR
+ (
+ id <= :id
+ AND from_file_handle_id IN {completed}
+ )
+ )
+ ORDER BY
+ id,
+ type,
+ row_id,
+ col_id,
+ version
+ LIMIT :limit
+ """, # noqa
+ query_kwargs,
+ )
+
+ batch = []
+ for row in results:
+ batch.append(
+ {
+ "id": row[0],
+ "type": row[1],
+ "version": row[2],
+ "row_id": row[3],
+ "col_id": row[4],
+ "from_file_handle_id": row[5],
+ "file_size": row[6],
+ }
+ )
+ return batch
+
+
+def _update_migration_database(
+ conn: sqlite3.Connection,
+ cursor: sqlite3.Cursor,
+ key: MigrationKey,
+ to_file_handle_id: str,
+ status: MigrationStatus,
+ exception: Optional[Exception] = None,
+) -> None:
+ """Update a migration database record as successful or errored.
+
+ Arguments:
+ conn: The connection to the SQLite database.
+ cursor: The cursor object from the connection to the SQLite database.
+ key: The migration key.
+ to_file_handle_id: The destination file handle ID.
+ status: The migration status.
+ exception: The exception that occurred.
+ """
+ tb_str = (
+ "".join(
+ traceback.format_exception(
+ type(exception), exception, exception.__traceback__
+ )
+ )
+ if exception
+ else None
+ )
+
+ update_sql = """
+ UPDATE migrations SET
+ status = ?,
+ to_file_handle_id = ?,
+ exception = ?
+ WHERE
+ id = ?
+ AND type = ?
+ """
+ update_args = [status, to_file_handle_id, tb_str, key.id, key.type.value]
+ for arg in ("version", "row_id", "col_id"):
+ arg_value = getattr(key, arg)
+ if arg_value is not None:
+ update_sql += "and {} = ?\n".format(arg)
+ update_args.append(arg_value)
+ else:
+ update_sql += "and {} is null\n".format(arg)
+
+ cursor.execute(update_sql, tuple(update_args))
+ conn.commit()
+
+
+def _confirm_migration(
+ cursor: sqlite3.Cursor, dest_storage_location_id: str, force: bool = False
+) -> bool:
+ """Confirm migration with user if in interactive mode.
+
+ Arguments:
+ cursor: The cursor object from the connection to the SQLite database.
+ dest_storage_location_id: Destination storage location ID.
+ force: If running in an interactive shell, migration requires an interactice confirmation.
+ This can be bypassed by using the force=True option. Defaults to False.
+
+ Returns:
+ True if migration should proceed, False otherwise.
+ """
+
+ if force:
+ return True
+
+ count = cursor.execute(
+ "SELECT count(*) FROM migrations WHERE status = ?",
+ (MigrationStatus.INDEXED.value,),
+ ).fetchone()[0]
+
+ if count == 0:
+ logger.info("No items for migration.")
+ return False
+
+ if sys.stdout.isatty():
+ user_input = input(
+ f"{count} items for migration to {dest_storage_location_id}. Proceed? (y/n)? "
+ )
+ return user_input.strip().lower() == "y"
+ else:
+ logger.info(
+ "%s items for migration. "
+ "force option not used, and console input not available to confirm migration, aborting. "
+ "Use the force option or run from an interactive shell to proceed with migration.",
+ count,
+ )
+ return False
+
+
def _get_part_size(file_size: int) -> int:
    """Calculate the part size for a multipart copy.

    Arguments:
        file_size: The file size in bytes.

    Returns:
        The part size in bytes: at least DEFAULT_PART_SIZE, but large enough
        that the copy never needs more than MAX_NUMBER_OF_PARTS parts.
    """
    # Integer ceiling division: avoids the float round-trip of
    # math.ceil(file_size / n), which can lose precision for sizes > 2**53.
    min_part_size = -(-file_size // MAX_NUMBER_OF_PARTS)
    return max(DEFAULT_PART_SIZE, min_part_size)
+
+
def _get_file_migration_status(
    file_handle: Any,
    source_storage_location_ids: List[str],
    dest_storage_location_id: str,
) -> Optional[int]:
    """
    Determine whether a file should be included in the migrations database
    and return its migration status value.

    Only S3 file handles are considered for migration. Other handle types
    (e.g., external URLs) are ignored.

    A file is included according to the following rules:
    - If the file is already stored in the destination location, it is included
      and marked as ALREADY_MIGRATED.
    - If `source_storage_location_ids` is provided, the file's current storage
      location must be in that list to be included.
    - If `source_storage_location_ids` is empty, all files not already at the
      destination are included.

    Arguments:
        file_handle: File handle metadata, accessed via attributes
            (`concrete_type`, `storage_location_id`).
        source_storage_location_ids: Storage location IDs that qualify as
            migration sources. If empty, all source locations are considered.
        dest_storage_location_id: Destination storage location ID.

    Returns:
        The integer value of the MigrationStatus (ALREADY_MIGRATED or INDEXED)
        if the file should be included in the migrations database, or
        None if the file should not be included.
    """
    # Only S3 file handles can be migrated
    if file_handle.concrete_type != concrete_types.S3_FILE_HANDLE:
        return None

    # Normalize to str so the comparison with the (string) destination id is
    # type-stable regardless of how the handle stores the location id.
    current_storage_location_id = str(file_handle.storage_location_id)

    if current_storage_location_id == dest_storage_location_id:
        return MigrationStatus.ALREADY_MIGRATED.value

    if source_storage_location_ids:
        if current_storage_location_id not in source_storage_location_ids:
            return None

    return MigrationStatus.INDEXED.value
+
+
+# =============================================================================
+# Indexing Functions
+# =============================================================================
async def index_files_for_migration_async(
    entity: Entity,
    dest_storage_location_id: str,
    db_path: Optional[str] = None,
    *,
    source_storage_location_ids: Optional[List[str]] = None,
    file_version_strategy: str = "new",
    include_table_files: bool = False,
    continue_on_error: bool = False,
    synapse_client: Optional[Synapse] = None,
) -> MigrationResult:
    """Index files for migration to a new storage location.

    This is the first step in migrating files to a new storage location. This function itself does not modify the given entity but only updates the migrations and migration_settings tables in the SQLite database.
    After indexing, use `migrate_indexed_files_async` to perform the actual migration.

    Arguments:
        entity: The Synapse entity to migrate (Project, Folder, File, or Table). If it is a container (a Project or Folder), its contents will be recursively indexed.
        dest_storage_location_id: The destination storage location ID.
        db_path: A path on disk where the SQLite index database will be created. Must be on a volume with enough space for metadata of all indexed contents. If not provided, a temporary directory will be created and the path will be returned in the MigrationResult object.
        source_storage_location_ids: Optional list of source storage location IDs that will be migrated. If provided, files outside of one of the listed storage locations will not be indexed for migration. If not provided, then all files not already in the destination storage location will be indexed for migration.
        file_version_strategy: Strategy to migrate file versions: "new", "all", "latest", "skip".
            - `new`: will create a new version of file entities in the new storage location, leaving existing versions unchanged
            - `all`: all existing versions will be migrated in place to the new storage location
            - `latest`: the latest version will be migrated in place to the new storage location
            - `skip`: skip migrating file entities. use this e.g. if wanting to e.g. migrate table attached files in a container while leaving the files unchanged

        include_table_files: Whether to include files attached to tables. If False (default) then e.g. only
            file entities in the container will be migrated and tables will be untouched.
        continue_on_error: Whether any errors encountered while indexing an entity will be raised
            or instead just recorded in the index while allowing the index creation
            to continue. Defaults to False.
        synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.

    Returns:
        A MigrationResult object that can be used to inspect the contents of the index or output the index to a CSV for manual inspection.

    Raises:
        ValueError: If the file_version_strategy is invalid or if skipping both file entities and table attached files.
    """
    client = Synapse.get_client(synapse_client=synapse_client)

    # Normalize here instead of using a mutable [] default argument, which
    # would be a single list object shared across all calls.
    source_storage_location_ids = list(source_storage_location_ids or [])

    # Validate parameters
    valid_file_version_strategy = {"new", "all", "latest", "skip"}
    if file_version_strategy not in valid_file_version_strategy:
        raise ValueError(
            f"Invalid file_version_strategy: {file_version_strategy}, "
            f"must be one of {valid_file_version_strategy}"
        )

    if file_version_strategy == "skip" and not include_table_files:
        raise ValueError(
            "Skipping both file entities and table attached files, nothing to migrate"
        )

    # Verify ownership of the destination before doing any indexing work.
    await _verify_storage_location_ownership_async(
        storage_location_id=dest_storage_location_id,
        synapse_client=client,
    )

    entity_id = utils.id_of(entity)

    # Create database path if not provided
    if db_path is None:
        db_path = _get_default_db_path(entity_id)

    # Initialize database
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        _ensure_schema(cursor)
        # Persist the settings so a later migrate run (or a re-index against
        # the same db file) can validate it is operating on the same index.
        _prepare_migration_db(
            conn=conn,
            cursor=cursor,
            db_path=db_path,
            root_id=entity_id,
            dest_storage_location_id=dest_storage_location_id,
            source_storage_location_ids=source_storage_location_ids,
            file_version_strategy=file_version_strategy,
            include_table_files=include_table_files,
        )
        try:
            await _index_entity_async(
                conn=conn,
                cursor=cursor,
                entity=entity,
                parent_id=None,
                dest_storage_location_id=dest_storage_location_id,
                source_storage_location_ids=source_storage_location_ids,
                file_version_strategy=file_version_strategy,
                include_table_files=include_table_files,
                continue_on_error=continue_on_error,
                synapse_client=client,
            )
        except IndexingError as ex:
            logger.exception(
                f"Aborted due to failure to index entity {ex.entity_id} of type {ex.concrete_type}. "
                "Use continue_on_error=True to skip individual failures."
            )
            # Surface the underlying failure rather than the wrapper; the
            # wrapper only exists to carry entity context for the log above.
            raise ex.__cause__

    return MigrationResult(db_path=db_path, synapse_client=client)
+
+
+# =============================================================================
+# Indexing Implementation
+# =============================================================================
async def _index_entity_async(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    entity: Entity,
    parent_id: Optional[str],
    dest_storage_location_id: str,
    source_storage_location_ids: List[str],
    file_version_strategy: str,
    include_table_files: bool,
    continue_on_error: bool,
    *,
    synapse_client: "Synapse",
) -> None:
    """Recursively index an entity and its children into the migrations database.

    Dispatches on the entity's concrete type: files and tables are indexed
    directly, while containers (projects/folders) recurse into their children.
    Entities already present in the index are skipped, which makes re-runs
    against the same database file resumable.

    Arguments:
        conn: The connection to the SQLite database.
        cursor: The cursor object from the connection to the SQLite database.
        entity: The Synapse entity object.
        parent_id: The parent entity Synapse ID (None for the root entity).
        dest_storage_location_id: Destination storage location ID.
        source_storage_location_ids: List of source storage locations.
        file_version_strategy: Strategy for file versions ("new", "all", "latest", "skip").
        include_table_files: Whether to include table-attached files.
        continue_on_error: If True, record indexing failures in the database
            and keep going; if False, abort by raising an IndexingError.
        synapse_client: The Synapse client.

    Raises:
        IndexingError: Wraps the underlying failure (as __cause__) when
            continue_on_error is False.
    """
    entity_id = utils.id_of(entity)
    retrieved_entity = await get_entity_type(entity_id=entity_id)
    concrete_type = retrieved_entity.type

    # Check if already indexed (supports resuming a previous indexing run)
    is_indexed = _check_indexed(cursor, entity_id)
    try:
        if not is_indexed:
            if concrete_type == concrete_types.FILE_ENTITY:
                if file_version_strategy != "skip":
                    await _index_file_entity_async(
                        cursor=cursor,
                        entity=entity,
                        parent_id=parent_id,
                        dest_storage_location_id=dest_storage_location_id,
                        source_storage_location_ids=source_storage_location_ids,
                        file_version_strategy=file_version_strategy,
                        synapse_client=synapse_client,
                    )

            elif concrete_type == concrete_types.TABLE_ENTITY:
                if include_table_files:
                    await _index_table_entity_async(
                        cursor=cursor,
                        entity_id=entity_id,
                        parent_id=parent_id,
                        dest_storage_location_id=dest_storage_location_id,
                        source_storage_location_ids=source_storage_location_ids,
                        synapse_client=synapse_client,
                    )

            elif concrete_type in (
                concrete_types.FOLDER_ENTITY,
                concrete_types.PROJECT_ENTITY,
            ):
                await _index_container_async(
                    conn=conn,
                    cursor=cursor,
                    entity_id=entity_id,
                    parent_id=parent_id,
                    dest_storage_location_id=dest_storage_location_id,
                    source_storage_location_ids=source_storage_location_ids,
                    file_version_strategy=file_version_strategy,
                    include_table_files=include_table_files,
                    continue_on_error=continue_on_error,
                    synapse_client=synapse_client,
                )
        # NOTE: this commit sits outside the `if not is_indexed` guard, so
        # every visit flushes pending writes even when the entity was skipped.
        conn.commit()

    except IndexingError:
        # this is a recursive function, we don't need to log the error at every level so just
        # pass up exceptions of this type that wrap the underlying exception and indicate
        # that they were already logged
        raise
    except Exception as ex:
        if continue_on_error:
            # Record the failure against this entity and keep indexing.
            logger.warning(f"Error indexing entity {entity_id}: {ex}")
            tb_str = "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))
            migration_type = MigrationType.from_concrete_type(concrete_type).value
            _record_indexing_error(cursor, entity_id, migration_type, parent_id, tb_str)
        else:
            # Wrap with entity context; the top-level caller logs it once.
            raise IndexingError(entity_id, concrete_type) from ex
+
+
async def _index_file_entity_async(
    cursor: sqlite3.Cursor,
    entity: Entity,
    parent_id: Optional[str],
    dest_storage_location_id: str,
    source_storage_location_ids: List[str],
    file_version_strategy: str,
    *,
    synapse_client: "Synapse",
) -> None:
    """Index a file entity for migration.

    Builds a list of (entity, version) pairs according to the version
    strategy, then inserts a migration row for each pair whose file handle is
    eligible (see _get_file_migration_status).

    Arguments:
        cursor: The cursor object from the connection to the SQLite database.
        entity: The Synapse entity object, a File.
        parent_id: The parent entity Synapse ID.
        dest_storage_location_id: Destination storage location ID.
        source_storage_location_ids: List of source storage locations.
        file_version_strategy: Strategy for file versions ("new", "all",
            "latest", "skip"). A version of None means "create a new version"
            at migration time; "skip" is handled by the caller and results in
            no rows here.
        synapse_client: The Synapse client.
    """
    entity_id = utils.id_of(entity)
    logger.info("Indexing file entity %s", entity_id)

    entity_versions: List[Tuple[Any, Optional[int]]] = []

    if file_version_strategy == "new":
        # version None marks "create a new version with the copied handle".
        entity_versions.append((entity, None))

    elif file_version_strategy == "all":
        async for version in _get_version_numbers_async(entity_id, synapse_client):
            # NOTE(review): get_async is not given `version`, so this appears
            # to fetch the latest entity (and its file handle) once per
            # version number rather than each specific version -- confirm
            # whether get_async accepts a version argument and whether this
            # should pass it. Also note this rebinds the `entity` parameter.
            entity = await get_async(
                synapse_id=entity_id,
                file_options=FileOptions(download_file=False),
                synapse_client=synapse_client,
            )
            entity_versions.append((entity, version))

    elif file_version_strategy == "latest":
        entity_versions.append((entity, entity.version_number))

    insert_values = []
    for entity, version in entity_versions:
        # status is the MigrationStatus value, or None if not eligible.
        status = _get_file_migration_status(
            entity.file_handle, source_storage_location_ids, dest_storage_location_id
        )
        if status:
            insert_values.append(
                (
                    entity_id,
                    MigrationType.FILE.value,
                    version,
                    parent_id,
                    entity.file_handle.storage_location_id,
                    entity.data_file_handle_id,
                    entity.file_handle.content_size,
                    status,
                )
            )
    if insert_values:
        _insert_file_migration(cursor, insert_values)
+
+
async def _get_table_file_handle_rows_async(
    entity_id: str,
    *,
    synapse_client: "Synapse",
) -> "AsyncGenerator[Tuple[int, int, Dict[str, Any]], None]":
    """Yield the table file handle rows for a given entity.

    This is an async generator: rows are streamed one at a time rather than
    collected into a list (the previous List[...] return annotation was
    incorrect).

    Arguments:
        entity_id: The table entity ID.
        synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.

    Yields:
        Tuples of (row ID, row version, dict mapping column ID to file handle).
        Rows are only yielded when the table has at least one FILEHANDLEID
        column; cells with no file handle are omitted from the dict.
    """
    # Get file handle columns using the async API
    columns = await get_columns(table_id=entity_id, synapse_client=synapse_client)
    file_handle_columns = [c for c in columns if c.column_type == "FILEHANDLEID"]

    if file_handle_columns:
        # A table may define multiple FILEHANDLEID columns; select them all.
        file_column_select = _join_column_names(file_handle_columns)
        # NOTE(review): unlike the other calls in this module, query_async is
        # not passed synapse_client here -- confirm it falls back to the
        # cached client instance.
        results = await query_async(
            query=f"select {file_column_select} from {entity_id}",
            include_row_id_and_row_version=True,
        )
        for row in results:
            file_handles = {}

            # first two cols are row id and row version, rest are file handle ids from our query
            row_id, row_version = row[:2]

            file_handle_ids = row[2:]
            for i, file_handle_id in enumerate(file_handle_ids):
                if file_handle_id:
                    # NOTE(review): columns are accessed by attribute above
                    # (c.column_type) but by subscript here -- confirm the
                    # Column type supports both, otherwise this should be .id
                    col_id = file_handle_columns[i]["id"]
                    response = await get_file_handle_for_download_async(
                        file_handle_id, entity_id, objectType="TableEntity"
                    )
                    file_handle = response["fileHandle"]
                    file_handles[col_id] = file_handle

            yield row_id, row_version, file_handles
+
+
async def _index_table_entity_async(
    cursor: sqlite3.Cursor,
    entity_id: str,
    parent_id: Optional[str],
    dest_storage_location_id: str,
    source_storage_location_ids: List[str],
    *,
    synapse_client: "Synapse",
) -> None:
    """Index a table entity's file attachments for migration.

    Rows are streamed from the table and eligible file handles are inserted
    into the migrations table in batches of BATCH_SIZE to bound memory usage.

    Arguments:
        cursor: The cursor object from the connection to the SQLite database.
        entity_id: The Synapse ID of the table entity.
        parent_id: The parent entity Synapse ID.
        dest_storage_location_id: Destination storage location ID.
        source_storage_location_ids: List of source storage locations to filter.
        synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
    """
    logger.info("Indexing table entity %s", entity_id)
    insert_values = []
    async for row_id, row_version, file_handles in _get_table_file_handle_rows_async(
        entity_id=entity_id, synapse_client=synapse_client
    ):
        for col_id, file_handle in file_handles.items():
            # NOTE(review): file_handle originates from response["fileHandle"]
            # in _get_table_file_handle_rows_async but is accessed by
            # attribute here and inside _get_file_migration_status -- confirm
            # it is a model object rather than a raw dict.
            status = _get_file_migration_status(
                file_handle, source_storage_location_ids, dest_storage_location_id
            )
            if status:
                insert_values.append(
                    (
                        entity_id,
                        MigrationType.TABLE_ATTACHED_FILE.value,
                        row_id,
                        col_id,
                        row_version,
                        parent_id,
                        file_handle.storage_location_id,
                        file_handle.id,
                        file_handle.content_size,
                        status,
                    )
                )
                # Flush each full batch to SQLite; the append above guarantees
                # insert_values is non-empty when this condition fires.
                if len(insert_values) % BATCH_SIZE == 0:
                    _insert_table_file_migration(cursor, insert_values)
                    insert_values.clear()
    # Flush the final partial batch, if any.
    if insert_values:
        _insert_table_file_migration(cursor, insert_values)
+
+
async def _index_container_async(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    entity_id: str,
    parent_id: Optional[str],
    dest_storage_location_id: str,
    source_storage_location_ids: List[str],
    file_version_strategy: str,
    include_table_files: bool,
    continue_on_error: bool,
    *,
    synapse_client: "Synapse",
) -> None:
    """Index a container (Project or Folder) and its children.

    Children are listed first, then indexed concurrently (bounded by a
    semaphore). The container itself is only marked indexed after all
    children complete, so an interrupted run will revisit it on resume.

    Arguments:
        conn: The connection to the SQLite database.
        cursor: The cursor object from the connection to the SQLite database.
        entity_id: The Synapse ID of the entity, a Project or Folder.
        parent_id: The Synapse ID of the parent entity.
        dest_storage_location_id: Destination storage location ID.
        source_storage_location_ids: List of source storage locations to filter.
        file_version_strategy: Strategy for file versions.
        include_table_files: Whether to include table-attached files.
        continue_on_error: Whether to continue on errors.
        synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
    """
    retrieved_entity = await get_entity_type(entity_id=entity_id)
    concrete_type = retrieved_entity.type
    # Log only the class-name portion of the concrete type, e.g. "Folder".
    logger.info(
        f'Indexing {concrete_type[concrete_type.rindex(".") + 1 :]} {entity_id}'
    )

    # Determine included types: files/folders unless file migration is
    # skipped entirely, tables only when table attachments are requested.
    include_types = []
    if file_version_strategy != "skip":
        include_types.extend(["folder", "file"])
    if include_table_files:
        include_types.append("table")

    # Get children using the async API
    children = []
    async for child in get_children(
        parent=entity_id,
        include_types=include_types,
        synapse_client=synapse_client,
    ):
        children.append(child)

    # NOTE(review): each recursion level creates its own semaphore, so this
    # bound applies per container rather than globally across the whole tree.
    semaphore = asyncio.Semaphore(MAX_CONCURRENT_FILE_COPIES)

    async def index_child(child: Dict[str, Any]) -> None:
        # Fetch the child entity and recurse; the semaphore bounds how many
        # children of this container are in flight at once.
        async with semaphore:
            child_entity = await get_async(
                synapse_id=child["id"], synapse_client=synapse_client
            )

            await _index_entity_async(
                conn=conn,
                cursor=cursor,
                entity=child_entity,
                parent_id=entity_id,
                dest_storage_location_id=dest_storage_location_id,
                source_storage_location_ids=source_storage_location_ids,
                file_version_strategy=file_version_strategy,
                include_table_files=include_table_files,
                continue_on_error=continue_on_error,
                synapse_client=synapse_client,
            )

    # Process children with as_completed for progress tracking
    tasks = [asyncio.create_task(index_child(child)) for child in children]
    for task in asyncio.as_completed(tasks):
        await task

    # Mark container as indexed only after every child has been processed.
    migration_type = (
        MigrationType.PROJECT.value
        if concrete_type == concrete_types.PROJECT_ENTITY
        else MigrationType.FOLDER.value
    )
    _mark_container_indexed(cursor, entity_id, migration_type, parent_id)
+
+
+# =============================================================================
+# Migration Functions
+# =============================================================================
async def _migrate_item_async(
    key: MigrationKey,
    from_file_handle_id: str,
    to_file_handle_id: Optional[str],
    file_size: int,
    dest_storage_location_id: str,
    semaphore: asyncio.Semaphore,
    *,
    synapse_client: "Synapse",
) -> Dict[str, Any]:
    """Migrate a single item.

    Copies the underlying file handle to the destination storage location
    (unless a copy already exists) and then points the owning record -- a
    file entity version or a table cell -- at the new handle.

    Arguments:
        key: The migration key.
        from_file_handle_id: The source file handle ID.
        to_file_handle_id: The destination file handle ID (if already copied).
        file_size: File size in bytes.
        dest_storage_location_id: The destination storage location ID.
        semaphore: The concurrency semaphore; held for the whole copy+update.
        synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.

    Returns:
        Dictionary with the key, from_file_handle_id, and to_file_handle_id.

    Raises:
        MigrationError: Wraps any failure, preserving the key and both file
            handle IDs so the caller can record the error in the database.
    """
    async with semaphore:
        try:
            # copy to a new file handle if we haven't already
            if not to_file_handle_id:
                # The association tells the backend where the source handle
                # is referenced (file entity vs table attachment).
                source_association = {
                    "fileHandleId": from_file_handle_id,
                    "associateObjectId": key.id,
                    "associateObjectType": (
                        "FileEntity"
                        if key.type == MigrationType.FILE
                        else "TableEntity"
                    ),
                }

                to_file_handle_id = await multipart_copy_async(
                    synapse_client,
                    source_association,
                    storage_location_id=dest_storage_location_id,
                    part_size=_get_part_size(file_size),
                )

            # Update entity with new file handle
            if key.type == MigrationType.FILE:
                # version is None when the "new" file_version_strategy was
                # used at indexing time (see _index_file_entity_async).
                if key.version is None:
                    await _create_new_file_version_async(
                        entity_id=key.id,
                        to_file_handle_id=to_file_handle_id,
                        synapse_client=synapse_client,
                    )
                else:
                    await _migrate_file_version_async(
                        entity_id=key.id,
                        version=key.version,
                        from_file_handle_id=from_file_handle_id,
                        to_file_handle_id=to_file_handle_id,
                        synapse_client=synapse_client,
                    )
            elif key.type == MigrationType.TABLE_ATTACHED_FILE:
                await _migrate_table_attached_file_async(
                    key=key,
                    to_file_handle_id=to_file_handle_id,
                    synapse_client=synapse_client,
                )

            return {
                "key": key,
                "from_file_handle_id": from_file_handle_id,
                "to_file_handle_id": to_file_handle_id,
            }

        except Exception as ex:
            raise MigrationError(
                key, from_file_handle_id, to_file_handle_id, cause=ex
            ) from ex
+
+
async def _create_new_file_version_async(
    entity_id: str,
    to_file_handle_id: str,
    *,
    synapse_client: "Synapse",
) -> None:
    """Create a new version of a file entity with the new file handle.

    Arguments:
        entity_id: The file entity ID.
        to_file_handle_id: The new file handle ID.
        synapse_client: The Synapse client.
    """
    client = Synapse.get_client(synapse_client=synapse_client)
    client.logger.info("Creating new version for file entity %s", entity_id)

    entity = await get_async(
        synapse_id=entity_id,
        file_options=FileOptions(download_file=False),
        synapse_client=synapse_client,
    )
    # Use the snake_case model attribute, consistent with how this module
    # reads it during indexing (entity.data_file_handle_id). Assigning the
    # camelCase REST-payload name "dataFileHandleId" would set an unrelated
    # attribute and silently fail to change the stored handle.
    entity.data_file_handle_id = to_file_handle_id
    await entity.store_async()
+
+
async def _migrate_file_version_async(
    entity_id: str,
    version: int,
    from_file_handle_id: str,
    to_file_handle_id: str,
    *,
    synapse_client: "Synapse",
) -> None:
    """Point an existing file entity version at a new file handle in place.

    Arguments:
        entity_id: The file entity ID.
        version: The version number to update.
        from_file_handle_id: The original file handle ID.
        to_file_handle_id: The new file handle ID.
        synapse_client: The Synapse client.
    """
    client = Synapse.get_client(synapse_client=synapse_client)
    client.logger.info(
        "Updating file handle for file entity %s version %s", entity_id, version
    )

    # The REST endpoint swaps the file handle on the specific version without
    # creating a new one.
    request_body = {
        "oldFileHandleId": from_file_handle_id,
        "newFileHandleId": to_file_handle_id,
    }
    uri = f"/entity/{entity_id}/version/{version}/filehandle"
    await client.rest_put_async(uri, body=json.dumps(request_body))
+
+
async def _migrate_table_attached_file_async(
    key: MigrationKey,
    to_file_handle_id: str,
    *,
    synapse_client: "Synapse",
) -> None:
    """Point a table-attached file cell at a new file handle.

    Arguments:
        key: The migration key (carries the table ID, row ID, and column ID).
        to_file_handle_id: The new file handle ID.
        synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.
    """
    # Append a partial row set that overwrites only the single file handle
    # cell identified by (row_id, col_id), then run it as a table update
    # transaction and wait for the job to complete.
    change_set = AppendableRowSetRequest(
        entity_id=key.id,
        to_append=PartialRowSet(
            table_id=key.id,
            rows=[
                PartialRow(
                    row_id=str(key.row_id),
                    values=[{str(key.col_id): to_file_handle_id}],
                )
            ],
        ),
    )
    await TableUpdateTransaction(
        entity_id=key.id,
        changes=[change_set],
    ).send_job_and_wait_async(synapse_client=synapse_client)
+
+
async def track_migration_results_async(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    active_tasks: Set[asyncio.Task],
    pending_file_handles: Set[str],
    completed_file_handles: Set[str],
    pending_keys: Set[MigrationKey],
    return_when: str,
    continue_on_error: bool,
) -> None:
    """Wait for migration tasks and record their results in the database.

    Completed tasks are removed from ``active_tasks`` in place so the caller
    can use its size to throttle how many new tasks it submits.

    Arguments:
        conn: The connection to the SQLite database.
        cursor: The cursor object from the connection to the SQLite database.
        active_tasks: The set of active migration tasks; completed tasks are
            removed from this set in place.
        pending_file_handles: The set of pending file handle IDs.
        completed_file_handles: The set of completed file handle IDs.
        pending_keys: The set of pending migration keys.
        return_when: The asyncio.wait return_when condition, i.e. one of the
            string constants asyncio.FIRST_COMPLETED / asyncio.ALL_COMPLETED.
        continue_on_error: Whether to continue on errors.

    Returns:
        None
    """
    done, _still_pending = await asyncio.wait(
        active_tasks,
        return_when=return_when,
    )
    # Remove completed tasks from the caller's set IN PLACE. The previous
    # code rebound the local name (done, active_tasks = await asyncio.wait),
    # which left the caller's set untouched and broke the concurrency
    # throttle in the migration loop.
    active_tasks.difference_update(done)

    for completed_task in done:
        to_file_handle_id = None
        ex = None
        try:
            result = completed_task.result()
            key = result["key"]
            from_file_handle_id = result["from_file_handle_id"]
            to_file_handle_id = result["to_file_handle_id"]
            status = MigrationStatus.MIGRATED.value
            completed_file_handles.add(from_file_handle_id)

        except MigrationError as migration_error:
            # The task wrapped its failure so we can still recover the key
            # and file handle for bookkeeping; record the underlying cause.
            key = migration_error.key
            from_file_handle_id = migration_error.from_file_handle_id
            ex = migration_error.__cause__
            status = MigrationStatus.ERRORED.value
            completed_file_handles.add(from_file_handle_id)

        _update_migration_database(conn, cursor, key, to_file_handle_id, status, ex)
        pending_keys.discard(key)
        pending_file_handles.discard(from_file_handle_id)

        if not continue_on_error and ex:
            raise ex from None
+
+
+# =============================================================================
+# Migration Implementation
+# =============================================================================
async def migrate_indexed_files_async(
    db_path: str,
    *,
    create_table_snapshots: bool = True,
    continue_on_error: bool = False,
    force: bool = False,
    synapse_client: Optional["Synapse"] = None,
) -> Optional[MigrationResult]:
    """Migrate files that have been indexed.

    This is the second step in migrating files to a new storage location.
    Files must first be indexed using `index_files_for_migration_async`.

    Arguments:
        db_path: Path to SQLite database created by index_files_for_migration_async.
        create_table_snapshots: Whether to create table snapshots before migrating. Defaults to True.
        continue_on_error: Whether to continue on individual migration errors. Defaults to False.
        force: If running in an interactive shell, migration requires an interactive confirmation.
            This can be bypassed by using the force=True option. Defaults to False.
        synapse_client: If not passed in and caching was not disabled by `Synapse.allow_client_caching(False)` this will use the last created instance from the Synapse class constructor.

    Returns:
        MigrationResult object, or None if migration was aborted.

    Raises:
        ValueError: If db_path does not contain a previously created migration index.
    """
    client = Synapse.get_client(synapse_client=synapse_client)

    # Retrieve the settings that were persisted during indexing
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        _ensure_schema(cursor)
        existing_settings = _retrieve_index_settings(cursor)
        if existing_settings is None:
            raise ValueError(
                f"Unable to retrieve existing index settings from '{db_path}'. "
                "Either this path does not represent a previously created migration index "
                "or the file is corrupt."
            )
        dest_storage_location_id = existing_settings.dest_storage_location_id

        # Confirm migration (interactive prompt unless force=True)
        confirmed = _confirm_migration(cursor, dest_storage_location_id, force)
        if not confirmed:
            logger.info("Migration aborted.")
            return None

        # Execute migration
        await _execute_migration_async(
            conn=conn,
            cursor=cursor,
            dest_storage_location_id=dest_storage_location_id,
            create_table_snapshots=create_table_snapshots,
            continue_on_error=continue_on_error,
            synapse_client=client,
        )
    return MigrationResult(db_path=db_path, synapse_client=client)
+
+
async def _execute_migration_async(
    conn: sqlite3.Connection,
    cursor: sqlite3.Cursor,
    dest_storage_location_id: str,
    create_table_snapshots: bool,
    continue_on_error: bool,
    *,
    synapse_client: "Synapse",
) -> None:
    """Execute the actual file migration.

    Pulls batches of indexed records out of the SQLite database, launches
    bounded concurrent copy/update tasks for them, and records results as
    tasks complete until no migratable rows remain.

    Arguments:
        conn: The connection to the SQLite database.
        cursor: The cursor object from the connection to the SQLite database.
        dest_storage_location_id: Destination storage location ID.
        create_table_snapshots: Whether to create a table snapshot before the
            first attached file of each table is migrated.
        continue_on_error: Whether to continue on errors.
        synapse_client: The Synapse client.
    """
    pending_file_handles: Set[str] = set()
    completed_file_handles: Set[str] = set()
    pending_keys: Set[MigrationKey] = set()

    semaphore = asyncio.Semaphore(MAX_CONCURRENT_FILE_COPIES)
    active_tasks: Set[asyncio.Task] = set()

    # Initialize last key to an empty key so the first iteration can proceed.
    key = MigrationKey(id="", type=None, row_id=-1, col_id=-1, version=-1)
    while True:
        # Query next batch, sized to the remaining concurrency headroom
        batch = _query_migration_batch(
            cursor,
            key,
            pending_file_handles,
            completed_file_handles,
            min(BATCH_SIZE, MAX_CONCURRENT_FILE_COPIES - len(active_tasks)),
        )
        row_count = 0
        for item in batch:
            row_count += 1
            last_key = key
            key = MigrationKey(
                id=item["id"],
                type=MigrationType(item["type"]),
                version=item["version"],
                row_id=item["row_id"],
                col_id=item["col_id"],
            )
            from_file_handle_id = item["from_file_handle_id"]
            if key in pending_keys or from_file_handle_id in pending_file_handles:
                # if this record is already being migrated or it shares a file handle
                # with a record that is being migrated then skip this.
                # if the record shares a file handle it will be picked up later
                # when its file handle is completed.
                continue

            pending_keys.add(key)

            # Check for existing copy from an earlier record with this handle
            to_file_handle_id = _check_file_handle_exists(cursor, from_file_handle_id)

            if not to_file_handle_id:
                pending_file_handles.add(from_file_handle_id)

            # Create table snapshot if needed using the async API. key.type is
            # a MigrationType enum member, so compare it to the member itself:
            # comparing to .value (an int) is always False and silently
            # skipped snapshot creation. The last_key check snapshots each
            # table only once, on its first attached file.
            if (
                key.type == MigrationType.TABLE_ATTACHED_FILE
                and create_table_snapshots
                and last_key.id != key.id
            ):
                await Table(id=key.id).snapshot_async(synapse_client=synapse_client)

            # Create migration task
            task = asyncio.create_task(
                _migrate_item_async(
                    key=key,
                    from_file_handle_id=from_file_handle_id,
                    to_file_handle_id=to_file_handle_id,
                    file_size=item["file_size"] or 0,
                    dest_storage_location_id=dest_storage_location_id,
                    semaphore=semaphore,
                    synapse_client=synapse_client,
                )
            )
            active_tasks.add(task)

        if row_count == 0 and not pending_file_handles:
            # we've run out of migratable sqlite rows, we have nothing else
            # to submit, so we break out and wait for all remaining
            # tasks to conclude.
            break

        # Wait for tasks if at capacity or end of batch
        if len(active_tasks) >= MAX_CONCURRENT_FILE_COPIES or len(batch) < BATCH_SIZE:
            await track_migration_results_async(
                conn,
                cursor,
                active_tasks,
                pending_file_handles,
                completed_file_handles,
                pending_keys,
                asyncio.FIRST_COMPLETED,
                continue_on_error,
            )

    # Wait for any remaining tasks
    if active_tasks:
        await track_migration_results_async(
            conn,
            cursor,
            active_tasks,
            pending_file_handles,
            completed_file_handles,
            pending_keys,
            asyncio.ALL_COMPLETED,
            continue_on_error,
        )
diff --git a/synapseclient/models/services/migration_types.py b/synapseclient/models/services/migration_types.py
new file mode 100644
index 000000000..c53e423ab
--- /dev/null
+++ b/synapseclient/models/services/migration_types.py
@@ -0,0 +1,391 @@
+"""
+Data classes and enums for the async migration service.
+
+These types are used to track the state of file migrations between storage locations.
+"""
+
+import asyncio
+import csv
+from dataclasses import dataclass, fields
+from enum import Enum
+from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional
+
+from synapseclient.core.constants import concrete_types
+
+if TYPE_CHECKING:
+ from synapseclient import Synapse
+
+
class MigrationStatus(Enum):
    """Internal enum used by the SQLite database to track the state of entities
    during indexing and migration. The integer values are stored in the
    ``status`` column of the ``migrations`` table and read back via
    ``MigrationStatus(value)``, so existing values must stay stable."""

    INDEXED = 1
    """The file has been indexed and is ready to be migrated."""

    MIGRATED = 2
    """The file has been successfully migrated to the new storage location."""

    ALREADY_MIGRATED = 3
    """The file was already at the destination storage location and no migration is needed."""

    ERRORED = 4
    """An error occurred during indexing or migration for this entity."""
+
+
class MigrationType(Enum):
    """Type of entity being tracked in the migration database.

    Container types (projects and folders) are only used during the indexing
    phase: containers that have already been indexed are recorded so they are
    not re-indexed on a subsequent run using the same db file (or re-indexed
    after an indexing dry run). The integer values are persisted in the
    database, so they must stay stable."""

    PROJECT = 1
    """A project entity."""

    FOLDER = 2
    """A folder entity."""

    FILE = 3
    """A file entity."""

    TABLE_ATTACHED_FILE = 4
    """A file handle that is attached to a table column."""

    @classmethod
    def from_concrete_type(cls, concrete_type: str) -> "MigrationType":
        """Convert a Synapse concrete type string to a MigrationType.

        Arguments:
            concrete_type: The concrete type of the entity.

        Returns:
            The corresponding MigrationType enum value.

        Raises:
            ValueError: If the concrete type is not recognized.
        """
        # Table dispatch instead of an if/elif chain; unknown types fall
        # through to the same ValueError as before.
        by_concrete_type = {
            concrete_types.PROJECT_ENTITY: cls.PROJECT,
            concrete_types.FOLDER_ENTITY: cls.FOLDER,
            concrete_types.FILE_ENTITY: cls.FILE,
            concrete_types.TABLE_ENTITY: cls.TABLE_ATTACHED_FILE,
        }
        try:
            return by_concrete_type[concrete_type]
        except KeyError:
            raise ValueError(f"Unhandled concrete type: {concrete_type}") from None
+
+
@dataclass
class MigrationKey:
    """Unique identifier for an entry in the migrations database.

    Attributes:
        id: The Synapse entity ID.
        type: The migration type of the entity being migrated.
        version: The file version number; may be None (e.g. for containers or
            new versions) — TODO confirm whether containers always carry None.
        row_id: The table row ID (for table attached files).
        col_id: The table column ID (for table attached files).
    """

    id: str
    type: MigrationType
    version: Optional[int] = None
    row_id: Optional[int] = None
    col_id: Optional[int] = None

    def _identity(self) -> tuple:
        # Single source of truth for the fields that define key identity,
        # shared by __hash__ and __eq__ so they can never drift apart.
        return (self.id, self.type, self.version, self.row_id, self.col_id)

    def __hash__(self) -> int:
        return hash(self._identity())

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, MigrationKey):
            return False
        return self._identity() == other._identity()
+
+
@dataclass
class MigrationSettings:
    """Settings for a migration index stored in the database.

    These settings are persisted alongside the index so a later run against the
    same database file can verify it was built with compatible parameters
    (see `verify_migration_settings`).

    Attributes:
        root_id: The root entity ID being migrated.
        dest_storage_location_id: The destination storage location ID.
        source_storage_location_ids: List of storage location ids that will be
            migrated. Defaults to None; `from_dict` normalizes a missing value
            to an empty list.
        file_version_strategy: Strategy for handling file versions.
        include_table_files: Whether to include files attached to tables.
    """

    root_id: str
    dest_storage_location_id: str
    # Annotated Optional to match the None default (a bare `List[str] = None`
    # mislabels the field); a mutable [] default would be shared across
    # instances, so None is kept as the sentinel.
    source_storage_location_ids: Optional[List[str]] = None
    file_version_strategy: str = "new"
    include_table_files: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return a dict suitable for JSON serialization in the database."""
        return {
            "root_id": self.root_id,
            "dest_storage_location_id": self.dest_storage_location_id,
            "source_storage_location_ids": self.source_storage_location_ids,
            "file_version_strategy": self.file_version_strategy,
            # stored as 0/1; from_dict converts back to bool
            "include_table_files": 1 if self.include_table_files else 0,
        }

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> "MigrationSettings":
        """Build MigrationSettings from a dict (e.g. from JSON in the database)."""
        include = d.get("include_table_files", False)
        if isinstance(include, int):
            # to_dict serializes this flag as 0/1
            include = bool(include)
        return cls(
            root_id=d["root_id"],
            dest_storage_location_id=d["dest_storage_location_id"],
            source_storage_location_ids=d.get("source_storage_location_ids") or [],
            file_version_strategy=d.get("file_version_strategy", "new"),
            include_table_files=include,
        )

    def verify_migration_settings(
        self, existing_settings: "MigrationSettings", db_path: str
    ) -> None:
        """Raise ValueError if these settings do not match the settings recorded
        in an existing index file.

        Arguments:
            existing_settings: The settings previously recorded in the index.
            db_path: Path of the index file, used only in the error message.

        Raises:
            ValueError: On the first field that differs.
        """
        # Compare every dataclass field; indexing cannot resume against an
        # index file built with different settings.
        for field in fields(self):
            if getattr(self, field.name) != getattr(existing_settings, field.name):
                raise ValueError(
                    "Index parameter does not match the setting recorded in the existing index file. "
                    "To change the index settings start over by deleting the file or using a different path. "
                    f"Expected {field.name} {getattr(existing_settings, field.name)}, found {getattr(self, field.name)} in index file {db_path}"
                )
+
+
class IndexingError(Exception):
    """Error during an indexing operation.

    Attributes:
        entity_id: The entity ID that failed to index.
        concrete_type: The concrete type of the entity.
    """

    def __init__(self, entity_id: str, concrete_type: str):
        self.entity_id = entity_id
        self.concrete_type = concrete_type
        # Pass a message to Exception so str(err) and log output are
        # informative; previously str(err) was the empty string.
        super().__init__(f"Error indexing entity {entity_id} ({concrete_type})")
+
+
@dataclass
class MigrationResult:
    """Result of a migration operation - proxy to the SQLite tracking database.

    This class provides methods to query the migration database for status counts,
    individual migration entries, and CSV export. It holds no state of its own
    beyond the database path; every query opens a fresh connection.

    Attributes:
        db_path: Path to the SQLite database file.
        synapse_client: Optional Synapse client for column name lookups.
    """

    db_path: str
    synapse_client: Optional["Synapse"] = None

    @property
    def counts_by_status(self) -> Dict[str, int]:
        """Get counts by migration status (synchronous).

        Returns:
            Dictionary mapping status names to counts.
        """
        # Convenience alias for get_counts_by_status.
        return self.get_counts_by_status()

    def get_counts_by_status(self) -> Dict[str, int]:
        """Get counts by migration status (synchronous).

        Returns:
            Dictionary mapping status names (every MigrationStatus name is
            present, defaulting to 0) to counts.
        """
        # Imported lazily so the module can be imported without touching
        # sqlite3 until a query is actually made.
        import sqlite3

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()

            # Only count FILE and TABLE_ATTACHED_FILE entries; PROJECT/FOLDER
            # rows exist only as indexing bookkeeping.
            result = cursor.execute(
                "SELECT status, count(*) FROM migrations "
                "WHERE type IN (?, ?) GROUP BY status",
                (MigrationType.FILE.value, MigrationType.TABLE_ATTACHED_FILE.value),
            )

            # Pre-seed every status with 0 so statuses with no rows still
            # appear in the result.
            counts = {status.name: 0 for status in MigrationStatus}
            for row in result:
                status_value = row[0]
                count = row[1]
                counts[MigrationStatus(status_value).name] = count

            return counts

    async def get_counts_by_status_async(self) -> Dict[str, int]:
        """Get counts by migration status (asynchronous).

        Returns:
            Dictionary mapping status names to counts.
        """
        # Run the blocking sqlite work off the event loop.
        return await asyncio.to_thread(self.get_counts_by_status)

    def get_migrations(self) -> Iterator[Dict[str, Any]]:
        """Iterate over all migration entries (synchronous).

        Yields:
            Dictionary for each migration entry with keys:
            id, type, version, row_id, col_name, from_storage_location_id,
            from_file_handle_id, to_file_handle_id, file_size, status, exception.
        """
        import sqlite3

        with sqlite3.connect(self.db_path) as conn:
            cursor = conn.cursor()

            batch_size = 500
            # Keyset pagination on rowid: each batch resumes after the last
            # rowid seen, avoiding OFFSET scans on large tables.
            rowid = -1
            # col_id -> column name, so each column is resolved at most once
            # per iteration of the whole result set.
            column_names_cache: Dict[int, str] = {}

            while True:
                results = cursor.execute(
                    """
                    SELECT
                        rowid,
                        id,
                        type,
                        version,
                        row_id,
                        col_id,
                        from_storage_location_id,
                        from_file_handle_id,
                        to_file_handle_id,
                        file_size,
                        status,
                        exception
                    FROM migrations
                    WHERE
                        rowid > ?
                        AND type IN (?, ?)
                    ORDER BY rowid
                    LIMIT ?
                    """,
                    (
                        rowid,
                        MigrationType.FILE.value,
                        MigrationType.TABLE_ATTACHED_FILE.value,
                        batch_size,
                    ),
                )

                rows = results.fetchall()
                if not rows:
                    break

                for row in rows:
                    rowid = row[0]
                    col_id = row[5]

                    # Resolve column name if needed (table attached files only,
                    # and only when a client is available).
                    col_name = None
                    if col_id is not None and self.synapse_client:
                        if col_id not in column_names_cache:
                            try:
                                col_info = self.synapse_client.restGET(
                                    f"/column/{col_id}"
                                )
                                column_names_cache[col_id] = col_info.get("name", "")
                            except Exception:
                                # Best-effort lookup: a failed REST call
                                # degrades to an empty column name rather than
                                # aborting the iteration.
                                column_names_cache[col_id] = ""
                        col_name = column_names_cache[col_id]

                    yield {
                        "id": row[1],
                        "type": (
                            "file" if row[2] == MigrationType.FILE.value else "table"
                        ),
                        "version": row[3],
                        "row_id": row[4],
                        "col_name": col_name,
                        "from_storage_location_id": row[6],
                        "from_file_handle_id": row[7],
                        "to_file_handle_id": row[8],
                        "file_size": row[9],
                        "status": MigrationStatus(row[10]).name,
                        "exception": row[11],
                    }

    async def get_migrations_async(self) -> List[Dict[str, Any]]:
        """Get all migration entries (asynchronous).

        Returns:
            List of dictionaries for each migration entry.
        """
        # Convert to list since generators can't be returned from to_thread
        return await asyncio.to_thread(lambda: list(self.get_migrations()))

    def as_csv(self, path: str) -> None:
        """Export migration results to a CSV file (synchronous).

        Arguments:
            path: Path to write the CSV file.
        """
        # Field order matches the dicts yielded by get_migrations.
        fieldnames = [
            "id",
            "type",
            "version",
            "row_id",
            "col_name",
            "from_storage_location_id",
            "from_file_handle_id",
            "to_file_handle_id",
            "file_size",
            "status",
            "exception",
        ]

        # newline="" is required by the csv module to avoid blank rows on
        # Windows.
        with open(path, "w", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for migration in self.get_migrations():
                writer.writerow(migration)

    async def as_csv_async(self, path: str) -> None:
        """Export migration results to a CSV file (asynchronous).

        Arguments:
            path: Path to write the CSV file.
        """
        await asyncio.to_thread(self.as_csv, path)
+
+
class MigrationError(Exception):
    """Error during a migration operation.

    Attributes:
        key: The migration key that failed.
        from_file_handle_id: The source file handle ID.
        to_file_handle_id: The destination file handle ID (if partially complete).
        cause: The underlying exception that triggered the failure, if any.
    """

    def __init__(
        self,
        key: MigrationKey,
        from_file_handle_id: str,
        to_file_handle_id: Optional[str] = None,
        cause: Optional[Exception] = None,
    ):
        self.key = key
        self.from_file_handle_id = from_file_handle_id
        self.to_file_handle_id = to_file_handle_id
        # Keep a reference to the underlying error; previously it was folded
        # into the message and then discarded, making programmatic inspection
        # of the root cause impossible.
        self.cause = cause
        message = f"Migration failed for {key.id}"
        if cause is not None:
            message += f": {cause}"
        super().__init__(message)
diff --git a/synapseclient/models/storage_location.py b/synapseclient/models/storage_location.py
new file mode 100644
index 000000000..5398c71d6
--- /dev/null
+++ b/synapseclient/models/storage_location.py
@@ -0,0 +1,518 @@
+"""StorageLocation model for managing storage location settings in Synapse."""
+
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Dict, Optional
+
+from synapseclient import Synapse
+from synapseclient.api.storage_location_services import (
+ create_storage_location_setting,
+ get_storage_location_setting,
+)
+from synapseclient.core.async_utils import async_to_sync, otel_trace_method
+from synapseclient.models.mixins.enum_coercion import EnumCoercionMixin
+from synapseclient.models.protocols.storage_location_protocol import (
+ StorageLocationSynchronousProtocol,
+)
+
+
@dataclass(frozen=True)
class StorageLocationType:
    """Describes a Synapse storage location type.

    Unlike an Enum, each instance is an ordinary (frozen, hashable) value
    identified by its ``name``; this lets SFTP and HTTPS remain distinct types
    even though both are backed by the same ``concreteType``
    (``ExternalStorageLocationSetting``).

    Attributes:
        name: Human-readable identifier (e.g. ``"EXTERNAL_SFTP"``).
        concrete_type: The ``concreteType`` suffix sent to the Synapse REST API.
    """

    name: str
    concrete_type: str

    def __repr__(self) -> str:
        # Render like an enum member, e.g. StorageLocationType.EXTERNAL_S3
        return "StorageLocationType." + self.name

    def __str__(self) -> str:
        return self.name
+
+
# Register the known storage location types as class attributes
# (StorageLocationType.SYNAPSE_S3, ...). The frozen dataclass only blocks
# instance mutation, so class-level setattr is fine. SFTP and HTTPS are
# deliberately distinct instances sharing the same concreteType.
for _name, _concrete_type in (
    ("SYNAPSE_S3", "S3StorageLocationSetting"),
    ("EXTERNAL_S3", "ExternalS3StorageLocationSetting"),
    ("EXTERNAL_GOOGLE_CLOUD", "ExternalGoogleCloudStorageLocationSetting"),
    ("EXTERNAL_SFTP", "ExternalStorageLocationSetting"),
    ("EXTERNAL_HTTPS", "ExternalStorageLocationSetting"),
    ("EXTERNAL_OBJECT_STORE", "ExternalObjectStorageLocationSetting"),
    ("PROXY", "ProxyStorageLocationSettings"),
):
    setattr(StorageLocationType, _name, StorageLocationType(_name, _concrete_type))
del _name, _concrete_type
+
+
class UploadType(str, Enum):
    """Enumeration of upload types for storage locations.

    Attributes:
        S3: Amazon S3 compatible upload.
        GOOGLE_CLOUD_STORAGE: Google Cloud Storage upload.
        SFTP: SFTP upload.
        HTTPS: HTTPS upload (typically used with proxy storage).
        PROXYLOCAL: Upload via a local proxy server (used with PROXY storage).
        NONE: No upload type specified.
    """

    S3 = "S3"
    GOOGLE_CLOUD_STORAGE = "GOOGLECLOUDSTORAGE"
    SFTP = "SFTP"
    HTTPS = "HTTPS"
    PROXYLOCAL = "PROXYLOCAL"
    NONE = "NONE"
+
+
# Mapping from StorageLocationType to default UploadType.
# Used by StorageLocation._to_synapse_request when no explicit upload_type
# is set on the instance.
_STORAGE_TYPE_TO_UPLOAD_TYPE: Dict[StorageLocationType, UploadType] = {
    StorageLocationType.SYNAPSE_S3: UploadType.S3,
    StorageLocationType.EXTERNAL_S3: UploadType.S3,
    StorageLocationType.EXTERNAL_GOOGLE_CLOUD: UploadType.GOOGLE_CLOUD_STORAGE,
    StorageLocationType.EXTERNAL_SFTP: UploadType.SFTP,
    StorageLocationType.EXTERNAL_HTTPS: UploadType.HTTPS,
    StorageLocationType.EXTERNAL_OBJECT_STORE: UploadType.S3,
    StorageLocationType.PROXY: UploadType.PROXYLOCAL,
}

# Mapping from (concreteType suffix, uploadType value) -> StorageLocationType.
# The tuple key is required because EXTERNAL_SFTP and EXTERNAL_HTTPS share the
# same concreteType and are disambiguated by uploadType. Derived from the map
# above, so every (suffix, upload) pair is unique by construction.
_CONCRETE_UPLOAD_TO_STORAGE_TYPE: Dict[tuple, StorageLocationType] = {
    (storage_type.concrete_type, upload_type.value): storage_type
    for storage_type, upload_type in _STORAGE_TYPE_TO_UPLOAD_TYPE.items()
}

# Mapping from StorageLocationType to its type-specific (field_name, api_key) pairs.
# Only fields listed here are populated by fill_from_dict for a given type, and
# only these fields are serialized by _to_synapse_request.
_STORAGE_TYPE_SPECIFIC_FIELDS: Dict[StorageLocationType, Dict[str, str]] = {
    StorageLocationType.SYNAPSE_S3: {
        "base_key": "baseKey",
        "sts_enabled": "stsEnabled",
    },
    StorageLocationType.EXTERNAL_S3: {
        "bucket": "bucket",
        "base_key": "baseKey",
        "sts_enabled": "stsEnabled",
        "endpoint_url": "endpointUrl",
    },
    StorageLocationType.EXTERNAL_GOOGLE_CLOUD: {
        "bucket": "bucket",
        "base_key": "baseKey",
    },
    StorageLocationType.EXTERNAL_OBJECT_STORE: {
        "bucket": "bucket",
        "endpoint_url": "endpointUrl",
    },
    StorageLocationType.EXTERNAL_SFTP: {
        "url": "url",
        "supports_subfolders": "supportsSubfolders",
    },
    StorageLocationType.EXTERNAL_HTTPS: {
        "url": "url",
        "supports_subfolders": "supportsSubfolders",
    },
    StorageLocationType.PROXY: {
        "proxy_url": "proxyUrl",
        "secret_key": "secretKey",
        "benefactor_id": "benefactorId",
    },
}
+
+
@dataclass()
@async_to_sync
class StorageLocation(EnumCoercionMixin, StorageLocationSynchronousProtocol):
    """A storage location setting describes where files are uploaded to and
    downloaded from via Synapse. Storage location settings may be created for
    external locations, such as user-owned Amazon S3 buckets, Google Cloud
    Storage buckets, SFTP servers, or proxy storage.

    Attributes:
        storage_location_id: (Read Only) The unique ID for this storage location,
            assigned by the server on creation.
        storage_type: The type of storage location. Required when creating a new
            storage location via `store()`. Determines the `concreteType` sent to
            the Synapse REST API.
        banner: The banner text to display to a user every time a file is uploaded.
            This field is optional.
        description: A description of the storage location. This description is
            shown when a user has to choose which upload destination to use.

    Attributes:
        bucket: The name of the S3 or Google Cloud Storage bucket. Applicable to
            SYNAPSE_S3, EXTERNAL_S3, EXTERNAL_GOOGLE_CLOUD, and
            EXTERNAL_OBJECT_STORE types.
        base_key: The optional base key (prefix/folder) within the bucket.
            Applicable to SYNAPSE_S3, EXTERNAL_S3, and EXTERNAL_GOOGLE_CLOUD types.
        sts_enabled: Whether STS (AWS Security Token Service) is enabled on this
            storage location. Applicable to SYNAPSE_S3 and EXTERNAL_S3 types.
        endpoint_url: The endpoint URL of the S3 service. Applicable to
            EXTERNAL_S3 (default: https://s3.amazonaws.com) and
            EXTERNAL_OBJECT_STORE types.

    Attributes:
        url: The base URL for uploading to the external destination. Applicable to
            EXTERNAL_SFTP type.
        supports_subfolders: Whether the destination supports creating subfolders
            under the base url. Applicable to EXTERNAL_SFTP type. Default: False.

    Attributes:
        proxy_url: The HTTPS URL of the proxy used for upload and download.
            Applicable to PROXY type.
        secret_key: The encryption key used to sign all pre-signed URLs used to
            communicate with the proxy. Applicable to PROXY type.
        benefactor_id: An Entity ID (such as a Project ID). When set, any user with
            the 'create' permission on the given benefactorId will be allowed to
            create ProxyFileHandle using its storage location ID. Applicable to
            PROXY type.

    Attributes:
        concrete_type: (Read Only) The fully-qualified `concreteType` returned by
            the Synapse REST API, populated when the object is filled from a
            REST response.
        upload_type: (Read Only) The upload type for this storage location.
            Automatically derived from `storage_type`.
        etag: (Read Only) Synapse employs an Optimistic Concurrency Control (OCC)
            scheme. The E-Tag changes every time the setting is updated.
        created_on: (Read Only) The date this storage location setting was created.
        created_by: (Read Only) The ID of the user that created this storage
            location setting.

    Example: Creating an external S3 storage location
        Create a storage location backed by your own S3 bucket:

            from synapseclient.models import StorageLocation, StorageLocationType

            import synapseclient
            synapseclient.login()

            storage = StorageLocation(
                storage_type=StorageLocationType.EXTERNAL_S3,
                bucket="my-external-synapse-bucket",
                base_key="path/within/bucket",
            ).store()

            print(f"Storage location ID: {storage.storage_location_id}")

    Example: Creating an STS-enabled S3 storage location with a folder
        Use the convenience classmethod to create a folder with STS-enabled
        storage:

            from synapseclient.models import StorageLocation

            import synapseclient
            synapseclient.login()

            folder, storage = StorageLocation.setup_s3(
                folder_name="my-sts-folder",
                parent="syn123",
                bucket_name="my-external-synapse-bucket",
                base_key="path/within/bucket",
                sts_enabled=True,
            )
            print(f"Folder: {folder.id}, Storage: {storage.storage_location_id}")

    Example: Creating a Google Cloud storage location
        Create a storage location backed by your own GCS bucket:

            from synapseclient.models import StorageLocation, StorageLocationType

            import synapseclient
            synapseclient.login()

            storage = StorageLocation(
                storage_type=StorageLocationType.EXTERNAL_GOOGLE_CLOUD,
                bucket="my-gcs-bucket",
                base_key="path/within/bucket",
            ).store()
    """

    # Fields listed here are coerced to their enum type by EnumCoercionMixin
    # -- presumably on assignment; verify against the mixin implementation.
    _ENUM_FIELDS = {
        "upload_type": UploadType,
    }

    # Core fields - present on all storage locations
    storage_location_id: Optional[int] = None
    """(Read Only) The unique ID for this storage location, assigned by the server
    on creation."""

    storage_type: Optional[StorageLocationType] = None
    """The type of storage location. Required when creating a new storage location
    via `store()`. Determines the `concreteType` sent to the Synapse REST API."""

    banner: Optional[str] = None
    """The banner text to display to a user every time a file is uploaded."""

    description: Optional[str] = None
    """A description of the storage location. This description is shown when a user
    has to choose which upload destination to use."""

    # S3/GCS specific fields
    bucket: Optional[str] = None
    """The name of the S3 or Google Cloud Storage bucket. Applicable to SYNAPSE_S3,
    EXTERNAL_S3, EXTERNAL_GOOGLE_CLOUD, and EXTERNAL_OBJECT_STORE types."""

    base_key: Optional[str] = None
    """The optional base key (prefix/folder) within the bucket. Applicable to
    SYNAPSE_S3, EXTERNAL_S3, and EXTERNAL_GOOGLE_CLOUD types."""

    sts_enabled: Optional[bool] = None
    """Whether STS (AWS Security Token Service) is enabled on this storage location.
    Applicable to SYNAPSE_S3 and EXTERNAL_S3 types."""

    endpoint_url: Optional[str] = None
    """The endpoint URL of the S3 service. Applicable to EXTERNAL_S3
    (default: https://s3.amazonaws.com) and EXTERNAL_OBJECT_STORE types."""

    # SFTP specific fields
    url: Optional[str] = None
    """The base URL for uploading to the external destination. Applicable to
    EXTERNAL_SFTP type."""

    supports_subfolders: Optional[bool] = None
    """Whether the destination supports creating subfolders under the base url.
    Applicable to EXTERNAL_SFTP type. Default: False."""

    # Proxy specific fields
    proxy_url: Optional[str] = None
    """The HTTPS URL of the proxy used for upload and download. Applicable to
    PROXY type."""

    secret_key: Optional[str] = None
    """The encryption key used to sign all pre-signed URLs used to communicate
    with the proxy. Applicable to PROXY type."""

    benefactor_id: Optional[str] = None
    """An Entity ID (such as a Project ID). When set, any user with the 'create'
    permission on the given benefactorId will be allowed to create ProxyFileHandle
    using its storage location ID. Applicable to PROXY type."""

    # Read-only fields
    concrete_type: Optional[str] = field(default=None, repr=False, compare=False)
    """(Read Only) The fully-qualified `concreteType` from the REST API. Declared
    as a field (rather than being set dynamically in fill_from_dict) so that
    __repr__ does not raise AttributeError on freshly constructed instances."""

    upload_type: Optional[UploadType] = field(default=None, repr=False, compare=False)
    """(Read Only) The upload type for this storage location. Automatically derived
    from `storage_type`."""

    etag: Optional[str] = field(default=None, compare=False)
    """(Read Only) Synapse employs an Optimistic Concurrency Control (OCC) scheme.
    The E-Tag changes every time the setting is updated."""

    created_on: Optional[str] = field(default=None, compare=False)
    """(Read Only) The date this storage location setting was created."""

    created_by: Optional[int] = field(default=None, compare=False)
    """(Read Only) The ID of the user that created this storage location setting."""

    def __repr__(self) -> str:
        """Render common fields plus only the fields relevant to this instance's
        storage type."""
        common = {
            "concrete_type": self.concrete_type,
            "storage_location_id": self.storage_location_id,
            "storage_type": self.storage_type,
            "upload_type": self.upload_type,
            "banner": self.banner,
            "description": self.description,
            "etag": self.etag,
            "created_on": self.created_on,
            "created_by": self.created_by,
        }
        type_specific = {
            field_name: getattr(self, field_name)
            for field_name in _STORAGE_TYPE_SPECIFIC_FIELDS.get(self.storage_type, {})
        }
        parts = [f"{k}={v!r}" for k, v in {**common, **type_specific}.items()]
        return f"StorageLocation({', '.join(parts)})"

    def fill_from_dict(self, synapse_response: Dict[str, Any]) -> "StorageLocation":
        """Converts a response from the REST API into this dataclass.

        Arguments:
            synapse_response: The response from the REST API.

        Returns:
            The StorageLocation object.
        """
        self.storage_location_id = synapse_response.get("storageLocationId", None)
        self.banner = synapse_response.get("banner", None)
        self.description = synapse_response.get("description", None)
        self.etag = synapse_response.get("etag", None)
        self.created_on = synapse_response.get("createdOn", None)
        self.created_by = synapse_response.get("createdBy", None)

        # Assigned once (previously this was set twice, with the second
        # assignment clobbering None with ""). EnumCoercionMixin presumably
        # coerces the raw string into an UploadType member -- TODO confirm.
        self.upload_type = synapse_response.get("uploadType", None)

        # Parse storage type from concreteType + uploadType.
        # Both are needed to distinguish EXTERNAL_SFTP from EXTERNAL_HTTPS,
        # which share the same concreteType.
        self.concrete_type = synapse_response.get("concreteType", "")
        if self.concrete_type:
            # Use the final dotted component; an unqualified type name is used
            # as-is instead of collapsing to "".
            type_suffix = self.concrete_type.split(".")[-1]
            # Normalize to the raw string value: the mapping is keyed on
            # uploadType strings, and an UploadType member does not hash like
            # its value when name != value (e.g. GOOGLE_CLOUD_STORAGE).
            upload_value = getattr(self.upload_type, "value", self.upload_type)
            key = (type_suffix, upload_value)
            if key in _CONCRETE_UPLOAD_TO_STORAGE_TYPE:
                self.storage_type = _CONCRETE_UPLOAD_TO_STORAGE_TYPE[key]

        # Type-specific fields — only populate attributes relevant to this
        # storage type.
        if self.storage_type:
            for field_name, api_key in _STORAGE_TYPE_SPECIFIC_FIELDS.get(
                self.storage_type, {}
            ).items():
                setattr(self, field_name, synapse_response.get(api_key, None))
        return self

    def _to_synapse_request(self) -> Dict[str, Any]:
        """Convert this dataclass to a request body for the REST API.

        Returns:
            A dictionary suitable for the REST API.

        Raises:
            ValueError: If `storage_type` is not set.
        """
        if not self.storage_type:
            raise ValueError(
                "storage_type is required when creating a storage location"
            )

        # Build the fully-qualified concrete type expected by the REST API.
        concrete_type = (
            f"org.sagebionetworks.repo.model.project.{self.storage_type.concrete_type}"
        )
        # Use the explicit upload type when set, otherwise derive the default
        # from the storage type.
        upload_type = self.upload_type or _STORAGE_TYPE_TO_UPLOAD_TYPE.get(
            self.storage_type
        )

        body: Dict[str, Any] = {
            "concreteType": concrete_type,
            # upload_type may be an UploadType member or a raw string (if it
            # was populated from a REST response without enum coercion);
            # getattr avoids an AttributeError on a plain string.
            "uploadType": getattr(upload_type, "value", upload_type),
        }

        # Common optional fields are always present in the body (possibly
        # None), matching the previous request shape.
        body["banner"] = self.banner
        body["description"] = self.description

        # Add type-specific fields using the same mapping used by fill_from_dict.
        for field_name, api_key in _STORAGE_TYPE_SPECIFIC_FIELDS.get(
            self.storage_type, {}
        ).items():
            value = getattr(self, field_name, None)
            if value is not None:
                body[api_key] = value
        return body

    @otel_trace_method(
        method_to_trace_name=lambda self, **kwargs: f"StorageLocation_Store: {self.storage_type}"
    )
    async def store_async(
        self,
        *,
        synapse_client: Optional[Synapse] = None,
    ) -> "StorageLocation":
        """Create this storage location in Synapse. Storage locations are immutable;
        this always creates a new one. If a storage location with identical properties
        already exists for this user, the existing one is returned (idempotent).

        Arguments:
            synapse_client: If not passed in and caching was not disabled by
                `Synapse.allow_client_caching(False)` this will use the last created
                instance from the Synapse class constructor.

        Returns:
            The StorageLocation object with server-assigned fields populated.

        Raises:
            ValueError: If `storage_type` is not set.

        Example: Using this function
            Create an external S3 storage location:

                import asyncio
                from synapseclient import Synapse
                from synapseclient.models import StorageLocation, StorageLocationType

                syn = Synapse()
                syn.login()

                async def main():
                    storage = await StorageLocation(
                        storage_type=StorageLocationType.EXTERNAL_S3,
                        bucket="my-bucket",
                        base_key="my/prefix",
                    ).store_async()
                    print(f"Created storage location: {storage.storage_location_id}")

                asyncio.run(main())
        """
        request = self._to_synapse_request()
        response = await create_storage_location_setting(
            request=request,
            synapse_client=synapse_client,
        )
        # Populate server-assigned fields (id, etag, createdOn/By, ...).
        self.fill_from_dict(response)
        return self

    @otel_trace_method(
        method_to_trace_name=lambda self, **kwargs: f"StorageLocation_Get: {self.storage_location_id}"
    )
    async def get_async(
        self,
        *,
        synapse_client: Optional[Synapse] = None,
    ) -> "StorageLocation":
        """Retrieve this storage location from Synapse by its ID. Only the creator of
        a StorageLocationSetting can retrieve it by its id.

        Arguments:
            synapse_client: If not passed in and caching was not disabled by
                `Synapse.allow_client_caching(False)` this will use the last created
                instance from the Synapse class constructor.

        Returns:
            The StorageLocation object populated with data from Synapse.

        Raises:
            ValueError: If `storage_location_id` is not set.

        Example: Using this function
            Retrieve a storage location by ID:

                import asyncio
                from synapseclient import Synapse
                from synapseclient.models import StorageLocation

                syn = Synapse()
                syn.login()

                async def main():
                    storage = await StorageLocation(storage_location_id=12345).get_async()
                    print(f"Type: {storage.storage_type}, Bucket: {storage.bucket}")

                asyncio.run(main())
        """
        if not self.storage_location_id:
            raise ValueError(
                "storage_location_id is required to retrieve a storage location"
            )

        response = await get_storage_location_setting(
            storage_location_id=self.storage_location_id,
            synapse_client=synapse_client,
        )
        self.fill_from_dict(response)
        return self
diff --git a/tests/unit/synapseclient/api/unit_test_storage_location_services.py b/tests/unit/synapseclient/api/unit_test_storage_location_services.py
new file mode 100644
index 000000000..bebc80d50
--- /dev/null
+++ b/tests/unit/synapseclient/api/unit_test_storage_location_services.py
@@ -0,0 +1,215 @@
+"""Unit tests for storage_location_services utility functions."""
+
+from unittest.mock import AsyncMock, patch
+
+import pytest
+
+import synapseclient.api.storage_location_services as storage_location_services
+
+
+class TestCreateStorageLocationSetting:
+    """Tests for create_storage_location_setting function."""
+
+    @pytest.mark.asyncio
+    @patch("synapseclient.Synapse")
+    async def test_create_storage_location_setting(self, mock_synapse):
+        """Test create_storage_location_setting creates a storage location."""
+        # GIVEN a mock client that returns a storage location
+        # NOTE(review): patching "synapseclient.Synapse" assumes the service
+        # module resolves `Synapse` through the `synapseclient` namespace at
+        # call time -- confirm against storage_location_services' import style.
+        mock_client = AsyncMock()
+        mock_synapse.get_client.return_value = mock_client
+        mock_client.rest_post_async.return_value = {
+            "storageLocationId": 12345,
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+            "uploadType": "S3",
+            "bucket": "my-bucket",
+        }
+
+        # WHEN I call create_storage_location_setting
+        # NOTE(review): the model layer calls this service with a `request=`
+        # keyword while this test passes `body=` -- verify the service's
+        # actual parameter name; one of the two call sites may be stale.
+        body = {
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+            "uploadType": "S3",
+            "bucket": "my-bucket",
+        }
+        result = await storage_location_services.create_storage_location_setting(
+            body=body,
+            synapse_client=None,
+        )
+
+        # THEN I expect the storage location to be returned
+        assert result["storageLocationId"] == 12345
+        assert result["bucket"] == "my-bucket"
+        mock_client.rest_post_async.assert_awaited_once()
+
+
+class TestGetStorageLocationSetting:
+ """Tests for get_storage_location_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_get_storage_location_setting(self, mock_synapse):
+ """Test get_storage_location_setting retrieves a storage location."""
+ # GIVEN a mock client that returns a storage location
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_get_async.return_value = {
+ "storageLocationId": 12345,
+ "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+ "uploadType": "S3",
+ "bucket": "my-bucket",
+ }
+
+ # WHEN I call get_storage_location_setting
+ result = await storage_location_services.get_storage_location_setting(
+ storage_location_id=12345,
+ synapse_client=None,
+ )
+
+ # THEN I expect the storage location to be returned
+ assert result["storageLocationId"] == 12345
+ assert result["bucket"] == "my-bucket"
+ mock_client.rest_get_async.assert_awaited_once_with(
+ uri="/storageLocation/12345",
+ )
+
+
+class TestGetProjectSetting:
+ """Tests for get_project_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_get_project_setting_exists(self, mock_synapse):
+ """Test get_project_setting when setting exists."""
+ # GIVEN a mock client that returns a project setting
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_get_async.return_value = {
+ "id": "setting123",
+ "projectId": "syn456",
+ "settingsType": "upload",
+ "locations": [12345],
+ }
+
+ # WHEN I call get_project_setting
+ result = await storage_location_services.get_project_setting(
+ project_id="syn456",
+ setting_type="upload",
+ synapse_client=None,
+ )
+
+ # THEN I expect the project setting to be returned
+ assert result["id"] == "setting123"
+ assert result["locations"] == [12345]
+ mock_client.rest_get_async.assert_awaited_once_with(
+ uri="/projectSettings/syn456/type/upload",
+ )
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_get_project_setting_not_exists(self, mock_synapse):
+ """Test get_project_setting when setting does not exist."""
+ # GIVEN a mock client that returns empty response
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_get_async.return_value = ""
+
+ # WHEN I call get_project_setting
+ result = await storage_location_services.get_project_setting(
+ project_id="syn456",
+ setting_type="upload",
+ synapse_client=None,
+ )
+
+ # THEN I expect None to be returned
+ assert result is None
+
+
+class TestCreateProjectSetting:
+ """Tests for create_project_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_create_project_setting(self, mock_synapse):
+ """Test create_project_setting creates a project setting."""
+ # GIVEN a mock client that returns a project setting
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_post_async.return_value = {
+ "id": "setting123",
+ "projectId": "syn456",
+ "settingsType": "upload",
+ "locations": [12345],
+ }
+
+ # WHEN I call create_project_setting
+ body = {
+ "concreteType": "org.sagebionetworks.repo.model.project.UploadDestinationListSetting",
+ "settingsType": "upload",
+ "locations": [12345],
+ "projectId": "syn456",
+ }
+ result = await storage_location_services.create_project_setting(
+ body=body,
+ synapse_client=None,
+ )
+
+ # THEN I expect the project setting to be returned
+ assert result["id"] == "setting123"
+ mock_client.rest_post_async.assert_awaited_once()
+
+
+class TestUpdateProjectSetting:
+ """Tests for update_project_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_update_project_setting(self, mock_synapse):
+ """Test update_project_setting updates a project setting."""
+ # GIVEN a mock client that returns an updated project setting
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_put_async.return_value = {
+ "id": "setting123",
+ "projectId": "syn456",
+ "settingsType": "upload",
+ "locations": [12345, 67890],
+ }
+
+ # WHEN I call update_project_setting
+ body = {
+ "id": "setting123",
+ "projectId": "syn456",
+ "settingsType": "upload",
+ "locations": [12345, 67890],
+ }
+ result = await storage_location_services.update_project_setting(
+ body=body,
+ synapse_client=None,
+ )
+
+ # THEN I expect the updated project setting to be returned
+ assert result["locations"] == [12345, 67890]
+ mock_client.rest_put_async.assert_awaited_once()
+
+
+class TestDeleteProjectSetting:
+ """Tests for delete_project_setting function."""
+
+ @pytest.mark.asyncio
+ @patch("synapseclient.Synapse")
+ async def test_delete_project_setting(self, mock_synapse):
+ """Test delete_project_setting deletes a project setting."""
+ # GIVEN a mock client
+ mock_client = AsyncMock()
+ mock_synapse.get_client.return_value = mock_client
+ mock_client.rest_delete_async.return_value = None
+
+ # WHEN I call delete_project_setting
+ await storage_location_services.delete_project_setting(
+ setting_id="setting123",
+ synapse_client=None,
+ )
+
+ # THEN I expect the delete to be called
+ mock_client.rest_delete_async.assert_awaited_once_with(
+ uri="/projectSettings/setting123",
+ )
diff --git a/tests/unit/synapseclient/models/unit_test_storage_location.py b/tests/unit/synapseclient/models/unit_test_storage_location.py
new file mode 100644
index 000000000..ee33ad4ff
--- /dev/null
+++ b/tests/unit/synapseclient/models/unit_test_storage_location.py
@@ -0,0 +1,471 @@
+"""Unit tests for the synapseclient.models.StorageLocation class."""
+
+import pytest
+
+from synapseclient.models import StorageLocation, StorageLocationType, UploadType
+
+
+class TestStorageLocation:
+    """Unit tests for basic StorageLocation model functionality.
+
+    Covers enum values, request-body serialization for every storage type,
+    and deserialization via `fill_from_dict`.
+    """
+
+    def test_storage_location_type_concrete_type_values(self):
+        """Test that StorageLocationType instances have the correct concrete_type values."""
+        assert (
+            StorageLocationType.SYNAPSE_S3.concrete_type == "S3StorageLocationSetting"
+        )
+        assert (
+            StorageLocationType.EXTERNAL_S3.concrete_type
+            == "ExternalS3StorageLocationSetting"
+        )
+        assert (
+            StorageLocationType.EXTERNAL_GOOGLE_CLOUD.concrete_type
+            == "ExternalGoogleCloudStorageLocationSetting"
+        )
+        assert (
+            StorageLocationType.EXTERNAL_SFTP.concrete_type
+            == "ExternalStorageLocationSetting"
+        )
+        # EXTERNAL_SFTP and EXTERNAL_HTTPS share the same concreteType but are distinct objects
+        assert (
+            StorageLocationType.EXTERNAL_HTTPS.concrete_type
+            == "ExternalStorageLocationSetting"
+        )
+        assert (
+            StorageLocationType.EXTERNAL_SFTP is not StorageLocationType.EXTERNAL_HTTPS
+        )
+        assert (
+            StorageLocationType.EXTERNAL_OBJECT_STORE.concrete_type
+            == "ExternalObjectStorageLocationSetting"
+        )
+        assert StorageLocationType.PROXY.concrete_type == "ProxyStorageLocationSettings"
+
+    def test_upload_type_enum_values(self):
+        """Test that UploadType enum has correct values."""
+        assert UploadType.S3.value == "S3"
+        assert UploadType.GOOGLE_CLOUD_STORAGE.value == "GOOGLECLOUDSTORAGE"
+        assert UploadType.SFTP.value == "SFTP"
+        assert UploadType.HTTPS.value == "HTTPS"
+        assert UploadType.PROXYLOCAL.value == "PROXYLOCAL"
+        assert UploadType.NONE.value == "NONE"
+
+    def test_to_synapse_request_external_s3(self):
+        """Test generating a request body for EXTERNAL_S3 storage location."""
+        # GIVEN an EXTERNAL_S3 storage location
+        storage = StorageLocation(
+            storage_type=StorageLocationType.EXTERNAL_S3,
+            bucket="my-bucket",
+            base_key="my/prefix",
+            sts_enabled=True,
+            banner="Upload banner",
+            description="Test storage location",
+        )
+
+        # WHEN we generate a request body
+        request_body = storage._to_synapse_request()
+
+        # THEN it should have the correct structure
+        assert request_body == {
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+            "uploadType": "S3",
+            "banner": "Upload banner",
+            "description": "Test storage location",
+            "bucket": "my-bucket",
+            "baseKey": "my/prefix",
+            "stsEnabled": True,
+        }
+
+    def test_to_synapse_request_synapse_s3(self):
+        """Test generating a request body for SYNAPSE_S3 storage location."""
+        # GIVEN a SYNAPSE_S3 storage location
+        storage = StorageLocation(
+            storage_type=StorageLocationType.SYNAPSE_S3,
+            sts_enabled=False,
+        )
+
+        # WHEN we generate a request body
+        request_body = storage._to_synapse_request()
+
+        # THEN it should have the correct structure
+        # (unset banner/description are serialized as explicit None values)
+        assert request_body == {
+            "concreteType": "org.sagebionetworks.repo.model.project.S3StorageLocationSetting",
+            "uploadType": "S3",
+            "banner": None,
+            "description": None,
+            "stsEnabled": False,
+        }
+
+    def test_to_synapse_request_google_cloud(self):
+        """Test generating a request body for EXTERNAL_GOOGLE_CLOUD storage location."""
+        # GIVEN a EXTERNAL_GOOGLE_CLOUD storage location
+        storage = StorageLocation(
+            storage_type=StorageLocationType.EXTERNAL_GOOGLE_CLOUD,
+            bucket="my-gcs-bucket",
+            base_key="gcs/prefix",
+        )
+
+        # WHEN we generate a request body
+        request_body = storage._to_synapse_request()
+
+        # THEN it should have the correct structure
+        assert request_body == {
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting",
+            "uploadType": "GOOGLECLOUDSTORAGE",
+            "banner": None,
+            "description": None,
+            "bucket": "my-gcs-bucket",
+            "baseKey": "gcs/prefix",
+        }
+
+    def test_to_synapse_request_sftp(self):
+        """Test generating a request body for EXTERNAL_SFTP storage location."""
+        # GIVEN an EXTERNAL_SFTP storage location
+        storage = StorageLocation(
+            storage_type=StorageLocationType.EXTERNAL_SFTP,
+            url="sftp://example.com/path",
+            supports_subfolders=True,
+        )
+
+        # WHEN we generate a request body
+        request_body = storage._to_synapse_request()
+
+        # THEN it should have the correct structure with SFTP uploadType.
+        # EXTERNAL_SFTP and EXTERNAL_HTTPS are distinct objects with separate entries
+        # in _STORAGE_TYPE_TO_UPLOAD_TYPE, so EXTERNAL_SFTP correctly maps to SFTP.
+        assert request_body == {
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting",
+            "uploadType": "SFTP",
+            "banner": None,
+            "description": None,
+            "url": "sftp://example.com/path",
+            "supportsSubfolders": True,
+        }
+
+    def test_to_synapse_request_external_https(self):
+        """Test generating a request body for EXTERNAL_HTTPS storage location."""
+        # GIVEN an EXTERNAL_HTTPS storage location
+        storage = StorageLocation(
+            storage_type=StorageLocationType.EXTERNAL_HTTPS,
+            url="https://example.com/data",
+            supports_subfolders=False,
+        )
+
+        # WHEN we generate a request body
+        request_body = storage._to_synapse_request()
+
+        # THEN it should have the same concreteType as EXTERNAL_SFTP but HTTPS uploadType
+        assert request_body == {
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting",
+            "uploadType": "HTTPS",
+            "banner": None,
+            "description": None,
+            "url": "https://example.com/data",
+            "supportsSubfolders": False,
+        }
+
+    def test_to_synapse_request_proxy(self):
+        """Test generating a request body for PROXY storage location."""
+        # GIVEN a PROXY storage location
+        storage = StorageLocation(
+            storage_type=StorageLocationType.PROXY,
+            proxy_url="https://proxy.example.com",
+            secret_key="my-secret-key",
+            benefactor_id="syn123",
+        )
+
+        # WHEN we generate a request body
+        request_body = storage._to_synapse_request()
+
+        # THEN it should have the correct structure
+        assert request_body == {
+            "concreteType": "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings",
+            "uploadType": "PROXYLOCAL",
+            "banner": None,
+            "description": None,
+            "proxyUrl": "https://proxy.example.com",
+            "secretKey": "my-secret-key",
+            "benefactorId": "syn123",
+        }
+
+    def test_to_synapse_request_external_object_store(self):
+        """Test generating a request body for EXTERNAL_OBJECT_STORE storage location."""
+        # GIVEN an EXTERNAL_OBJECT_STORE storage location
+        storage = StorageLocation(
+            storage_type=StorageLocationType.EXTERNAL_OBJECT_STORE,
+            bucket="my-s3-like-bucket",
+            endpoint_url="https://s3.custom.com",
+        )
+
+        # WHEN we generate a request body
+        request_body = storage._to_synapse_request()
+
+        # THEN it should have the correct structure
+        assert request_body == {
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalObjectStorageLocationSetting",
+            "uploadType": "S3",
+            "banner": None,
+            "description": None,
+            "bucket": "my-s3-like-bucket",
+            "endpointUrl": "https://s3.custom.com",
+        }
+
+    def test_to_synapse_request_missing_storage_type(self):
+        """Test that _to_synapse_request raises ValueError when storage_type is missing."""
+        # GIVEN a storage location without a storage_type
+        storage = StorageLocation(
+            bucket="my-bucket",
+        )
+
+        # THEN it should raise ValueError
+        with pytest.raises(ValueError, match="storage_type is required"):
+            storage._to_synapse_request()
+
+    def test_fill_from_dict_external_s3(self):
+        """Test filling from a REST API response for EXTERNAL_S3."""
+        # GIVEN a storage location
+        storage = StorageLocation()
+
+        # AND a response from the REST API
+        # NOTE(review): createdBy is an int here -- confirm the model keeps it
+        # as-is rather than coercing to str.
+        response = {
+            "storageLocationId": 12345,
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalS3StorageLocationSetting",
+            "uploadType": "S3",
+            "bucket": "my-bucket",
+            "baseKey": "my/prefix",
+            "stsEnabled": True,
+            "banner": "Upload banner",
+            "description": "Test storage location",
+            "etag": "abc123",
+            "createdOn": "2024-01-01T00:00:00.000Z",
+            "createdBy": 123456,
+        }
+
+        # WHEN we fill from the response
+        storage.fill_from_dict(response)
+
+        # THEN the storage location should be populated correctly
+        assert storage.storage_location_id == 12345
+        assert storage.storage_type == StorageLocationType.EXTERNAL_S3
+        assert storage.upload_type == UploadType.S3
+        assert storage.bucket == "my-bucket"
+        assert storage.base_key == "my/prefix"
+        assert storage.sts_enabled is True
+        assert storage.banner == "Upload banner"
+        assert storage.description == "Test storage location"
+        assert storage.etag == "abc123"
+        assert storage.created_on == "2024-01-01T00:00:00.000Z"
+        assert storage.created_by == 123456
+
+    def test_fill_from_dict_synapse_s3(self):
+        """Test filling from a REST API response for SYNAPSE_S3."""
+        # GIVEN a storage location
+        storage = StorageLocation()
+
+        # AND a response from the REST API
+        response = {
+            "storageLocationId": 1,
+            "concreteType": "org.sagebionetworks.repo.model.project.S3StorageLocationSetting",
+            "uploadType": "S3",
+        }
+
+        # WHEN we fill from the response
+        storage.fill_from_dict(response)
+
+        # THEN the storage location should be populated correctly
+        assert storage.storage_location_id == 1
+        assert storage.storage_type == StorageLocationType.SYNAPSE_S3
+
+    def test_fill_from_dict_google_cloud(self):
+        """Test filling from a REST API response for EXTERNAL_GOOGLE_CLOUD."""
+        # GIVEN a storage location
+        storage = StorageLocation()
+
+        # AND a response from the REST API
+        response = {
+            "storageLocationId": 67890,
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalGoogleCloudStorageLocationSetting",
+            "uploadType": "GOOGLECLOUDSTORAGE",
+            "bucket": "my-gcs-bucket",
+        }
+
+        # WHEN we fill from the response
+        storage.fill_from_dict(response)
+
+        # THEN the storage location should be populated correctly
+        assert storage.storage_location_id == 67890
+        assert storage.storage_type == StorageLocationType.EXTERNAL_GOOGLE_CLOUD
+        assert storage.upload_type == UploadType.GOOGLE_CLOUD_STORAGE
+        assert storage.bucket == "my-gcs-bucket"
+
+    def test_fill_from_dict_sftp(self):
+        """Test filling from a REST API response for EXTERNAL_SFTP."""
+        # GIVEN a storage location
+        storage = StorageLocation()
+
+        # AND a response from the REST API
+        response = {
+            "storageLocationId": 11111,
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting",
+            "uploadType": "SFTP",
+            "url": "sftp://example.com/path",
+            "supportsSubfolders": True,
+        }
+
+        # WHEN we fill from the response
+        storage.fill_from_dict(response)
+
+        # THEN the storage location should be populated correctly
+        assert storage.storage_location_id == 11111
+        assert storage.storage_type == StorageLocationType.EXTERNAL_SFTP
+        assert storage.upload_type == UploadType.SFTP
+        assert storage.url == "sftp://example.com/path"
+        assert storage.supports_subfolders is True
+
+    def test_fill_from_dict_external_https(self):
+        """Test filling from a REST API response for EXTERNAL_HTTPS."""
+        # GIVEN a storage location
+        storage = StorageLocation()
+
+        # AND a response that shares EXTERNAL_SFTP's concreteType but carries
+        # the HTTPS uploadType, which disambiguates the storage_type
+        response = {
+            "storageLocationId": 11112,
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting",
+            "uploadType": "HTTPS",
+            "url": "https://example.com/data",
+            "supportsSubfolders": False,
+        }
+
+        # WHEN we fill from the response
+        storage.fill_from_dict(response)
+
+        # THEN the storage location should be populated correctly
+        assert storage.storage_location_id == 11112
+        assert storage.storage_type == StorageLocationType.EXTERNAL_HTTPS
+        assert storage.upload_type == UploadType.HTTPS
+        assert storage.url == "https://example.com/data"
+        assert storage.supports_subfolders is False
+
+    def test_fill_from_dict_proxy(self):
+        """Test filling from a REST API response for PROXY."""
+        # GIVEN a storage location
+        storage = StorageLocation()
+
+        # AND a response from the REST API
+        response = {
+            "storageLocationId": 22222,
+            "concreteType": "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings",
+            "uploadType": "PROXYLOCAL",
+            "proxyUrl": "https://proxy.example.com",
+            "secretKey": "my-secret-key",
+            "benefactorId": "syn123",
+        }
+
+        # WHEN we fill from the response
+        storage.fill_from_dict(response)
+
+        # THEN the storage location should be populated correctly
+        assert storage.storage_location_id == 22222
+        assert storage.storage_type == StorageLocationType.PROXY
+        assert storage.upload_type == UploadType.PROXYLOCAL
+        assert storage.proxy_url == "https://proxy.example.com"
+        assert storage.secret_key == "my-secret-key"
+        assert storage.benefactor_id == "syn123"
+
+    def test_fill_from_dict_external_object_store(self):
+        """Test filling from a REST API response for EXTERNAL_OBJECT_STORE."""
+        # GIVEN a storage location
+        storage = StorageLocation()
+
+        # AND a response from the REST API
+        response = {
+            "storageLocationId": 33333,
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalObjectStorageLocationSetting",
+            "uploadType": "S3",
+            "bucket": "my-object-store-bucket",
+            "endpointUrl": "https://s3.custom.com",
+        }
+
+        # WHEN we fill from the response
+        storage.fill_from_dict(response)
+
+        # THEN the storage location should be populated correctly
+        assert storage.storage_location_id == 33333
+        assert storage.storage_type == StorageLocationType.EXTERNAL_OBJECT_STORE
+        assert storage.upload_type == UploadType.S3
+        assert storage.bucket == "my-object-store-bucket"
+        assert storage.endpoint_url == "https://s3.custom.com"
+
+    def test_fill_from_dict_type_isolation(self):
+        """Test that fill_from_dict only populates fields relevant to the storage type."""
+        # GIVEN an EXTERNAL_SFTP response (no S3 or proxy fields)
+        sftp_response = {
+            "storageLocationId": 44444,
+            "concreteType": "org.sagebionetworks.repo.model.project.ExternalStorageLocationSetting",
+            "uploadType": "SFTP",
+            "url": "sftp://example.com/path",
+        }
+        storage = StorageLocation()
+        storage.fill_from_dict(sftp_response)
+
+        # THEN S3/proxy fields are not populated
+        assert storage.bucket is None
+        assert storage.base_key is None
+        assert storage.sts_enabled is None
+        assert storage.endpoint_url is None
+        assert storage.proxy_url is None
+        assert storage.secret_key is None
+
+        # GIVEN a PROXY response (no S3 or SFTP fields)
+        proxy_response = {
+            "storageLocationId": 55555,
+            "concreteType": "org.sagebionetworks.repo.model.project.ProxyStorageLocationSettings",
+            "uploadType": "PROXYLOCAL",
+            "proxyUrl": "https://proxy.example.com",
+        }
+        storage2 = StorageLocation()
+        storage2.fill_from_dict(proxy_response)
+
+        # THEN SFTP/S3 fields are not populated
+        assert storage2.bucket is None
+        assert storage2.url is None
+        assert storage2.supports_subfolders is None
+        assert storage2.sts_enabled is None
+
+    def test_upload_type_enum_coercion_on_init(self):
+        """Test that upload_type string values are coerced to UploadType via EnumCoercionMixin."""
+        # GIVEN a StorageLocation constructed with a string value for upload_type
+        # (upload_type is the only field in _ENUM_FIELDS; storage_type is not coerced)
+        storage = StorageLocation(upload_type="S3")
+
+        # THEN upload_type is coerced to the enum type
+        assert storage.upload_type is UploadType.S3
+
+    def test_upload_type_enum_coercion_on_setattr(self):
+        """Test that assigning a string to upload_type coerces it to the enum type."""
+        # GIVEN a StorageLocation
+        storage = StorageLocation()
+
+        # WHEN we assign a string value to upload_type
+        storage.upload_type = "HTTPS"
+
+        # THEN it is coerced to the enum type
+        assert storage.upload_type is UploadType.HTTPS
+
+
+class TestStorageLocationAsync:
+ """Async unit tests for StorageLocation model."""
+
+ @pytest.mark.asyncio
+ async def test_get_async_missing_id(self):
+ """Test that get_async raises ValueError when storage_location_id is missing."""
+ # GIVEN a storage location without an ID
+ storage = StorageLocation()
+
+ # THEN it should raise ValueError
+ with pytest.raises(ValueError, match="storage_location_id is required"):
+ await storage.get_async()
+
+ @pytest.mark.asyncio
+ async def test_store_async_missing_storage_type(self):
+ """Test that store_async raises ValueError when storage_type is missing."""
+ # GIVEN a storage location without a storage_type
+ storage = StorageLocation(bucket="my-bucket")
+
+ # THEN it should raise ValueError
+ with pytest.raises(ValueError, match="storage_type is required"):
+ await storage.store_async()
diff --git a/tests/unit/synapseclient/services/unit_test_migration_and_types_async.py b/tests/unit/synapseclient/services/unit_test_migration_and_types_async.py
new file mode 100644
index 000000000..a95ad4e27
--- /dev/null
+++ b/tests/unit/synapseclient/services/unit_test_migration_and_types_async.py
@@ -0,0 +1,2481 @@
+"""Unit tests for synapseclient.models.services.migration and migration_types (sync and async)."""
+import asyncio
+import csv
+import json
+import os
+import sqlite3
+import tempfile
+from dataclasses import fields
+from typing import Any, Dict
+from unittest import mock
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from synapseclient.core.constants import concrete_types
+from synapseclient.core.exceptions import SynapseError
+from synapseclient.models.services.migration import (
+ BATCH_SIZE,
+ DEFAULT_PART_SIZE,
+ _check_file_handle_exists,
+ _check_indexed,
+ _confirm_migration,
+ _create_new_file_version_async,
+ _ensure_schema,
+ _escape_column_name,
+ _execute_migration_async,
+ _get_default_db_path,
+ _get_file_migration_status,
+ _get_part_size,
+ _get_table_file_handle_rows_async,
+ _get_version_numbers_async,
+ _index_container_async,
+ _index_entity_async,
+ _index_file_entity_async,
+ _index_table_entity_async,
+ _insert_file_migration,
+ _insert_table_file_migration,
+ _join_column_names,
+ _mark_container_indexed,
+ _migrate_file_version_async,
+ _migrate_item_async,
+ _migrate_table_attached_file_async,
+ _prepare_migration_db,
+ _query_migration_batch,
+ _record_indexing_error,
+ _retrieve_index_settings,
+ _update_migration_database,
+ _verify_storage_location_ownership_async,
+ index_files_for_migration_async,
+ migrate_indexed_files_async,
+ track_migration_results_async,
+)
+from synapseclient.models.services.migration_types import (
+ IndexingError,
+ MigrationError,
+ MigrationKey,
+ MigrationResult,
+ MigrationSettings,
+ MigrationStatus,
+ MigrationType,
+)
+
+# =============================================================================
+# Fixtures
+# =============================================================================
+
+# Dotted import path of the module under test; used as a prefix when patching.
+MODULE = "synapseclient.models.services.migration"
+
+
+@pytest.fixture
+def in_memory_db():
+ """Return an in-memory SQLite connection with schema applied."""
+ conn = sqlite3.connect(":memory:")
+ cursor = conn.cursor()
+ _ensure_schema(cursor)
+ conn.commit()
+ yield conn, cursor
+ conn.close()
+
+
+@pytest.fixture
+def db_file():
+ """Return a path to a temporary SQLite file with schema applied."""
+ with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+ path = f.name
+ with sqlite3.connect(path) as conn:
+ cursor = conn.cursor()
+ _ensure_schema(cursor)
+ conn.commit()
+ yield path
+ os.unlink(path)
+
+
+@pytest.fixture
+def db_file_with_settings():
+ """A temp db file with MigrationSettings already populated."""
+ with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+ path = f.name
+ settings = MigrationSettings(
+ root_id="syn1",
+ dest_storage_location_id="99",
+ source_storage_location_ids=[],
+ file_version_strategy="new",
+ include_table_files=False,
+ )
+ with sqlite3.connect(path) as conn:
+ cursor = conn.cursor()
+ _ensure_schema(cursor)
+ cursor.execute(
+ "INSERT INTO migration_settings (settings) VALUES (?)",
+ (json.dumps(settings.to_dict()),),
+ )
+ conn.commit()
+ yield path, settings
+ os.unlink(path)
+
+
+def _populate_db(db_path: str) -> None:
+ """Insert sample rows into a migration database for MigrationResult tests."""
+ rows = [
+ # (id, type, version, row_id, col_id, parent_id, status, exception, from_sl, from_fh, to_fh, file_size)
+ (
+ "syn1",
+ MigrationType.PROJECT.value,
+ None,
+ None,
+ None,
+ None,
+ MigrationStatus.INDEXED.value,
+ None,
+ None,
+ None,
+ None,
+ None,
+ ),
+ (
+ "syn2",
+ MigrationType.FOLDER.value,
+ None,
+ None,
+ None,
+ "syn1",
+ MigrationStatus.INDEXED.value,
+ None,
+ None,
+ None,
+ None,
+ None,
+ ),
+ (
+ "syn3",
+ MigrationType.FILE.value,
+ 1,
+ None,
+ None,
+ "syn1",
+ MigrationStatus.MIGRATED.value,
+ None,
+ "10",
+ "fh_a",
+ "fh_b",
+ 1024,
+ ),
+ (
+ "syn4",
+ MigrationType.TABLE_ATTACHED_FILE.value,
+ 2,
+ 5,
+ 7,
+ "syn1",
+ MigrationStatus.MIGRATED.value,
+ None,
+ "10",
+ "fh_c",
+ "fh_d",
+ 512,
+ ),
+ (
+ "syn5",
+ MigrationType.FILE.value,
+ 3,
+ None,
+ None,
+ "syn1",
+ MigrationStatus.ERRORED.value,
+ "boom",
+ None,
+ None,
+ None,
+ None,
+ ),
+ (
+ "syn6",
+ MigrationType.FILE.value,
+ 4,
+ None,
+ None,
+ "syn1",
+ MigrationStatus.INDEXED.value,
+ None,
+ "10",
+ "fh_e",
+ None,
+ 256,
+ ),
+ (
+ "syn7",
+ MigrationType.FILE.value,
+ 5,
+ None,
+ None,
+ "syn1",
+ MigrationStatus.ALREADY_MIGRATED.value,
+ None,
+ "20",
+ "fh_f",
+ None,
+ 128,
+ ),
+ ]
+ with sqlite3.connect(db_path) as conn:
+ cursor = conn.cursor()
+ _ensure_schema(cursor)
+ cursor.executemany(
+ """
+ INSERT INTO migrations (id, type, version, row_id, col_id, parent_id, status, exception,
+ from_storage_location_id, from_file_handle_id, to_file_handle_id, file_size)
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ """,
+ rows,
+ )
+ conn.commit()
+
+
+@pytest.fixture
+def result_db():
+ with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+ path = f.name
+ _populate_db(path)
+ yield path
+ os.unlink(path)
+
+
+def _make_mock_client():
+ client = MagicMock()
+ client.rest_get_async = AsyncMock()
+ client.rest_put_async = AsyncMock()
+ client.logger = MagicMock()
+ return client
+
+
+def _make_file_handle(
+ concrete_type=None, storage_location_id="10", content_size=1024, fh_id="fh1"
+):
+ fh = MagicMock()
+ fh.concrete_type = concrete_type or concrete_types.S3_FILE_HANDLE
+ fh.storage_location_id = storage_location_id
+ fh.content_size = content_size
+ fh.id = fh_id
+ return fh
+
+
+def _make_entity(
+ entity_id="syn3", version_number=1, file_handle=None, data_file_handle_id="fh1"
+):
+ entity = MagicMock()
+ entity.id = entity_id
+ entity.version_number = version_number
+ entity.file_handle = file_handle or _make_file_handle()
+ entity.data_file_handle_id = data_file_handle_id
+ entity.dataFileHandleId = data_file_handle_id
+ entity.store_async = AsyncMock()
+ return entity
+
+
+async def _aiter(*items):
+ """Helper: yield items from an async generator."""
+ for item in items:
+ yield item
+
+
+# =============================================================================
+# migration_types.py – MigrationStatus
+# =============================================================================
+
+
+class TestMigrationStatus:
+ def test_values(self):
+ assert MigrationStatus.INDEXED.value == 1
+ assert MigrationStatus.MIGRATED.value == 2
+ assert MigrationStatus.ALREADY_MIGRATED.value == 3
+ assert MigrationStatus.ERRORED.value == 4
+
+ def test_names(self):
+ assert MigrationStatus(1).name == "INDEXED"
+ assert MigrationStatus(2).name == "MIGRATED"
+ assert MigrationStatus(3).name == "ALREADY_MIGRATED"
+ assert MigrationStatus(4).name == "ERRORED"
+
+
+# =============================================================================
+# migration_types.py – MigrationType
+# =============================================================================
+
+
+class TestMigrationType:
+ def test_values(self):
+ assert MigrationType.PROJECT.value == 1
+ assert MigrationType.FOLDER.value == 2
+ assert MigrationType.FILE.value == 3
+ assert MigrationType.TABLE_ATTACHED_FILE.value == 4
+
+ @pytest.mark.parametrize(
+ "concrete_type,expected",
+ [
+ (concrete_types.PROJECT_ENTITY, MigrationType.PROJECT),
+ (concrete_types.FOLDER_ENTITY, MigrationType.FOLDER),
+ (concrete_types.FILE_ENTITY, MigrationType.FILE),
+ (concrete_types.TABLE_ENTITY, MigrationType.TABLE_ATTACHED_FILE),
+ ],
+ )
+ def test_from_concrete_type(self, concrete_type, expected):
+ assert MigrationType.from_concrete_type(concrete_type) == expected
+
+ def test_from_concrete_type_unknown_raises(self):
+ with pytest.raises(ValueError, match="Unhandled concrete type"):
+ MigrationType.from_concrete_type("org.sagebionetworks.repo.model.Unknown")
+
+
+# =============================================================================
+# migration_types.py – MigrationKey
+# =============================================================================
+
+
+class TestMigrationKey:
+ def test_equality_same(self):
+ k1 = MigrationKey("syn1", MigrationType.FILE, version=2)
+ k2 = MigrationKey("syn1", MigrationType.FILE, version=2)
+ assert k1 == k2
+
+ def test_equality_different_version(self):
+ k1 = MigrationKey("syn1", MigrationType.FILE, version=1)
+ k2 = MigrationKey("syn1", MigrationType.FILE, version=2)
+ assert k1 != k2
+
+ def test_equality_different_id(self):
+ k1 = MigrationKey("syn1", MigrationType.FILE)
+ k2 = MigrationKey("syn2", MigrationType.FILE)
+ assert k1 != k2
+
+ def test_equality_different_type(self):
+ k1 = MigrationKey("syn1", MigrationType.FILE)
+ k2 = MigrationKey("syn1", MigrationType.TABLE_ATTACHED_FILE)
+ assert k1 != k2
+
+ def test_equality_with_row_col(self):
+ k1 = MigrationKey("syn1", MigrationType.TABLE_ATTACHED_FILE, row_id=1, col_id=2)
+ k2 = MigrationKey("syn1", MigrationType.TABLE_ATTACHED_FILE, row_id=1, col_id=2)
+ assert k1 == k2
+
+ def test_not_equal_to_other_type(self):
+ k = MigrationKey("syn1", MigrationType.FILE)
+ assert k != "not a key"
+
+ def test_hashable_usable_in_set(self):
+ k1 = MigrationKey("syn1", MigrationType.FILE, version=1)
+ k2 = MigrationKey("syn1", MigrationType.FILE, version=1)
+ k3 = MigrationKey("syn2", MigrationType.FILE, version=1)
+ s = {k1, k2, k3}
+ assert len(s) == 2
+
+ def test_default_optional_fields_are_none(self):
+ k = MigrationKey("syn1", MigrationType.FOLDER)
+ assert k.version is None
+ assert k.row_id is None
+ assert k.col_id is None
+
+
+# =============================================================================
+# migration_types.py – MigrationSettings
+# =============================================================================
+
+
+class TestMigrationSettings:
+    """Serialization round-trips and verification logic of MigrationSettings."""
+
+    def _make_settings(self, **kwargs):
+        # Build a settings object with sensible defaults; kwargs override fields.
+        defaults = dict(
+            root_id="syn1",
+            dest_storage_location_id="123",
+            source_storage_location_ids=["10", "20"],
+            file_version_strategy="new",
+            include_table_files=False,
+        )
+        defaults.update(kwargs)
+        return MigrationSettings(**defaults)
+
+    def test_to_dict_round_trip(self):
+        """to_dict preserves every field; the boolean serializes as 0/1."""
+        s = self._make_settings()
+        d = s.to_dict()
+        assert d["root_id"] == "syn1"
+        assert d["dest_storage_location_id"] == "123"
+        assert d["source_storage_location_ids"] == ["10", "20"]
+        assert d["file_version_strategy"] == "new"
+        assert d["include_table_files"] == 0
+
+    def test_to_dict_include_table_files_true(self):
+        """A True flag serializes as the integer 1."""
+        s = self._make_settings(include_table_files=True)
+        assert s.to_dict()["include_table_files"] == 1
+
+    def test_from_dict(self):
+        """from_dict restores all fields, coercing 1 back to True."""
+        d = {
+            "root_id": "syn5",
+            "dest_storage_location_id": "99",
+            "source_storage_location_ids": ["5"],
+            "file_version_strategy": "all",
+            "include_table_files": 1,
+        }
+        s = MigrationSettings.from_dict(d)
+        assert s.root_id == "syn5"
+        assert s.dest_storage_location_id == "99"
+        assert s.source_storage_location_ids == ["5"]
+        assert s.file_version_strategy == "all"
+        assert s.include_table_files is True
+
+    def test_from_dict_int_false(self):
+        """The integer 0 deserializes to the boolean False."""
+        d = {
+            "root_id": "syn5",
+            "dest_storage_location_id": "99",
+            "include_table_files": 0,
+        }
+        s = MigrationSettings.from_dict(d)
+        assert s.include_table_files is False
+
+    def test_from_dict_missing_optional_fields(self):
+        """Omitted optional fields fall back to their defaults."""
+        d = {"root_id": "syn1", "dest_storage_location_id": "5"}
+        s = MigrationSettings.from_dict(d)
+        assert s.source_storage_location_ids == []
+        assert s.file_version_strategy == "new"
+        assert s.include_table_files is False
+
+    def test_verify_migration_settings_matching(self):
+        """Verifying settings against an identical object raises nothing."""
+        s = self._make_settings()
+        # Should not raise
+        s.verify_migration_settings(s, "/tmp/test.db")
+
+    @pytest.mark.parametrize(
+        "field_name,bad_value",
+        [
+            ("root_id", "syn999"),
+            ("dest_storage_location_id", "9999"),
+            ("file_version_strategy", "all"),
+            ("include_table_files", True),
+        ],
+    )
+    def test_verify_migration_settings_mismatch_raises(self, field_name, bad_value):
+        """Any single mismatched field triggers a ValueError."""
+        existing = self._make_settings()
+        current_kwargs = {field_name: bad_value}
+        current = self._make_settings(**current_kwargs)
+        with pytest.raises(ValueError, match="Index parameter does not match"):
+            current.verify_migration_settings(existing, "/tmp/test.db")
+
+
+# =============================================================================
+# migration_types.py – IndexingError
+# =============================================================================
+
+
+class TestIndexingError:
+ def test_attributes(self):
+ err = IndexingError("syn42", concrete_types.FILE_ENTITY)
+ assert err.entity_id == "syn42"
+ assert err.concrete_type == concrete_types.FILE_ENTITY
+
+ def test_is_exception(self):
+ assert issubclass(IndexingError, Exception)
+
+
+# =============================================================================
+# migration_types.py – MigrationError
+# =============================================================================
+
+
+class TestMigrationError:
+ def test_basic_message(self):
+ key = MigrationKey("syn1", MigrationType.FILE)
+ err = MigrationError(key, from_file_handle_id="fh1")
+ assert "syn1" in str(err)
+ assert err.key is key
+ assert err.from_file_handle_id == "fh1"
+ assert err.to_file_handle_id is None
+
+ def test_with_cause(self):
+ key = MigrationKey("syn1", MigrationType.FILE)
+ cause = RuntimeError("network failure")
+ err = MigrationError(key, from_file_handle_id="fh1", cause=cause)
+ assert "network failure" in str(err)
+
+ def test_with_to_handle(self):
+ key = MigrationKey("syn1", MigrationType.FILE)
+ err = MigrationError(key, from_file_handle_id="fh1", to_file_handle_id="fh2")
+ assert err.to_file_handle_id == "fh2"
+
+ def test_is_exception(self):
+ assert issubclass(MigrationError, Exception)
+
+
+# =============================================================================
+# migration_types.py – MigrationResult
+# =============================================================================
+
+
+class TestMigrationResult:
+    """Reads of the migration result database: counts, listings, CSV export.
+
+    Relies on the result_db fixture, which (per the assertions below) seeds
+    syn1 (PROJECT), syn2 (FOLDER), a MIGRATED file syn3, a table-attached
+    entry syn4, plus one ERRORED, one INDEXED, and one ALREADY_MIGRATED row.
+    """
+
+    def test_get_counts_by_status(self, result_db):
+        """Counts are grouped by status name for non-container entries."""
+        result = MigrationResult(db_path=result_db)
+        counts = result.get_counts_by_status()
+        # Containers (PROJECT, FOLDER) are excluded from counts
+        assert counts["MIGRATED"] == 2
+        assert counts["ERRORED"] == 1
+        assert counts["INDEXED"] == 1
+        assert counts["ALREADY_MIGRATED"] == 1
+
+    def test_counts_by_status_property(self, result_db):
+        """The property is a convenience wrapper over get_counts_by_status."""
+        result = MigrationResult(db_path=result_db)
+        assert result.counts_by_status == result.get_counts_by_status()
+
+    def test_get_migrations_returns_only_file_and_table(self, result_db):
+        """Listings contain only file/table entries, never containers."""
+        result = MigrationResult(db_path=result_db)
+        migrations = list(result.get_migrations())
+        types = {m["type"] for m in migrations}
+        assert types <= {"file", "table"}
+
+    def test_get_migrations_file_entry(self, result_db):
+        """A file entry exposes version, status, and both file handle ids."""
+        result = MigrationResult(db_path=result_db)
+        migrations = list(result.get_migrations())
+        file_migrations = [m for m in migrations if m["id"] == "syn3"]
+        assert len(file_migrations) == 1
+        m = file_migrations[0]
+        assert m["type"] == "file"
+        assert m["version"] == 1
+        assert m["status"] == "MIGRATED"
+        assert m["from_file_handle_id"] == "fh_a"
+        assert m["to_file_handle_id"] == "fh_b"
+
+    def test_get_migrations_table_entry(self, result_db):
+        """A table-attached entry exposes its row id."""
+        result = MigrationResult(db_path=result_db)
+        migrations = list(result.get_migrations())
+        table_migrations = [m for m in migrations if m["id"] == "syn4"]
+        assert len(table_migrations) == 1
+        m = table_migrations[0]
+        assert m["type"] == "table"
+        assert m["row_id"] == 5
+
+    def test_get_migrations_error_entry(self, result_db):
+        """Errored entries carry the recorded exception text."""
+        result = MigrationResult(db_path=result_db)
+        migrations = list(result.get_migrations())
+        errored = [m for m in migrations if m["status"] == "ERRORED"]
+        assert len(errored) == 1
+        assert errored[0]["exception"] == "boom"
+
+    def test_get_migrations_col_name_resolved_via_client(self, result_db):
+        """Column names are resolved via the Synapse client's restGET call."""
+        mock_client = mock.MagicMock()
+        mock_client.restGET.return_value = {"name": "my_col"}
+        result = MigrationResult(db_path=result_db, synapse_client=mock_client)
+        migrations = list(result.get_migrations())
+        table_m = [m for m in migrations if m["type"] == "table"][0]
+        assert table_m["col_name"] == "my_col"
+
+    def test_as_csv(self, result_db):
+        """CSV export includes file/table rows with the standard header fields."""
+        result = MigrationResult(db_path=result_db)
+        # The with-block is used only to reserve a unique temp path; the file
+        # is closed before as_csv writes to it.
+        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False, mode="w") as f:
+            csv_path = f.name
+        try:
+            result.as_csv(csv_path)
+            with open(csv_path, newline="") as f:
+                reader = csv.DictReader(f)
+                rows = list(reader)
+                ids = {r["id"] for r in rows}
+                # Should include file and table-attached entries, not containers
+                assert "syn3" in ids
+                assert "syn4" in ids
+                assert "syn1" not in ids  # PROJECT
+                assert "syn2" not in ids  # FOLDER
+                assert "id" in reader.fieldnames
+                assert "status" in reader.fieldnames
+        finally:
+            os.unlink(csv_path)
+
+    @pytest.mark.asyncio
+    async def test_get_counts_by_status_async(self, result_db):
+        """The async variant returns the same grouped counts."""
+        result = MigrationResult(db_path=result_db)
+        counts = await result.get_counts_by_status_async()
+        assert counts["MIGRATED"] == 2
+
+    @pytest.mark.asyncio
+    async def test_get_migrations_async(self, result_db):
+        """The async variant returns a non-empty list of migration dicts."""
+        result = MigrationResult(db_path=result_db)
+        migrations = await result.get_migrations_async()
+        assert isinstance(migrations, list)
+        assert len(migrations) > 0
+
+    @pytest.mark.asyncio
+    async def test_as_csv_async(self, result_db):
+        """The async CSV export writes a file containing the header."""
+        result = MigrationResult(db_path=result_db)
+        with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as f:
+            csv_path = f.name
+        try:
+            await result.as_csv_async(csv_path)
+            assert os.path.exists(csv_path)
+            with open(csv_path) as f:
+                content = f.read()
+                assert "id" in content
+        finally:
+            os.unlink(csv_path)
+
+
+# =============================================================================
+# migration.py – pure helper functions
+# =============================================================================
+
+
+class TestGetDefaultDbPath:
+ def test_returns_path_with_entity_id(self):
+ path = _get_default_db_path("syn123")
+ assert "migration_syn123.db" in path
+ assert os.path.exists(os.path.dirname(path))
+
+
+class TestEscapeColumnName:
+ def test_plain_string(self):
+ assert _escape_column_name("my_col") == '"my_col"'
+
+ def test_dict_with_name_key(self):
+ assert _escape_column_name({"name": "col_name"}) == '"col_name"'
+
+ def test_escapes_double_quotes(self):
+ assert _escape_column_name('col"name') == '"col""name"'
+
+ def test_dict_escapes_double_quotes(self):
+ assert _escape_column_name({"name": 'a"b'}) == '"a""b"'
+
+
+class TestJoinColumnNames:
+ def test_single(self):
+ assert _join_column_names(["col1"]) == '"col1"'
+
+ def test_multiple(self):
+ result = _join_column_names(["a", "b", "c"])
+ assert result == '"a","b","c"'
+
+ def test_dict_columns(self):
+ cols = [{"name": "x"}, {"name": "y"}]
+ assert _join_column_names(cols) == '"x","y"'
+
+
+class TestGetPartSize:
+ def test_small_file_uses_default(self):
+ size = 1 * 1024 * 1024 # 1 MB
+ assert _get_part_size(size) == DEFAULT_PART_SIZE
+
+ def test_large_file_exceeds_default(self):
+ from synapseclient.core.upload.multipart_upload import MAX_NUMBER_OF_PARTS
+
+ # File so large that default part size would require too many parts
+ size = DEFAULT_PART_SIZE * MAX_NUMBER_OF_PARTS + 1
+ part_size = _get_part_size(size)
+ assert part_size > DEFAULT_PART_SIZE
+
+
+class TestGetFileMigrationStatus:
+ def _make_handle(self, concrete_type, storage_location_id):
+ handle = mock.MagicMock()
+ handle.concrete_type = concrete_type
+ handle.storage_location_id = storage_location_id
+ return handle
+
+ def test_non_s3_handle_returns_none(self):
+ handle = self._make_handle(
+ "org.sagebionetworks.repo.model.file.ExternalFileHandle", "10"
+ )
+ result = _get_file_migration_status(handle, [], "20")
+ assert result is None
+
+ def test_already_at_destination_returns_already_migrated(self):
+ handle = self._make_handle(concrete_types.S3_FILE_HANDLE, "20")
+ result = _get_file_migration_status(handle, [], "20")
+ assert result == MigrationStatus.ALREADY_MIGRATED.value
+
+ def test_no_source_filter_returns_indexed(self):
+ handle = self._make_handle(concrete_types.S3_FILE_HANDLE, "10")
+ result = _get_file_migration_status(handle, [], "20")
+ assert result == MigrationStatus.INDEXED.value
+
+ def test_source_filter_match_returns_indexed(self):
+ handle = self._make_handle(concrete_types.S3_FILE_HANDLE, "10")
+ result = _get_file_migration_status(handle, ["10", "11"], "20")
+ assert result == MigrationStatus.INDEXED.value
+
+ def test_source_filter_no_match_returns_none(self):
+ handle = self._make_handle(concrete_types.S3_FILE_HANDLE, "99")
+ result = _get_file_migration_status(handle, ["10", "11"], "20")
+ assert result is None
+
+
+# =============================================================================
+# migration.py – database helper functions
+# =============================================================================
+
+
+class TestEnsureSchema:
+ def test_creates_migrations_table(self, in_memory_db):
+ conn, cursor = in_memory_db
+ tables = cursor.execute(
+ "SELECT name FROM sqlite_master WHERE type='table'"
+ ).fetchall()
+ table_names = {t[0] for t in tables}
+ assert "migrations" in table_names
+ assert "migration_settings" in table_names
+
+ def test_idempotent(self, in_memory_db):
+ conn, cursor = in_memory_db
+ # Running again should not raise
+ _ensure_schema(cursor)
+
+
+class TestCheckIndexed:
+ def test_not_indexed(self, in_memory_db):
+ conn, cursor = in_memory_db
+ assert _check_indexed(cursor, "syn999") is False
+
+ def test_indexed(self, in_memory_db):
+ conn, cursor = in_memory_db
+ cursor.execute(
+ "INSERT INTO migrations (id, type, status) VALUES (?, ?, ?)",
+ ("syn1", MigrationType.FILE.value, MigrationStatus.INDEXED.value),
+ )
+ conn.commit()
+ assert _check_indexed(cursor, "syn1") is True
+
+
+class TestMarkContainerIndexed:
+ def test_inserts_row(self, in_memory_db):
+ conn, cursor = in_memory_db
+ # Callers always pass migration_type as .value (int)
+ _mark_container_indexed(cursor, "syn10", MigrationType.FOLDER.value, "syn1")
+ conn.commit()
+ row = cursor.execute(
+ "SELECT id, type, parent_id, status FROM migrations WHERE id = 'syn10'"
+ ).fetchone()
+ assert row is not None
+ assert row[1] == MigrationType.FOLDER.value
+ assert row[2] == "syn1"
+ assert row[3] == MigrationStatus.INDEXED.value
+
+ def test_check_indexed_prevents_double_insert(self, in_memory_db):
+ """In practice, _check_indexed guards against re-indexing containers.
+ After marking a container indexed, _check_indexed should return True."""
+ conn, cursor = in_memory_db
+ _mark_container_indexed(cursor, "syn10", MigrationType.FOLDER.value, "syn1")
+ conn.commit()
+ assert _check_indexed(cursor, "syn10") is True
+
+
+class TestRecordIndexingError:
+ def test_inserts_error_row(self, in_memory_db):
+ conn, cursor = in_memory_db
+ _record_indexing_error(
+ cursor, "syn11", MigrationType.FILE.value, "syn1", "Traceback..."
+ )
+ conn.commit()
+ row = cursor.execute(
+ "SELECT status, exception FROM migrations WHERE id='syn11'"
+ ).fetchone()
+ assert row[0] == MigrationStatus.ERRORED.value
+ assert row[1] == "Traceback..."
+
+
+class TestInsertFileMigration:
+ def test_inserts_row(self, in_memory_db):
+ conn, cursor = in_memory_db
+ insert_values = [
+ (
+ "syn3",
+ MigrationType.FILE.value,
+ 1,
+ "syn1",
+ "10",
+ "fh_a",
+ 1024,
+ MigrationStatus.INDEXED.value,
+ ),
+ ]
+ _insert_file_migration(cursor, insert_values)
+ conn.commit()
+ row = cursor.execute(
+ "SELECT id, type, version, from_file_handle_id FROM migrations WHERE id='syn3'"
+ ).fetchone()
+ assert row is not None
+ assert row[2] == 1
+ assert row[3] == "fh_a"
+
+ def test_inserts_multiple(self, in_memory_db):
+ conn, cursor = in_memory_db
+ insert_values = [
+ (
+ "syn3",
+ MigrationType.FILE.value,
+ 1,
+ "syn1",
+ "10",
+ "fh_a",
+ 1024,
+ MigrationStatus.INDEXED.value,
+ ),
+ (
+ "syn4",
+ MigrationType.FILE.value,
+ 2,
+ "syn1",
+ "10",
+ "fh_b",
+ 2048,
+ MigrationStatus.INDEXED.value,
+ ),
+ ]
+ _insert_file_migration(cursor, insert_values)
+ conn.commit()
+ count = cursor.execute("SELECT count(*) FROM migrations").fetchone()[0]
+ assert count == 2
+
+
+class TestInsertTableFileMigration:
+ def test_inserts_row(self, in_memory_db):
+ conn, cursor = in_memory_db
+ insert_values = [
+ (
+ "syn5",
+ MigrationType.TABLE_ATTACHED_FILE.value,
+ 1,
+ 2,
+ 3,
+ "syn1",
+ "10",
+ "fh_x",
+ 512,
+ MigrationStatus.INDEXED.value,
+ ),
+ ]
+ _insert_table_file_migration(cursor, insert_values)
+ conn.commit()
+ row = cursor.execute(
+ "SELECT id, row_id, col_id FROM migrations WHERE id='syn5'"
+ ).fetchone()
+ assert row is not None
+ assert row[1] == 1
+ assert row[2] == 2
+
+ def test_ignore_on_duplicate(self, in_memory_db):
+ conn, cursor = in_memory_db
+ insert_values = [
+ (
+ "syn5",
+ MigrationType.TABLE_ATTACHED_FILE.value,
+ 1,
+ 2,
+ 3,
+ "syn1",
+ "10",
+ "fh_x",
+ 512,
+ MigrationStatus.INDEXED.value,
+ ),
+ (
+ "syn5",
+ MigrationType.TABLE_ATTACHED_FILE.value,
+ 1,
+ 2,
+ 3,
+ "syn1",
+ "10",
+ "fh_x",
+ 512,
+ MigrationStatus.INDEXED.value,
+ ),
+ ]
+ _insert_table_file_migration(cursor, insert_values)
+ conn.commit()
+ count = cursor.execute(
+ "SELECT count(*) FROM migrations WHERE id='syn5'"
+ ).fetchone()[0]
+ assert count == 1
+
+
+class TestRetrieveIndexSettings:
+ def test_returns_none_when_empty(self, in_memory_db):
+ conn, cursor = in_memory_db
+ assert _retrieve_index_settings(cursor) is None
+
+ def test_returns_settings_when_present(self, in_memory_db):
+ conn, cursor = in_memory_db
+ settings = MigrationSettings(
+ root_id="syn1",
+ dest_storage_location_id="99",
+ source_storage_location_ids=["5"],
+ file_version_strategy="all",
+ include_table_files=True,
+ )
+ cursor.execute(
+ "INSERT INTO migration_settings (settings) VALUES (?)",
+ (json.dumps(settings.to_dict()),),
+ )
+ conn.commit()
+ retrieved = _retrieve_index_settings(cursor)
+ assert retrieved.root_id == "syn1"
+ assert retrieved.dest_storage_location_id == "99"
+ assert retrieved.include_table_files is True
+
+
+class TestPrepareMigrationDb:
+ def test_inserts_settings_on_first_run(self, in_memory_db):
+ conn, cursor = in_memory_db
+ _prepare_migration_db(
+ conn=conn,
+ cursor=cursor,
+ db_path=":memory:",
+ root_id="syn1",
+ dest_storage_location_id="99",
+ source_storage_location_ids=["5"],
+ file_version_strategy="new",
+ include_table_files=False,
+ )
+ retrieved = _retrieve_index_settings(cursor)
+ assert retrieved is not None
+ assert retrieved.root_id == "syn1"
+
+ def test_no_error_on_matching_settings(self, in_memory_db):
+ conn, cursor = in_memory_db
+ kwargs = dict(
+ conn=conn,
+ cursor=cursor,
+ db_path=":memory:",
+ root_id="syn1",
+ dest_storage_location_id="99",
+ source_storage_location_ids=["5"],
+ file_version_strategy="new",
+ include_table_files=False,
+ )
+ _prepare_migration_db(**kwargs)
+ # Should not raise on second call with same settings
+ _prepare_migration_db(**kwargs)
+
+ def test_raises_on_mismatched_settings(self, in_memory_db):
+ conn, cursor = in_memory_db
+ _prepare_migration_db(
+ conn=conn,
+ cursor=cursor,
+ db_path=":memory:",
+ root_id="syn1",
+ dest_storage_location_id="99",
+ source_storage_location_ids=[],
+ file_version_strategy="new",
+ include_table_files=False,
+ )
+ with pytest.raises(ValueError, match="Index parameter does not match"):
+ _prepare_migration_db(
+ conn=conn,
+ cursor=cursor,
+ db_path=":memory:",
+ root_id="syn_different", # changed
+ dest_storage_location_id="99",
+ source_storage_location_ids=[],
+ file_version_strategy="new",
+ include_table_files=False,
+ )
+
+
+class TestCheckFileHandleExists:
+ def test_returns_none_when_not_found(self, in_memory_db):
+ conn, cursor = in_memory_db
+ assert _check_file_handle_exists(cursor, "fh_missing") is None
+
+ def test_returns_to_handle_when_found(self, in_memory_db):
+ conn, cursor = in_memory_db
+ cursor.execute(
+ """INSERT INTO migrations (id, type, status, from_file_handle_id, to_file_handle_id)
+ VALUES (?, ?, ?, ?, ?)""",
+ (
+ "syn1",
+ MigrationType.FILE.value,
+ MigrationStatus.MIGRATED.value,
+ "fh_a",
+ "fh_b",
+ ),
+ )
+ conn.commit()
+ assert _check_file_handle_exists(cursor, "fh_a") == "fh_b"
+
+ def test_returns_none_when_to_handle_is_null(self, in_memory_db):
+ conn, cursor = in_memory_db
+ cursor.execute(
+ """INSERT INTO migrations (id, type, status, from_file_handle_id)
+ VALUES (?, ?, ?, ?)""",
+ ("syn1", MigrationType.FILE.value, MigrationStatus.INDEXED.value, "fh_a"),
+ )
+ conn.commit()
+ assert _check_file_handle_exists(cursor, "fh_a") is None
+
+
+class TestUpdateMigrationDatabase:
+ def _insert_indexed_file(self, cursor, entity_id="syn1", version=1):
+ cursor.execute(
+ """INSERT INTO migrations (id, type, version, status, from_file_handle_id)
+ VALUES (?, ?, ?, ?, ?)""",
+ (
+ entity_id,
+ MigrationType.FILE.value,
+ version,
+ MigrationStatus.INDEXED.value,
+ "fh_src",
+ ),
+ )
+
+ def test_updates_to_migrated(self, in_memory_db):
+ conn, cursor = in_memory_db
+ self._insert_indexed_file(cursor)
+ conn.commit()
+ key = MigrationKey("syn1", MigrationType.FILE, version=1)
+ # Callers always pass status as .value (int)
+ _update_migration_database(
+ conn, cursor, key, "fh_dest", MigrationStatus.MIGRATED.value
+ )
+ row = cursor.execute(
+ "SELECT status, to_file_handle_id FROM migrations WHERE id='syn1'"
+ ).fetchone()
+ assert row[0] == MigrationStatus.MIGRATED.value
+ assert row[1] == "fh_dest"
+
+ def test_stores_exception_traceback(self, in_memory_db):
+ conn, cursor = in_memory_db
+ self._insert_indexed_file(cursor)
+ conn.commit()
+ key = MigrationKey("syn1", MigrationType.FILE, version=1)
+ cause = RuntimeError("disk full")
+ _update_migration_database(
+ conn, cursor, key, None, MigrationStatus.ERRORED.value, exception=cause
+ )
+ row = cursor.execute(
+ "SELECT status, exception FROM migrations WHERE id='syn1'"
+ ).fetchone()
+ assert row[0] == MigrationStatus.ERRORED.value
+ assert "disk full" in row[1]
+
+
+class TestConfirmMigration:
+ def test_force_returns_true(self, in_memory_db):
+ conn, cursor = in_memory_db
+ assert _confirm_migration(cursor, "99", force=True) is True
+
+ def test_no_items_returns_false(self, in_memory_db):
+ conn, cursor = in_memory_db
+ assert _confirm_migration(cursor, "99", force=False) is False
+
+ def test_non_tty_returns_false_without_input(self, in_memory_db):
+ conn, cursor = in_memory_db
+ cursor.execute(
+ "INSERT INTO migrations (id, type, status) VALUES (?, ?, ?)",
+ ("syn1", MigrationType.FILE.value, MigrationStatus.INDEXED.value),
+ )
+ conn.commit()
+ with mock.patch("sys.stdout") as mock_stdout:
+ mock_stdout.isatty.return_value = False
+ result = _confirm_migration(cursor, "99", force=False)
+ assert result is False
+
+ def test_tty_yes_returns_true(self, in_memory_db):
+ conn, cursor = in_memory_db
+ cursor.execute(
+ "INSERT INTO migrations (id, type, status) VALUES (?, ?, ?)",
+ ("syn1", MigrationType.FILE.value, MigrationStatus.INDEXED.value),
+ )
+ conn.commit()
+ with mock.patch("sys.stdout") as mock_stdout, mock.patch(
+ "builtins.input", return_value="y"
+ ):
+ mock_stdout.isatty.return_value = True
+ result = _confirm_migration(cursor, "99", force=False)
+ assert result is True
+
+ def test_tty_no_returns_false(self, in_memory_db):
+ conn, cursor = in_memory_db
+ cursor.execute(
+ "INSERT INTO migrations (id, type, status) VALUES (?, ?, ?)",
+ ("syn1", MigrationType.FILE.value, MigrationStatus.INDEXED.value),
+ )
+ conn.commit()
+ with mock.patch("sys.stdout") as mock_stdout, mock.patch(
+ "builtins.input", return_value="n"
+ ):
+ mock_stdout.isatty.return_value = True
+ result = _confirm_migration(cursor, "99", force=False)
+ assert result is False
+
+
+class TestQueryMigrationBatch:
+    """Selection of the next batch of INDEXED rows eligible for migration."""
+
+    def _insert_indexed(
+        self,
+        cursor,
+        entity_id,
+        migration_type,
+        version=None,
+        row_id=None,
+        col_id=None,
+        from_fh="fh_x",
+    ):
+        # Seed one INDEXED row; optional table coordinates default to NULL.
+        cursor.execute(
+            """INSERT INTO migrations (id, type, version, row_id, col_id, status, from_file_handle_id)
+            VALUES (?, ?, ?, ?, ?, ?, ?)""",
+            (
+                entity_id,
+                migration_type.value,
+                version,
+                row_id,
+                col_id,
+                MigrationStatus.INDEXED.value,
+                from_fh,
+            ),
+        )
+
+    def test_returns_forward_progress(self, in_memory_db):
+        """Starting from an empty key, all indexed rows are returned."""
+        conn, cursor = in_memory_db
+        self._insert_indexed(
+            cursor, "syn2", MigrationType.FILE, version=1, from_fh="fh_1"
+        )
+        self._insert_indexed(
+            cursor, "syn3", MigrationType.FILE, version=1, from_fh="fh_2"
+        )
+        conn.commit()
+
+        start_key = MigrationKey("", MigrationType.FILE)
+        results = _query_migration_batch(cursor, start_key, set(), set(), limit=10)
+        ids = [r["id"] for r in results]
+        assert "syn2" in ids
+        assert "syn3" in ids
+
+    def test_excludes_pending_file_handles(self, in_memory_db):
+        """Rows whose source handle is still pending are filtered out."""
+        conn, cursor = in_memory_db
+        self._insert_indexed(
+            cursor, "syn2", MigrationType.FILE, version=1, from_fh="fh_pending"
+        )
+        self._insert_indexed(
+            cursor, "syn3", MigrationType.FILE, version=1, from_fh="fh_ok"
+        )
+        conn.commit()
+
+        start_key = MigrationKey("", MigrationType.FILE)
+        results = _query_migration_batch(
+            cursor, start_key, {"fh_pending"}, set(), limit=10
+        )
+        ids = [r["id"] for r in results]
+        assert "syn2" not in ids
+        assert "syn3" in ids
+
+    def test_respects_limit(self, in_memory_db):
+        """At most `limit` rows are returned per batch."""
+        conn, cursor = in_memory_db
+        for i in range(5):
+            self._insert_indexed(
+                cursor, f"syn{i+10}", MigrationType.FILE, version=1, from_fh=f"fh_{i}"
+            )
+        conn.commit()
+
+        start_key = MigrationKey("", MigrationType.FILE)
+        results = _query_migration_batch(cursor, start_key, set(), set(), limit=2)
+        assert len(results) <= 2
+
+
+# =============================================================================
+# _verify_storage_location_ownership_async
+# =============================================================================
+
+
+class TestVerifyStorageLocationOwnershipAsync:
+ @pytest.mark.asyncio
+ async def test_success(self):
+ client = _make_mock_client()
+ client.rest_get_async.return_value = {"storageLocationId": "99"}
+ # Should not raise
+ await _verify_storage_location_ownership_async("99", synapse_client=client)
+ client.rest_get_async.assert_awaited_once_with("/storageLocation/99")
+
+ @pytest.mark.asyncio
+ async def test_synapse_error_raises_value_error(self):
+ client = _make_mock_client()
+ client.rest_get_async.side_effect = SynapseError("forbidden")
+ with pytest.raises(ValueError, match="Unable to verify ownership"):
+ await _verify_storage_location_ownership_async("99", synapse_client=client)
+
+
+# =============================================================================
+# _get_version_numbers_async
+# =============================================================================
+
+
+class TestGetVersionNumbersAsync:
+    """Paging of entity version numbers through rest_get_paginated_async."""
+
+    @pytest.mark.asyncio
+    async def test_yields_version_numbers(self):
+        """Each paginated record's versionNumber is yielded in order."""
+        client = _make_mock_client()
+        pages = [{"versionNumber": 3}, {"versionNumber": 2}, {"versionNumber": 1}]
+
+        async def _mock_paginated(path, *, synapse_client):
+            for p in pages:
+                yield p
+
+        with patch(f"{MODULE}.rest_get_paginated_async", _mock_paginated):
+            versions = [v async for v in _get_version_numbers_async("syn1", client)]
+
+        assert versions == [3, 2, 1]
+
+    @pytest.mark.asyncio
+    async def test_empty_yields_nothing(self):
+        """An empty page stream produces no versions."""
+        client = _make_mock_client()
+
+        async def _mock_paginated(path, *, synapse_client):
+            # The unreachable `yield` after `return` is deliberate: its mere
+            # presence makes this function an async generator, while the bare
+            # return terminates it immediately without producing any items.
+            return
+            yield  # make it an async generator
+
+        with patch(f"{MODULE}.rest_get_paginated_async", _mock_paginated):
+            versions = [v async for v in _get_version_numbers_async("syn1", client)]
+
+        assert versions == []
+
+
+# =============================================================================
+# index_files_for_migration_async – validation
+# =============================================================================
+
+
+class TestIndexFilesForMigrationAsyncValidation:
+    """Input validation and result handling of index_files_for_migration_async."""
+
+    @pytest.mark.asyncio
+    async def test_invalid_file_version_strategy_raises(self):
+        """Unknown version strategies are rejected up front."""
+        client = _make_mock_client()
+        with patch(f"{MODULE}.Synapse.get_client", return_value=client):
+            with pytest.raises(ValueError, match="Invalid file_version_strategy"):
+                await index_files_for_migration_async(
+                    entity="syn1",
+                    dest_storage_location_id="99",
+                    file_version_strategy="bogus",
+                    synapse_client=client,
+                )
+
+    @pytest.mark.asyncio
+    async def test_skip_strategy_with_no_table_files_raises(self):
+        """Skipping file versions while excluding tables leaves nothing to do."""
+        client = _make_mock_client()
+        with patch(f"{MODULE}.Synapse.get_client", return_value=client):
+            with pytest.raises(ValueError, match="nothing to migrate"):
+                await index_files_for_migration_async(
+                    entity="syn1",
+                    dest_storage_location_id="99",
+                    file_version_strategy="skip",
+                    include_table_files=False,
+                    synapse_client=client,
+                )
+
+    @pytest.mark.asyncio
+    async def test_ownership_failure_raises(self):
+        """A storage-location ownership failure surfaces as a ValueError."""
+        client = _make_mock_client()
+        client.rest_get_async.side_effect = SynapseError("forbidden")
+
+        with patch(f"{MODULE}.Synapse.get_client", return_value=client), patch(
+            f"{MODULE}.utils.id_of", return_value="syn1"
+        ):
+            with pytest.raises(ValueError, match="Unable to verify ownership"):
+                await index_files_for_migration_async(
+                    entity="syn1",
+                    dest_storage_location_id="99",
+                    synapse_client=client,
+                )
+
+    @pytest.mark.asyncio
+    async def test_successful_indexing_returns_migration_result(self):
+        """Successful indexing yields a MigrationResult bound to the db path."""
+        client = _make_mock_client()
+        entity = _make_entity("syn3")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            db_path = os.path.join(tmpdir, "test.db")
+
+            # Ownership check and the actual indexing are stubbed out so only
+            # the orchestration/return value is exercised.
+            with (
+                patch(f"{MODULE}.Synapse.get_client", return_value=client),
+                patch(f"{MODULE}.utils.id_of", return_value="syn3"),
+                patch(
+                    f"{MODULE}._verify_storage_location_ownership_async",
+                    new=AsyncMock(),
+                ),
+                patch(f"{MODULE}._index_entity_async", new=AsyncMock()),
+            ):
+                result = await index_files_for_migration_async(
+                    entity=entity,
+                    dest_storage_location_id="99",
+                    db_path=db_path,
+                    synapse_client=client,
+                )
+
+            assert isinstance(result, MigrationResult)
+            assert result.db_path == db_path
+
+    @pytest.mark.asyncio
+    async def test_indexing_error_is_reraised(self):
+        """An IndexingError's underlying cause is re-raised to the caller."""
+        client = _make_mock_client()
+        entity = _make_entity("syn3")
+        underlying = RuntimeError("network down")
+        indexing_err = IndexingError("syn3", concrete_types.FILE_ENTITY)
+        # Simulate `raise IndexingError(...) from underlying`.
+        indexing_err.__cause__ = underlying
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            db_path = os.path.join(tmpdir, "test.db")
+            with (
+                patch(f"{MODULE}.Synapse.get_client", return_value=client),
+                patch(f"{MODULE}.utils.id_of", return_value="syn3"),
+                patch(
+                    f"{MODULE}._verify_storage_location_ownership_async",
+                    new=AsyncMock(),
+                ),
+                patch(f"{MODULE}._index_entity_async", side_effect=indexing_err),
+            ):
+                with pytest.raises(RuntimeError, match="network down"):
+                    await index_files_for_migration_async(
+                        entity=entity,
+                        dest_storage_location_id="99",
+                        db_path=db_path,
+                        synapse_client=client,
+                    )
+
+
+# =============================================================================
+# _index_entity_async
+# =============================================================================
+
+
+class TestIndexEntityAsync:
+    def _common_kwargs(self, conn, cursor, client, entity_id="syn3"):
+        # Baseline keyword arguments for _index_entity_async; individual
+        # tests override entries (e.g. file_version_strategy,
+        # include_table_files) before the call.
+        return dict(
+            conn=conn,
+            cursor=cursor,
+            entity=entity_id,
+            parent_id="syn1",
+            dest_storage_location_id="99",
+            source_storage_location_ids=[],
+            file_version_strategy="new",
+            include_table_files=False,
+            continue_on_error=False,
+            synapse_client=client,
+        )
+
+ def _mock_entity_type(self, concrete_type):
+ et = MagicMock()
+ et.type = concrete_type
+ return et
+
+    @pytest.mark.asyncio
+    async def test_routes_file_entity(self, in_memory_db):
+        """A FileEntity is dispatched to _index_file_entity_async."""
+        conn, cursor = in_memory_db
+        client = _make_mock_client()
+
+        with (
+            patch(f"{MODULE}.utils.id_of", return_value="syn3"),
+            patch(
+                f"{MODULE}.get_entity_type",
+                new=AsyncMock(
+                    return_value=self._mock_entity_type(concrete_types.FILE_ENTITY)
+                ),
+            ),
+            patch(
+                f"{MODULE}._index_file_entity_async", new=AsyncMock()
+            ) as mock_index_file,
+        ):
+            await _index_entity_async(**self._common_kwargs(conn, cursor, client))
+
+        mock_index_file.assert_awaited_once()
+
+    @pytest.mark.asyncio
+    async def test_skips_file_entity_when_strategy_is_skip(self, in_memory_db):
+        """file_version_strategy='skip' means file entities are not indexed."""
+        conn, cursor = in_memory_db
+        client = _make_mock_client()
+        kwargs = self._common_kwargs(conn, cursor, client)
+        kwargs["file_version_strategy"] = "skip"
+
+        with (
+            patch(f"{MODULE}.utils.id_of", return_value="syn3"),
+            patch(
+                f"{MODULE}.get_entity_type",
+                new=AsyncMock(
+                    return_value=self._mock_entity_type(concrete_types.FILE_ENTITY)
+                ),
+            ),
+            patch(
+                f"{MODULE}._index_file_entity_async", new=AsyncMock()
+            ) as mock_index_file,
+        ):
+            await _index_entity_async(**kwargs)
+
+        mock_index_file.assert_not_awaited()
+
+    @pytest.mark.asyncio
+    async def test_routes_table_entity_when_include_tables(self, in_memory_db):
+        """With include_table_files=True a table entity is dispatched."""
+        conn, cursor = in_memory_db
+        client = _make_mock_client()
+        kwargs = self._common_kwargs(conn, cursor, client)
+        kwargs["include_table_files"] = True
+
+        with (
+            patch(f"{MODULE}.utils.id_of", return_value="syn5"),
+            patch(
+                f"{MODULE}.get_entity_type",
+                new=AsyncMock(
+                    return_value=self._mock_entity_type(concrete_types.TABLE_ENTITY)
+                ),
+            ),
+            patch(
+                f"{MODULE}._index_table_entity_async", new=AsyncMock()
+            ) as mock_index_table,
+        ):
+            await _index_entity_async(**kwargs)
+
+        mock_index_table.assert_awaited_once()
+
+ @pytest.mark.asyncio
+ async def test_skips_table_entity_when_include_tables_false(self, in_memory_db):
+ conn, cursor = in_memory_db
+ client = _make_mock_client()
+
+ with (
+ patch(f"{MODULE}.utils.id_of", return_value="syn5"),
+ patch(
+ f"{MODULE}.get_entity_type",
+ new=AsyncMock(
+ return_value=self._mock_entity_type(concrete_types.TABLE_ENTITY)
+ ),
+ ),
+ patch(
+ f"{MODULE}._index_table_entity_async", new=AsyncMock()
+ ) as mock_index_table,
+ ):
+ await _index_entity_async(**self._common_kwargs(conn, cursor, client))
+
+ mock_index_table.assert_not_awaited()
+
+ @pytest.mark.asyncio
+ async def test_routes_folder_entity(self, in_memory_db):
+ conn, cursor = in_memory_db
+ client = _make_mock_client()
+
+ with (
+ patch(f"{MODULE}.utils.id_of", return_value="syn2"),
+ patch(
+ f"{MODULE}.get_entity_type",
+ new=AsyncMock(
+ return_value=self._mock_entity_type(concrete_types.FOLDER_ENTITY)
+ ),
+ ),
+ patch(
+ f"{MODULE}._index_container_async", new=AsyncMock()
+ ) as mock_container,
+ ):
+ await _index_entity_async(**self._common_kwargs(conn, cursor, client))
+
+ mock_container.assert_awaited_once()
+
+ @pytest.mark.asyncio
+ async def test_routes_project_entity(self, in_memory_db):
+ conn, cursor = in_memory_db
+ client = _make_mock_client()
+
+ with (
+ patch(f"{MODULE}.utils.id_of", return_value="syn1"),
+ patch(
+ f"{MODULE}.get_entity_type",
+ new=AsyncMock(
+ return_value=self._mock_entity_type(concrete_types.PROJECT_ENTITY)
+ ),
+ ),
+ patch(
+ f"{MODULE}._index_container_async", new=AsyncMock()
+ ) as mock_container,
+ ):
+ await _index_entity_async(**self._common_kwargs(conn, cursor, client))
+
+ mock_container.assert_awaited_once()
+
+ @pytest.mark.asyncio
+ async def test_skips_already_indexed_entity(self, in_memory_db):
+ conn, cursor = in_memory_db
+ client = _make_mock_client()
+ # Pre-insert the entity as indexed
+ cursor.execute(
+ "INSERT INTO migrations (id, type, status) VALUES (?, ?, ?)",
+ ("syn3", MigrationType.FILE.value, MigrationStatus.INDEXED.value),
+ )
+ conn.commit()
+
+ with (
+ patch(f"{MODULE}.utils.id_of", return_value="syn3"),
+ patch(
+ f"{MODULE}.get_entity_type",
+ new=AsyncMock(
+ return_value=self._mock_entity_type(concrete_types.FILE_ENTITY)
+ ),
+ ),
+ patch(
+ f"{MODULE}._index_file_entity_async", new=AsyncMock()
+ ) as mock_index_file,
+ ):
+ await _index_entity_async(**self._common_kwargs(conn, cursor, client))
+
+ mock_index_file.assert_not_awaited()
+
+ @pytest.mark.asyncio
+ async def test_error_without_continue_raises_indexing_error(self, in_memory_db):
+ conn, cursor = in_memory_db
+ client = _make_mock_client()
+
+ with (
+ patch(f"{MODULE}.utils.id_of", return_value="syn3"),
+ patch(
+ f"{MODULE}.get_entity_type",
+ new=AsyncMock(
+ return_value=self._mock_entity_type(concrete_types.FILE_ENTITY)
+ ),
+ ),
+ patch(
+ f"{MODULE}._index_file_entity_async", side_effect=RuntimeError("boom")
+ ),
+ ):
+ with pytest.raises(IndexingError):
+ await _index_entity_async(**self._common_kwargs(conn, cursor, client))
+
+ @pytest.mark.asyncio
+ async def test_error_with_continue_records_error(self, in_memory_db):
+ conn, cursor = in_memory_db
+ client = _make_mock_client()
+ kwargs = self._common_kwargs(conn, cursor, client)
+ kwargs["continue_on_error"] = True
+
+ with (
+ patch(f"{MODULE}.utils.id_of", return_value="syn3"),
+ patch(
+ f"{MODULE}.get_entity_type",
+ new=AsyncMock(
+ return_value=self._mock_entity_type(concrete_types.FILE_ENTITY)
+ ),
+ ),
+ patch(
+ f"{MODULE}._index_file_entity_async",
+ side_effect=RuntimeError("transient"),
+ ),
+ ):
+ # Should not raise
+ await _index_entity_async(**kwargs)
+
+ row = cursor.execute("SELECT status FROM migrations WHERE id='syn3'").fetchone()
+ assert row[0] == MigrationStatus.ERRORED.value
+
+
+# =============================================================================
+# _index_file_entity_async
+# =============================================================================
+
+
+class TestIndexFileEntityAsync:
+    """Unit tests for `_index_file_entity_async`.
+
+    Covers the three version strategies ("new", "latest", "all"), the
+    ALREADY_MIGRATED status for files whose storage location already matches
+    the destination, and filtering by `source_storage_location_ids`.
+    """
+
+    def _make_cursor(self):
+        # Fresh in-memory SQLite database with the migrations schema applied.
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        conn.commit()
+        return conn, cursor
+
+    @pytest.mark.asyncio
+    async def test_new_strategy_inserts_with_none_version(self):
+        """Strategy "new" records the file with version=None and INDEXED."""
+        conn, cursor = self._make_cursor()
+        client = _make_mock_client()
+        fh = _make_file_handle(storage_location_id="10")
+        entity = _make_entity("syn3", file_handle=fh)
+
+        with patch(f"{MODULE}.utils.id_of", return_value="syn3"):
+            await _index_file_entity_async(
+                cursor=cursor,
+                entity=entity,
+                parent_id="syn1",
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                file_version_strategy="new",
+                synapse_client=client,
+            )
+
+        conn.commit()
+        row = cursor.execute(
+            "SELECT id, version, status FROM migrations WHERE id='syn3'"
+        ).fetchone()
+        assert row is not None
+        assert row[1] is None  # new strategy → version=None
+        assert row[2] == MigrationStatus.INDEXED.value
+
+    @pytest.mark.asyncio
+    async def test_latest_strategy_inserts_with_version_number(self):
+        """Strategy "latest" records the entity's current version number."""
+        conn, cursor = self._make_cursor()
+        client = _make_mock_client()
+        fh = _make_file_handle(storage_location_id="10")
+        entity = _make_entity("syn3", version_number=5, file_handle=fh)
+
+        with patch(f"{MODULE}.utils.id_of", return_value="syn3"):
+            await _index_file_entity_async(
+                cursor=cursor,
+                entity=entity,
+                parent_id="syn1",
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                file_version_strategy="latest",
+                synapse_client=client,
+            )
+
+        conn.commit()
+        row = cursor.execute(
+            "SELECT version FROM migrations WHERE id='syn3'"
+        ).fetchone()
+        assert row[0] == 5
+
+    @pytest.mark.asyncio
+    async def test_all_strategy_inserts_each_version(self):
+        """Strategy "all" creates one row per version of the entity."""
+        conn, cursor = self._make_cursor()
+        client = _make_mock_client()
+        fh = _make_file_handle(storage_location_id="10")
+        entity = _make_entity("syn3", file_handle=fh)
+
+        async def _mock_versions(entity_id, syn_client):
+            # Pretend the entity has exactly three versions.
+            for v in [1, 2, 3]:
+                yield v
+
+        with (
+            patch(f"{MODULE}.utils.id_of", return_value="syn3"),
+            patch(f"{MODULE}._get_version_numbers_async", _mock_versions),
+            patch(f"{MODULE}.get_async", new=AsyncMock(return_value=entity)),
+        ):
+            await _index_file_entity_async(
+                cursor=cursor,
+                entity=entity,
+                parent_id="syn1",
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                file_version_strategy="all",
+                synapse_client=client,
+            )
+
+        conn.commit()
+        count = cursor.execute(
+            "SELECT count(*) FROM migrations WHERE id='syn3'"
+        ).fetchone()[0]
+        assert count == 3
+
+    @pytest.mark.asyncio
+    async def test_already_migrated_file_skipped(self):
+        """A file already at the destination location is recorded with
+        ALREADY_MIGRATED status rather than queued for migration."""
+        conn, cursor = self._make_cursor()
+        client = _make_mock_client()
+        # storage_location_id matches dest → ALREADY_MIGRATED → should still insert
+        fh = _make_file_handle(storage_location_id="99")
+        entity = _make_entity("syn3", file_handle=fh)
+
+        with patch(f"{MODULE}.utils.id_of", return_value="syn3"):
+            await _index_file_entity_async(
+                cursor=cursor,
+                entity=entity,
+                parent_id="syn1",
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                file_version_strategy="new",
+                synapse_client=client,
+            )
+
+        conn.commit()
+        row = cursor.execute("SELECT status FROM migrations WHERE id='syn3'").fetchone()
+        assert row[0] == MigrationStatus.ALREADY_MIGRATED.value
+
+    @pytest.mark.asyncio
+    async def test_source_filter_excludes_non_matching(self):
+        """When a source filter is given, files stored elsewhere are not
+        indexed at all (no row inserted)."""
+        conn, cursor = self._make_cursor()
+        client = _make_mock_client()
+        fh = _make_file_handle(storage_location_id="99")  # not in source list
+        entity = _make_entity("syn3", file_handle=fh)
+
+        with patch(f"{MODULE}.utils.id_of", return_value="syn3"):
+            await _index_file_entity_async(
+                cursor=cursor,
+                entity=entity,
+                parent_id="syn1",
+                dest_storage_location_id="20",
+                source_storage_location_ids=["10"],  # "99" not in list
+                file_version_strategy="new",
+                synapse_client=client,
+            )
+
+        conn.commit()
+        count = cursor.execute(
+            "SELECT count(*) FROM migrations WHERE id='syn3'"
+        ).fetchone()[0]
+        assert count == 0
+
+
+# =============================================================================
+# _get_table_file_handle_rows_async
+# =============================================================================
+
+
+class TestGetTableFileHandleRowsAsync:
+    """Unit tests for the `_get_table_file_handle_rows_async` generator.
+
+    Verifies that tables without FILEHANDLEID columns yield nothing, and
+    that file-handle columns produce (row_id, row_version, {col_id: handle})
+    tuples resolved via `get_file_handle_for_download_async`.
+    """
+
+    @pytest.mark.asyncio
+    async def test_no_file_handle_columns_yields_nothing(self):
+        client = _make_mock_client()
+        col = MagicMock()
+        col.column_type = "STRING"  # not FILEHANDLEID
+
+        with patch(f"{MODULE}.get_columns", new=AsyncMock(return_value=[col])):
+            rows = [
+                r
+                async for r in _get_table_file_handle_rows_async(
+                    "syn5", synapse_client=client
+                )
+            ]
+
+        assert rows == []
+
+    @pytest.mark.asyncio
+    async def test_file_handle_columns_yields_rows(self):
+        client = _make_mock_client()
+        col = MagicMock()
+        # The code under test reads the column id both by subscript and
+        # attribute, so both access paths are stubbed here.
+        col.column_type = "FILEHANDLEID"
+        col.__getitem__ = MagicMock(
+            side_effect=lambda k: "col_42" if k == "id" else None
+        )
+        col.id = "col_42"
+
+        fh = _make_file_handle()
+
+        # Row: [row_id, row_version, file_handle_id]
+        query_results = [[1, 2, "fh_abc"]]
+
+        # query_async is TYPE_CHECKING-only import so patch requires create=True
+        with (
+            patch(f"{MODULE}.get_columns", new=AsyncMock(return_value=[col])),
+            patch(
+                f"{MODULE}.query_async",
+                new=AsyncMock(return_value=query_results),
+                create=True,
+            ),
+            patch(
+                f"{MODULE}.get_file_handle_for_download_async",
+                new=AsyncMock(return_value={"fileHandle": fh}),
+            ),
+        ):
+            rows = [
+                r
+                async for r in _get_table_file_handle_rows_async(
+                    "syn5", synapse_client=client
+                )
+            ]
+
+        assert len(rows) == 1
+        row_id, row_version, file_handles = rows[0]
+        assert row_id == 1
+        assert row_version == 2
+        assert "col_42" in file_handles
+
+
+# =============================================================================
+# _index_table_entity_async
+# =============================================================================
+
+
+class TestIndexTableEntityAsync:
+    """Unit tests for `_index_table_entity_async`.
+
+    Verifies that table-attached S3 file handles are inserted into the
+    migrations database with row/column coordinates, and that non-S3
+    (external) file handles are skipped.
+    """
+
+    @pytest.mark.asyncio
+    async def test_inserts_table_file_entries(self):
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        conn.commit()
+
+        client = _make_mock_client()
+        fh = _make_file_handle(storage_location_id="10", fh_id="fh_t1")
+
+        async def _mock_rows(entity_id, *, synapse_client):
+            # One table row (row_id=1, row_version=2) with a single
+            # file-handle column "col_7".
+            yield 1, 2, {"col_7": fh}
+
+        with patch(f"{MODULE}._get_table_file_handle_rows_async", _mock_rows):
+            await _index_table_entity_async(
+                cursor=cursor,
+                entity_id="syn5",
+                parent_id="syn1",
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                synapse_client=client,
+            )
+
+        conn.commit()
+        row = cursor.execute(
+            "SELECT id, type, row_id, col_id FROM migrations WHERE id='syn5'"
+        ).fetchone()
+        assert row is not None
+        assert row[1] == MigrationType.TABLE_ATTACHED_FILE.value
+        assert row[2] == 1  # row_id
+        assert row[3] == "col_7"  # col_id
+
+    @pytest.mark.asyncio
+    async def test_skips_non_s3_file_handles(self):
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        conn.commit()
+
+        client = _make_mock_client()
+        # ExternalFileHandle (not S3) → must not be indexed for migration.
+        fh = _make_file_handle(
+            concrete_type="org.sagebionetworks.repo.model.file.ExternalFileHandle",
+            storage_location_id="10",
+        )
+
+        async def _mock_rows(entity_id, *, synapse_client):
+            yield 1, 2, {"col_7": fh}
+
+        with patch(f"{MODULE}._get_table_file_handle_rows_async", _mock_rows):
+            await _index_table_entity_async(
+                cursor=cursor,
+                entity_id="syn5",
+                parent_id="syn1",
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                synapse_client=client,
+            )
+
+        conn.commit()
+        count = cursor.execute("SELECT count(*) FROM migrations").fetchone()[0]
+        assert count == 0
+
+
+# =============================================================================
+# _index_container_async
+# =============================================================================
+
+
+class TestIndexContainerAsync:
+    """Unit tests for `_index_container_async`.
+
+    Verifies that a container's children are enumerated and indexed, that
+    the container itself is then marked as indexed, and that the child-type
+    filter passed to `get_children` honors `include_table_files` and
+    `file_version_strategy="skip"`.
+    """
+
+    @pytest.mark.asyncio
+    async def test_indexes_children_and_marks_container(self):
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        conn.commit()
+
+        client = _make_mock_client()
+        child_entity = _make_entity("syn3")
+
+        et = MagicMock()
+        et.type = concrete_types.PROJECT_ENTITY
+
+        async def _mock_get_children(parent, include_types, synapse_client):
+            # Single child in the container.
+            yield {"id": "syn3"}
+
+        with (
+            patch(f"{MODULE}.get_entity_type", new=AsyncMock(return_value=et)),
+            patch(f"{MODULE}.get_children", _mock_get_children),
+            patch(f"{MODULE}.get_async", new=AsyncMock(return_value=child_entity)),
+            patch(f"{MODULE}._index_entity_async", new=AsyncMock()) as mock_index,
+        ):
+            await _index_container_async(
+                conn=conn,
+                cursor=cursor,
+                entity_id="syn1",
+                parent_id=None,
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                file_version_strategy="new",
+                include_table_files=False,
+                continue_on_error=False,
+                synapse_client=client,
+            )
+
+        mock_index.assert_awaited_once()
+        # Container should be marked as indexed
+        row = cursor.execute("SELECT id FROM migrations WHERE id='syn1'").fetchone()
+        assert row is not None
+
+    @pytest.mark.asyncio
+    async def test_includes_table_type_when_flag_set(self):
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        conn.commit()
+
+        client = _make_mock_client()
+        et = MagicMock()
+        et.type = concrete_types.FOLDER_ENTITY
+
+        # Captures the include_types argument handed to get_children.
+        captured_types = []
+
+        async def _mock_get_children(parent, include_types, synapse_client):
+            captured_types.extend(include_types)
+            return
+            yield  # empty generator
+
+        with (
+            patch(f"{MODULE}.get_entity_type", new=AsyncMock(return_value=et)),
+            patch(f"{MODULE}.get_children", _mock_get_children),
+        ):
+            await _index_container_async(
+                conn=conn,
+                cursor=cursor,
+                entity_id="syn2",
+                parent_id="syn1",
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                file_version_strategy="new",
+                include_table_files=True,
+                continue_on_error=False,
+                synapse_client=client,
+            )
+
+        assert "table" in captured_types
+
+    @pytest.mark.asyncio
+    async def test_excludes_file_types_when_strategy_is_skip(self):
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        conn.commit()
+
+        client = _make_mock_client()
+        et = MagicMock()
+        et.type = concrete_types.FOLDER_ENTITY
+
+        captured_types = []
+
+        async def _mock_get_children(parent, include_types, synapse_client):
+            captured_types.extend(include_types)
+            return
+            yield
+
+        with (
+            patch(f"{MODULE}.get_entity_type", new=AsyncMock(return_value=et)),
+            patch(f"{MODULE}.get_children", _mock_get_children),
+        ):
+            await _index_container_async(
+                conn=conn,
+                cursor=cursor,
+                entity_id="syn2",
+                parent_id="syn1",
+                dest_storage_location_id="99",
+                source_storage_location_ids=[],
+                file_version_strategy="skip",
+                include_table_files=True,
+                continue_on_error=False,
+                synapse_client=client,
+            )
+
+        # Skipping files also removes folders from the traversal set.
+        assert "file" not in captured_types
+        assert "folder" not in captured_types
+
+
+# =============================================================================
+# _migrate_item_async
+# =============================================================================
+
+
+class TestMigrateItemAsync:
+    """Unit tests for `_migrate_item_async`.
+
+    Verifies the copy-then-update flow for plain files, versioned files and
+    table-attached files, reuse of an already-copied destination file
+    handle, and wrapping of failures in `MigrationError`.
+    """
+
+    @pytest.mark.asyncio
+    async def test_copies_file_handle_and_creates_new_version(self):
+        client = _make_mock_client()
+        key = MigrationKey("syn3", MigrationType.FILE, version=None)
+        semaphore = asyncio.Semaphore(10)
+
+        with (
+            patch(
+                f"{MODULE}.multipart_copy_async", new=AsyncMock(return_value="fh_new")
+            ),
+            patch(
+                f"{MODULE}._create_new_file_version_async", new=AsyncMock()
+            ) as mock_create,
+        ):
+            result = await _migrate_item_async(
+                key=key,
+                from_file_handle_id="fh_old",
+                to_file_handle_id=None,
+                file_size=1024,
+                dest_storage_location_id="99",
+                semaphore=semaphore,
+                synapse_client=client,
+            )
+
+        assert result["to_file_handle_id"] == "fh_new"
+        assert result["from_file_handle_id"] == "fh_old"
+        mock_create.assert_awaited_once()
+
+    @pytest.mark.asyncio
+    async def test_reuses_existing_file_handle(self):
+        """A non-None `to_file_handle_id` means the copy already happened on
+        a previous run; `multipart_copy_async` must not be called again."""
+        client = _make_mock_client()
+        key = MigrationKey("syn3", MigrationType.FILE, version=None)
+        semaphore = asyncio.Semaphore(10)
+
+        with (
+            patch(f"{MODULE}.multipart_copy_async", new=AsyncMock()) as mock_copy,
+            patch(f"{MODULE}._create_new_file_version_async", new=AsyncMock()),
+        ):
+            result = await _migrate_item_async(
+                key=key,
+                from_file_handle_id="fh_old",
+                to_file_handle_id="fh_existing",  # already copied
+                file_size=1024,
+                dest_storage_location_id="99",
+                semaphore=semaphore,
+                synapse_client=client,
+            )
+
+        mock_copy.assert_not_awaited()
+        assert result["to_file_handle_id"] == "fh_existing"
+
+    @pytest.mark.asyncio
+    async def test_migrates_versioned_file(self):
+        """A key with a concrete version routes to
+        `_migrate_file_version_async` rather than creating a new version."""
+        client = _make_mock_client()
+        key = MigrationKey("syn3", MigrationType.FILE, version=2)
+        semaphore = asyncio.Semaphore(10)
+
+        with (
+            patch(
+                f"{MODULE}.multipart_copy_async", new=AsyncMock(return_value="fh_new")
+            ),
+            patch(
+                f"{MODULE}._migrate_file_version_async", new=AsyncMock()
+            ) as mock_migrate_ver,
+        ):
+            await _migrate_item_async(
+                key=key,
+                from_file_handle_id="fh_old",
+                to_file_handle_id=None,
+                file_size=1024,
+                dest_storage_location_id="99",
+                semaphore=semaphore,
+                synapse_client=client,
+            )
+
+        mock_migrate_ver.assert_awaited_once()
+
+    @pytest.mark.asyncio
+    async def test_migrates_table_attached_file(self):
+        """A TABLE_ATTACHED_FILE key routes to
+        `_migrate_table_attached_file_async`."""
+        client = _make_mock_client()
+        key = MigrationKey(
+            "syn5", MigrationType.TABLE_ATTACHED_FILE, row_id=1, col_id=2
+        )
+        semaphore = asyncio.Semaphore(10)
+
+        with (
+            patch(
+                f"{MODULE}.multipart_copy_async", new=AsyncMock(return_value="fh_new")
+            ),
+            patch(
+                f"{MODULE}._migrate_table_attached_file_async", new=AsyncMock()
+            ) as mock_table,
+        ):
+            await _migrate_item_async(
+                key=key,
+                from_file_handle_id="fh_old",
+                to_file_handle_id=None,
+                file_size=512,
+                dest_storage_location_id="99",
+                semaphore=semaphore,
+                synapse_client=client,
+            )
+
+        mock_table.assert_awaited_once()
+
+    @pytest.mark.asyncio
+    async def test_exception_wrapped_as_migration_error(self):
+        """Failures during copy are surfaced as `MigrationError`, preserving
+        the migration key and the underlying error message."""
+        client = _make_mock_client()
+        key = MigrationKey("syn3", MigrationType.FILE, version=None)
+        semaphore = asyncio.Semaphore(10)
+
+        with patch(
+            f"{MODULE}.multipart_copy_async", side_effect=RuntimeError("S3 error")
+        ):
+            with pytest.raises(MigrationError) as exc_info:
+                await _migrate_item_async(
+                    key=key,
+                    from_file_handle_id="fh_old",
+                    to_file_handle_id=None,
+                    file_size=1024,
+                    dest_storage_location_id="99",
+                    semaphore=semaphore,
+                    synapse_client=client,
+                )
+
+        assert exc_info.value.key is key
+        assert "S3 error" in str(exc_info.value)
+
+
+# =============================================================================
+# _create_new_file_version_async
+# =============================================================================
+
+
+class TestCreateNewFileVersionAsync:
+    """Unit test for `_create_new_file_version_async`: the entity's
+    dataFileHandleId is swapped to the new handle and the entity stored."""
+
+    @pytest.mark.asyncio
+    async def test_sets_file_handle_and_stores(self):
+        client = _make_mock_client()
+        entity = _make_entity("syn3")
+
+        with (
+            patch(f"{MODULE}.Synapse.get_client", return_value=client),
+            patch(f"{MODULE}.get_async", new=AsyncMock(return_value=entity)),
+        ):
+            await _create_new_file_version_async(
+                entity_id="syn3",
+                to_file_handle_id="fh_new",
+                synapse_client=client,
+            )
+
+        # The fetched entity is mutated in place, then persisted.
+        assert entity.dataFileHandleId == "fh_new"
+        entity.store_async.assert_awaited_once()
+
+
+# =============================================================================
+# _migrate_file_version_async
+# =============================================================================
+
+
+class TestMigrateFileVersionAsync:
+    """Unit test for `_migrate_file_version_async`: a PUT to the versioned
+    filehandle endpoint with old/new handle ids in the JSON body."""
+
+    @pytest.mark.asyncio
+    async def test_calls_rest_put_with_correct_payload(self):
+        client = _make_mock_client()
+
+        with patch(f"{MODULE}.Synapse.get_client", return_value=client):
+            await _migrate_file_version_async(
+                entity_id="syn3",
+                version=2,
+                from_file_handle_id="fh_old",
+                to_file_handle_id="fh_new",
+                synapse_client=client,
+            )
+
+        client.rest_put_async.assert_awaited_once()
+        call_args = client.rest_put_async.call_args
+        # Positional arg 0 is the URI; the body is passed as a keyword.
+        assert call_args[0][0] == "/entity/syn3/version/2/filehandle"
+        body = json.loads(call_args[1]["body"])
+        assert body["oldFileHandleId"] == "fh_old"
+        assert body["newFileHandleId"] == "fh_new"
+
+
+# =============================================================================
+# _migrate_table_attached_file_async
+# =============================================================================
+
+
+class TestMigrateTableAttachedFileAsync:
+    """Unit test for `_migrate_table_attached_file_async`: the row/column
+    update is submitted as a `TableUpdateTransaction` job."""
+
+    @pytest.mark.asyncio
+    async def test_sends_transaction(self):
+        client = _make_mock_client()
+        key = MigrationKey(
+            "syn5", MigrationType.TABLE_ATTACHED_FILE, row_id=7, col_id=3
+        )
+
+        mock_transaction = MagicMock()
+        mock_transaction.send_job_and_wait_async = AsyncMock()
+
+        with patch(f"{MODULE}.TableUpdateTransaction", return_value=mock_transaction):
+            await _migrate_table_attached_file_async(
+                key=key,
+                to_file_handle_id="fh_new",
+                synapse_client=client,
+            )
+
+        mock_transaction.send_job_and_wait_async.assert_awaited_once()
+
+
+# =============================================================================
+# track_migration_results_async
+# =============================================================================
+
+
+class TestTrackMigrationResultsAsync:
+    """Unit tests for `track_migration_results_async`.
+
+    Verifies bookkeeping of completed migration tasks: successful tasks are
+    marked MIGRATED with their destination file handle recorded, failed
+    tasks are marked ERRORED (and re-raise the underlying cause when
+    `continue_on_error` is False), and the pending/completed tracking sets
+    are updated in both cases.
+    """
+
+    def _make_db(self, from_fh="fh_src", entity_id="syn3", version=1):
+        # In-memory migrations DB pre-populated with one INDEXED file row.
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        cursor.execute(
+            """INSERT INTO migrations (id, type, version, status, from_file_handle_id)
+            VALUES (?, ?, ?, ?, ?)""",
+            (
+                entity_id,
+                MigrationType.FILE.value,
+                version,
+                MigrationStatus.INDEXED.value,
+                from_fh,
+            ),
+        )
+        conn.commit()
+        return conn, cursor
+
+    @pytest.mark.asyncio
+    async def test_successful_task_marks_migrated(self):
+        conn, cursor = self._make_db()
+        key = MigrationKey("syn3", MigrationType.FILE, version=1)
+        from_fh = "fh_src"
+
+        async def _successful_migrate():
+            # Mimics the result dict produced by `_migrate_item_async`.
+            return {
+                "key": key,
+                "from_file_handle_id": from_fh,
+                "to_file_handle_id": "fh_dst",
+            }
+
+        task = asyncio.create_task(_successful_migrate())
+        await asyncio.sleep(0)  # let it complete
+
+        pending_fh = {from_fh}
+        completed_fh = set()
+        pending_keys = {key}
+
+        await track_migration_results_async(
+            conn=conn,
+            cursor=cursor,
+            active_tasks={task},
+            pending_file_handles=pending_fh,
+            completed_file_handles=completed_fh,
+            pending_keys=pending_keys,
+            return_when=asyncio.ALL_COMPLETED,
+            continue_on_error=False,
+        )
+
+        row = cursor.execute(
+            "SELECT status, to_file_handle_id FROM migrations WHERE id='syn3'"
+        ).fetchone()
+        assert row[0] == MigrationStatus.MIGRATED.value
+        assert row[1] == "fh_dst"
+        # The source handle moves from pending to completed; the key is
+        # removed from the pending set.
+        assert from_fh in completed_fh
+        assert key not in pending_keys
+
+    @pytest.mark.asyncio
+    async def test_failed_task_marks_errored(self):
+        conn, cursor = self._make_db()
+        key = MigrationKey("syn3", MigrationType.FILE, version=1)
+        from_fh = "fh_src"
+        inner_error = RuntimeError("network")
+
+        async def _failing_migrate():
+            # A MigrationError with an attached __cause__, as raised by
+            # `_migrate_item_async`.
+            err = MigrationError(key, from_fh)
+            err.__cause__ = inner_error
+            raise err
+
+        task = asyncio.create_task(_failing_migrate())
+        await asyncio.sleep(0)
+
+        pending_fh = {from_fh}
+        completed_fh = set()
+        pending_keys = {key}
+
+        await track_migration_results_async(
+            conn=conn,
+            cursor=cursor,
+            active_tasks={task},
+            pending_file_handles=pending_fh,
+            completed_file_handles=completed_fh,
+            pending_keys=pending_keys,
+            return_when=asyncio.ALL_COMPLETED,
+            continue_on_error=True,  # don't re-raise
+        )
+
+        row = cursor.execute("SELECT status FROM migrations WHERE id='syn3'").fetchone()
+        assert row[0] == MigrationStatus.ERRORED.value
+        assert from_fh in completed_fh
+
+    @pytest.mark.asyncio
+    async def test_failed_task_reraises_when_not_continue_on_error(self):
+        """With continue_on_error=False the *underlying* cause propagates
+        out of the tracker, not the MigrationError wrapper."""
+        conn, cursor = self._make_db()
+        key = MigrationKey("syn3", MigrationType.FILE, version=1)
+        from_fh = "fh_src"
+        inner_error = RuntimeError("critical failure")
+
+        async def _failing_migrate():
+            err = MigrationError(key, from_fh)
+            err.__cause__ = inner_error
+            raise err
+
+        task = asyncio.create_task(_failing_migrate())
+        await asyncio.sleep(0)
+
+        with pytest.raises(RuntimeError, match="critical failure"):
+            await track_migration_results_async(
+                conn=conn,
+                cursor=cursor,
+                active_tasks={task},
+                pending_file_handles={from_fh},
+                completed_file_handles=set(),
+                pending_keys={key},
+                return_when=asyncio.ALL_COMPLETED,
+                continue_on_error=False,
+            )
+
+
+# =============================================================================
+# migrate_indexed_files_async
+# =============================================================================
+
+
+class TestMigrateIndexedFilesAsync:
+    """Unit tests for the public `migrate_indexed_files_async` entry point.
+
+    Covers the missing-settings error, the non-interactive confirmation
+    bail-out (returns None), and the successful path returning a
+    `MigrationResult` bound to the database path.
+    """
+
+    @pytest.mark.asyncio
+    async def test_raises_if_no_settings_in_db(self):
+        # A schema-only DB file with no stored index settings is invalid.
+        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+            path = f.name
+        try:
+            with sqlite3.connect(path) as conn:
+                cursor = conn.cursor()
+                _ensure_schema(cursor)
+                conn.commit()
+
+            client = _make_mock_client()
+            with patch(f"{MODULE}.Synapse.get_client", return_value=client):
+                with pytest.raises(
+                    ValueError, match="Unable to retrieve existing index settings"
+                ):
+                    await migrate_indexed_files_async(
+                        db_path=path, synapse_client=client
+                    )
+        finally:
+            os.unlink(path)
+
+    @pytest.mark.asyncio
+    async def test_returns_none_when_migration_not_confirmed(
+        self, db_file_with_settings
+    ):
+        path, _ = db_file_with_settings
+        # Add an indexed row so there's something to confirm
+        with sqlite3.connect(path) as conn:
+            cursor = conn.cursor()
+            cursor.execute(
+                "INSERT INTO migrations (id, type, status) VALUES (?, ?, ?)",
+                ("syn3", MigrationType.FILE.value, MigrationStatus.INDEXED.value),
+            )
+            conn.commit()
+
+        client = _make_mock_client()
+        with (
+            patch(f"{MODULE}.Synapse.get_client", return_value=client),
+            patch("sys.stdout") as mock_stdout,
+        ):
+            # Non-interactive stdout + force=False → confirmation fails.
+            mock_stdout.isatty.return_value = False
+            result = await migrate_indexed_files_async(
+                db_path=path,
+                force=False,
+                synapse_client=client,
+            )
+
+        assert result is None
+
+    @pytest.mark.asyncio
+    async def test_returns_migration_result_on_success(self, db_file_with_settings):
+        path, _ = db_file_with_settings
+        client = _make_mock_client()
+
+        with (
+            patch(f"{MODULE}.Synapse.get_client", return_value=client),
+            patch(f"{MODULE}._execute_migration_async", new=AsyncMock()),
+        ):
+            result = await migrate_indexed_files_async(
+                db_path=path,
+                force=True,
+                synapse_client=client,
+            )
+
+        assert isinstance(result, MigrationResult)
+        assert result.db_path == path
+
+
+# =============================================================================
+# _execute_migration_async
+# =============================================================================
+
+
+class TestExecuteMigrationAsync:
+    """Unit tests for `_execute_migration_async`.
+
+    Verifies the end-to-end drain of INDEXED rows: a single item is migrated
+    and marked MIGRATED, an empty database completes without error, and a
+    failing item is recorded as ERRORED when `continue_on_error=True`.
+    """
+
+    def _make_db_with_indexed_file(self, from_fh="fh_src", entity_id="syn3", version=1):
+        # In-memory migrations DB containing one INDEXED file ready to move.
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        cursor.execute(
+            """INSERT INTO migrations (id, type, version, status, from_file_handle_id, file_size)
+            VALUES (?, ?, ?, ?, ?, ?)""",
+            (
+                entity_id,
+                MigrationType.FILE.value,
+                version,
+                MigrationStatus.INDEXED.value,
+                from_fh,
+                1024,
+            ),
+        )
+        conn.commit()
+        return conn, cursor
+
+    @pytest.mark.asyncio
+    async def test_migrates_single_item(self):
+        conn, cursor = self._make_db_with_indexed_file()
+        client = _make_mock_client()
+
+        key = MigrationKey("syn3", MigrationType.FILE, version=1)
+
+        async def _mock_migrate_item(
+            key,
+            from_file_handle_id,
+            to_file_handle_id,
+            file_size,
+            dest_storage_location_id,
+            semaphore,
+            *,
+            synapse_client,
+        ):
+            # Successful migration result, same shape as `_migrate_item_async`.
+            return {
+                "key": key,
+                "from_file_handle_id": from_file_handle_id,
+                "to_file_handle_id": "fh_dst",
+            }
+
+        with patch(f"{MODULE}._migrate_item_async", _mock_migrate_item):
+            await _execute_migration_async(
+                conn=conn,
+                cursor=cursor,
+                dest_storage_location_id="99",
+                create_table_snapshots=False,
+                continue_on_error=False,
+                synapse_client=client,
+            )
+
+        row = cursor.execute("SELECT status FROM migrations WHERE id='syn3'").fetchone()
+        assert row[0] == MigrationStatus.MIGRATED.value
+
+    @pytest.mark.asyncio
+    async def test_empty_db_completes_without_error(self):
+        """No INDEXED rows → the migration loop is a no-op."""
+        conn = sqlite3.connect(":memory:")
+        cursor = conn.cursor()
+        _ensure_schema(cursor)
+        conn.commit()
+
+        client = _make_mock_client()
+
+        await _execute_migration_async(
+            conn=conn,
+            cursor=cursor,
+            dest_storage_location_id="99",
+            create_table_snapshots=False,
+            continue_on_error=False,
+            synapse_client=client,
+        )
+
+    @pytest.mark.asyncio
+    async def test_continue_on_error_records_failure(self):
+        conn, cursor = self._make_db_with_indexed_file()
+        client = _make_mock_client()
+
+        key = MigrationKey("syn3", MigrationType.FILE, version=1)
+
+        async def _failing_migrate(
+            key,
+            from_file_handle_id,
+            to_file_handle_id,
+            file_size,
+            dest_storage_location_id,
+            semaphore,
+            *,
+            synapse_client,
+        ):
+            err = MigrationError(key, from_file_handle_id)
+            err.__cause__ = RuntimeError("disk full")
+            raise err
+
+        with patch(f"{MODULE}._migrate_item_async", _failing_migrate):
+            await _execute_migration_async(
+                conn=conn,
+                cursor=cursor,
+                dest_storage_location_id="99",
+                create_table_snapshots=False,
+                continue_on_error=True,
+                synapse_client=client,
+            )
+
+        row = cursor.execute("SELECT status FROM migrations WHERE id='syn3'").fetchone()
+        assert row[0] == MigrationStatus.ERRORED.value