-
Notifications
You must be signed in to change notification settings - Fork 440
Open
Description
Apache Iceberg version
0.10.0 (latest release)
Please describe the bug 🐞
Projected reads on a partitioned table can fail with:
ValueError: Could not find field with id: 2
This occurs when scanning with selected_fields that exclude the partition source column (for example projecting only field1 while the table is partitioned by day(timestamp)).
Expected behavior
A projected read should succeed even if the projection does not include partition source columns.
Actual behavior
table.scan(..., selected_fields=(...)).to_arrow() fails in PyIceberg planning/execution with:
ValueError: Could not find field with id: 2
Environment
- Python: 3.14.2
- pyiceberg: 0.10.0
- pyiceberg-core: 0.6.0
- backend: SQL catalog
- IO impl: pyiceberg.io.pyarrow.PyArrowFileIO
Repro
from datetime import UTC, datetime
from pathlib import Path
import tempfile
import pyarrow as pa
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import AlwaysTrue
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.transforms import DayTransform
from pyiceberg.types import NestedField, StringType, TimestamptzType
# Reproduction script. Everything runs inside the temporary directory's
# lifetime because both the SQLite catalog file and the warehouse live under it.
with tempfile.TemporaryDirectory(prefix="pyiceberg-repro-") as tmp:
    tmp_path = Path(tmp)
    db_path = tmp_path / "catalog.db"
    warehouse_path = tmp_path / "warehouse"
    warehouse_path.mkdir(parents=True, exist_ok=True)

    # SQL catalog backed by a throwaway SQLite database, using the PyArrow FileIO.
    catalog = load_catalog(
        "default",
        **{
            "type": "sql",
            "uri": f"sqlite:///{db_path}",
            "warehouse": warehouse_path.as_uri(),
            "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        },
    )
    catalog.create_namespace_if_not_exists("ns")

    # Initial schema: the partition source column (id 1) plus one data column (id 2).
    schema_v1 = Schema(
        NestedField(field_id=1, name="timestamp", field_type=TimestamptzType(), required=True),
        NestedField(field_id=2, name="value", field_type=StringType(), required=False),
    )
    # Partition by day(timestamp); the source column is field id 1.
    spec = PartitionSpec(
        PartitionField(source_id=1, field_id=1000, transform=DayTransform(), name="_day"),
    )
    table = catalog.create_table("ns.tbl", schema=schema_v1, partition_spec=spec)

    # File 1: old schema (no new_col)
    table.append(
        pa.Table.from_pylist(
            [{"timestamp": datetime(2025, 1, 1, tzinfo=UTC), "value": "old"}],
            schema=pa.schema(
                [
                    pa.field("timestamp", pa.timestamp("us", tz="UTC"), nullable=False),
                    pa.field("value", pa.string()),
                ]
            ),
        )
    )

    # Evolve schema
    with table.update_schema() as u:
        u.add_column("new_col", StringType())
    # Reload so the table object reflects the evolved schema before the next append.
    table = catalog.load_table("ns.tbl")

    # File 2: new schema (has new_col)
    table.append(
        pa.Table.from_pylist(
            [{"timestamp": datetime(2025, 1, 2, tzinfo=UTC), "value": "new", "new_col": "x"}],
            schema=pa.schema(
                [
                    pa.field("timestamp", pa.timestamp("us", tz="UTC"), nullable=False),
                    pa.field("value", pa.string()),
                    pa.field("new_col", pa.string()),
                ]
            ),
        )
    )

    # Repro on affected versions:
    # the projection omits the partition source column ("timestamp"); the report
    # states this raises "ValueError: Could not find field with id: 2".
    print(table.scan(row_filter=AlwaysTrue(), selected_fields=("new_col",)).to_arrow())
# Workaround:
# print(table.scan(row_filter=AlwaysTrue(), selected_fields=("new_col", "timestamp")).to_arrow())
Willingness to contribute
- I can contribute a fix for this bug independently
- I would be willing to contribute a fix for this bug with guidance from the Iceberg community
- I cannot contribute a fix for this bug at this time
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels