Skip to content

Commit d3db840

Browse files
authored
Add as_arrow() to Schema class (#532)
* Add as_arrow() to Schema class * fixup! Add as_arrow() to Schema class
1 parent 9495bff commit d3db840

File tree

3 files changed

+41
-0
lines changed

3 files changed

+41
-0
lines changed

mkdocs/docs/api.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,23 @@ long: [[4.896029,-122.431297,6.0989,2.349014],[6.56667]]
295295

296296
The nested lists indicate the different Arrow buffers: the first write results in one buffer, and the second append results in a separate buffer. This is expected, since the data is read from two Parquet files.
297297

298+
To avoid any type errors during writing, you can enforce the PyArrow table types using the Iceberg table schema:
299+
300+
```python
301+
from pyiceberg.catalog import load_catalog
302+
import pyarrow as pa
303+
304+
catalog = load_catalog("default")
305+
table = catalog.load_table("default.cities")
306+
schema = table.schema().as_arrow()
307+
308+
df = pa.Table.from_pylist(
309+
[{"city": "Groningen", "lat": 53.21917, "long": 6.56667}], schema=schema
310+
)
311+
312+
table.append(df)
313+
```
314+
298315
<!-- prettier-ignore-start -->
299316

300317
!!! example "Under development"

pyiceberg/schema.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@
6464
)
6565

6666
if TYPE_CHECKING:
67+
import pyarrow as pa
68+
6769
from pyiceberg.table.name_mapping import (
6870
NameMapping,
6971
)
@@ -180,6 +182,12 @@ def as_struct(self) -> StructType:
180182
"""Return the schema as a struct."""
181183
return StructType(*self.fields)
182184

185+
def as_arrow(self) -> "pa.Schema":
    """Convert this schema to an Arrow schema.

    Returns:
        The value produced by ``schema_to_pyarrow`` for this schema.
    """
    # Imported at call time instead of module level. This module only
    # imports pyarrow under TYPE_CHECKING, presumably because pyarrow is
    # an optional dependency -- confirm.
    from pyiceberg.io.pyarrow import schema_to_pyarrow

    arrow_schema = schema_to_pyarrow(self)
    return arrow_schema
190+
183191
def find_field(self, name_or_id: Union[str, int], case_sensitive: bool = True) -> NestedField:
184192
"""Find a field using a field name or field ID.
185193

tests/test_schema.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1600,3 +1600,19 @@ def test_union_with_pa_schema(primitive_fields: NestedField) -> None:
16001600
)
16011601

16021602
assert new_schema == expected_schema
1603+
1604+
1605+
def test_arrow_schema() -> None:
    """Check that ``Schema.as_arrow()`` equals a hand-built Arrow schema.

    Uses one required string field and two optional fields (int, bool).
    """
    iceberg_schema = Schema(
        NestedField(field_id=1, name="foo", field_type=StringType(), required=True),
        NestedField(field_id=2, name="bar", field_type=IntegerType(), required=False),
        NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False),
    )

    # Each Arrow field's `nullable` flag is the inverse of the Iceberg
    # field's `required` flag above.
    arrow_fields = [
        pa.field("foo", pa.string(), nullable=False),
        pa.field("bar", pa.int32(), nullable=True),
        pa.field("baz", pa.bool_(), nullable=True),
    ]
    expected = pa.schema(arrow_fields)

    assert iceberg_schema.as_arrow() == expected

0 commit comments

Comments
 (0)