From 9fbfa52274968715c5908a2791c6664138dec709 Mon Sep 17 00:00:00 2001
From: Anastasiia Sliusar
Date: Fri, 2 Jan 2026 13:29:05 +0100
Subject: [PATCH 1/5] feat(server): add columns info

---
 arbalister/routes.py | 24 ++++++++++++++++++++++--
 src/model.ts         |  2 ++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arbalister/routes.py b/arbalister/routes.py
index 009df18..6442570 100644
--- a/arbalister/routes.py
+++ b/arbalister/routes.py
@@ -128,6 +128,12 @@ class SchemaInfo:
     data: str
     mimetype: str = "application/vnd.apache.arrow.stream"
     encoding: str = "base64"
+@dataclasses.dataclass(frozen=True, slots=True)
+class ColumnInfo:
+
+    name: str
+    dtype: str
+    nullable: bool
 
 @dataclasses.dataclass(frozen=True, slots=True)
 class StatsResponse:
@@ -136,6 +142,7 @@ class StatsResponse:
     schema: SchemaInfo
     num_rows: int = 0
     num_cols: int = 0
+    columns: list[ColumnInfo]
 
 
 class StatsRouteHandler(BaseRouteHandler):
@@ -184,6 +191,7 @@ class SqliteFileInfo:
     """Sqlite specific information about a file."""
 
     table_names: list[str]
+    columns: list[ColumnInfo] = dataclasses.field(default_factory=list)
 
 
 @dataclasses.dataclass(frozen=True, slots=True)
@@ -191,6 +199,7 @@ class CsvFileInfo:
     """Csv specific information about a file."""
 
     delimiters: list[str] = dataclasses.field(default_factory=lambda: [",", ";", "\\t", "|", "#"])
+    columns: list[ColumnInfo] = dataclasses.field(default_factory=list)
 
 
 FileInfo = SqliteFileInfo | CsvFileInfo
@@ -218,10 +227,21 @@ async def get(self, path: str) -> None:
         """HTTP GET return file-specific information."""
         file = self.data_file(path)
         file_format = ff.FileFormat.from_filename(file)
+
+        df = self.dataframe(path)
+
+        # FIXME this is not optimal for ORC/CSV where we can read_metadata, but it is not read
+        # via DataFusion.
+        schema = df.schema()
+        columns = [
+            ColumnInfo(name=field.name, dtype=str(field.type), nullable=field.nullable)
+            for field in schema
+        ]
+
         match file_format:
             case ff.FileFormat.Csv:
-                info = CsvFileInfo()
+                info = CsvFileInfo(columns=columns)
                 csv_response = CsvFileInfoResponse(
                     info=info,
                     default_options=CsvReadOptions(delimiter=info.delimiters[0]),
@@ -233,7 +253,7 @@ async def get(self, path: str) -> None:
 
                 table_names = adbc.SqliteDataFrame.get_table_names(file)
                 sqlite_response = SqliteFileInfoResponse(
-                    info=SqliteFileInfo(table_names=table_names),
+                    info=SqliteFileInfo(table_names=table_names, columns=columns),
                     default_options=SqliteReadOptions(table_name=table_names[0]),
                 )
                 await self.finish(dataclasses.asdict(sqlite_response))
diff --git a/src/model.ts b/src/model.ts
index 8f02ba2..78f66bb 100644
--- a/src/model.ts
+++ b/src/model.ts
@@ -27,6 +27,8 @@ export class ArrowModel extends DataModel {
     const { info: fileInfo, default_options: fileOptions } = await fetchFileInfo({
       path: loadingOptions.path,
     });
+
+    console.log('fileInfo',fileInfo);
     return new ArrowModel(loadingOptions, fileOptions, fileInfo);
   }
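Note (patch 1): the file-info route derives per-column name/dtype/nullable from the DataFusion schema. A minimal standalone sketch of the same derivation, assuming the datafusion Python package and an illustrative "example.csv" path (the handler obtains its dataframe through self.dataframe(path) instead):

    from datafusion import SessionContext

    ctx = SessionContext()
    df = ctx.read_csv("example.csv")
    # df.schema() returns a pyarrow.Schema; iterating it yields pyarrow.Field
    # objects carrying exactly the three attributes ColumnInfo stores.
    for field in df.schema():
        print(field.name, str(field.type), field.nullable)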
From 780f331ad216cb97dc0d3cdb630e25ee31ba4780 Mon Sep 17 00:00:00 2001
From: Anastasiia Sliusar
Date: Fri, 2 Jan 2026 14:28:53 +0100
Subject: [PATCH 2/5] fix types

---
 arbalister/routes.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arbalister/routes.py b/arbalister/routes.py
index 6442570..2e03b01 100644
--- a/arbalister/routes.py
+++ b/arbalister/routes.py
@@ -142,7 +142,7 @@ class StatsResponse:
     schema: SchemaInfo
     num_rows: int = 0
     num_cols: int = 0
-    columns: list[ColumnInfo]
+    columns: list[ColumnInfo] = dataclasses.field(default_factory=list)
 
 
 class StatsRouteHandler(BaseRouteHandler):
@@ -228,10 +228,7 @@ async def get(self, path: str) -> None:
         file = self.data_file(path)
         file_format = ff.FileFormat.from_filename(file)
 
-        df = self.dataframe(path)
-
-        # FIXME this is not optimal for ORC/CSV where we can read_metadata, but it is not read
-        # via DataFusion.
+        df = self.dataframe(path)
         schema = df.schema()
         columns = [
             ColumnInfo(name=field.name, dtype=str(field.type), nullable=field.nullable)
From 9bc537fe6b77b92d5d8c98988356947a4dc44d2d Mon Sep 17 00:00:00 2001
From: Anastasiia Sliusar
Date: Fri, 2 Jan 2026 14:52:48 +0100
Subject: [PATCH 3/5] fix formatting

---
 arbalister/routes.py | 5 +++--
 src/model.ts         | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arbalister/routes.py b/arbalister/routes.py
index 2e03b01..9f98957 100644
--- a/arbalister/routes.py
+++ b/arbalister/routes.py
@@ -130,6 +130,7 @@ class SchemaInfo:
 @dataclasses.dataclass(frozen=True, slots=True)
 class ColumnInfo:
+    """Column information with data types."""
 
     name: str
     dtype: str
     nullable: bool
@@ -227,14 +228,14 @@ async def get(self, path: str) -> None:
         """HTTP GET return file-specific information."""
         file = self.data_file(path)
         file_format = ff.FileFormat.from_filename(file)
-        
+
         df = self.dataframe(path)
         schema = df.schema()
         columns = [
             ColumnInfo(name=field.name, dtype=str(field.type), nullable=field.nullable)
             for field in schema
         ]
-        
+
         match file_format:
diff --git a/src/model.ts b/src/model.ts
index 78f66bb..d269003 100644
--- a/src/model.ts
+++ b/src/model.ts
@@ -28,7 +28,7 @@ export class ArrowModel extends DataModel {
       path: loadingOptions.path,
     });
 
-    console.log('fileInfo',fileInfo);
+    console.log("fileInfo", fileInfo);
     return new ArrowModel(loadingOptions, fileOptions, fileInfo);
   }
From ee63f3b24bed1548198d7f52825e159bf88af16b Mon Sep 17 00:00:00 2001
From: Anastasiia Sliusar
Date: Fri, 2 Jan 2026 17:49:52 +0100
Subject: [PATCH 4/5] get parquet stats

---
 arbalister/arrow.py  | 29 ++++++++++++++++++++++++++++
 arbalister/routes.py | 45 ++++++++++++++++++++++++++++--------------
 src/model.ts         |  4 +++-
 src/requests.ts      |  2 +-
 4 files changed, 64 insertions(+), 16 deletions(-)

diff --git a/arbalister/arrow.py b/arbalister/arrow.py
index b4b7c3e..9c86d4f 100644
--- a/arbalister/arrow.py
+++ b/arbalister/arrow.py
@@ -153,3 +153,32 @@ def write_csv(
             out = adbc.write_sqlite
 
     return out
+
+def get_parquet_column_stats(path: str):
+    import pyarrow.parquet as pq
+    file = pq.ParquetFile(path)
+    metadata = file.metadata
+    schema = file.schema
+    columns_stats = []
+    for i, field in enumerate(schema):
+        min_val = None
+        max_val = None
+        null_count = 0
+
+        for row_group_index in range(metadata.num_row_groups):
+            row = metadata.row_group(row_group_index)
+            col_chunk = row.column(i)
+            stats = col_chunk.statistics
+            if stats:
+                if min_val is None or stats.min < min_val:
+                    min_val = stats.min
+                if max_val is None or stats.max > max_val:
+                    max_val = stats.max
+                null_count += stats.null_count if stats.null_count is not None else 0
+        columns_stats.append({
+            "name": field.name,
+            "min": min_val,
+            "max": max_val,
+            "null_count": null_count
+        })
+    return columns_stats
diff --git a/arbalister/routes.py b/arbalister/routes.py
index 9f98957..05f0510 100644
--- a/arbalister/routes.py
+++ b/arbalister/routes.py
@@ -129,12 +129,13 @@ class SchemaInfo:
     mimetype: str = "application/vnd.apache.arrow.stream"
     encoding: str = "base64"
 @dataclasses.dataclass(frozen=True, slots=True)
-class ColumnInfo:
-    """Column information with data types."""
+class ColumnStats:
+    """Column stats for a parquet file"""
 
     name: str
-    dtype: str
-    nullable: bool
+    min: object | None = None
+    max: object | None = None
+    null_count: int | None = None
 
 @dataclasses.dataclass(frozen=True, slots=True)
 class StatsResponse:
@@ -143,7 +144,7 @@ class StatsResponse:
     schema: SchemaInfo
     num_rows: int = 0
     num_cols: int = 0
-    columns: list[ColumnInfo] = dataclasses.field(default_factory=list)
+    columns_stats: list[ColumnStats] | None = None
 
 
 class StatsRouteHandler(BaseRouteHandler):
@@ -152,6 +153,17 @@ class StatsRouteHandler:
     @tornado.web.authenticated
     async def get(self, path: str) -> None:
         """HTTP GET return statistics."""
+
+        file = self.data_file(path)
+        file_format = ff.FileFormat.from_filename(file)
+
+        columns_stats = None
+        if file_format == ff.FileFormat.Parquet:
+            parquet_columns_stats = abw.get_parquet_column_stats(file)
+            columns_stats = [
+                ColumnStats(**stats)
+                for stats in parquet_columns_stats
+            ]
         df = self.dataframe(path)
         # FIXME this is not optimal for ORC/CSV where we can read_metadata, but it is not read
         # via DataFusion.
@@ -183,6 +195,7 @@ async def get(self, path: str) -> None:
         response = StatsResponse(
             num_cols=len(schema),
             num_rows=num_rows,
             schema=SchemaInfo(data=schema_64),
+            columns_stats=columns_stats,
         )
         await self.finish(dataclasses.asdict(response))
@@ -192,7 +205,7 @@ class SqliteFileInfo:
     """Sqlite specific information about a file."""
 
     table_names: list[str]
-    columns: list[ColumnInfo] = dataclasses.field(default_factory=list)
+    size_bytes: int | None = None
 
 
 @dataclasses.dataclass(frozen=True, slots=True)
@@ -200,7 +213,7 @@ class CsvFileInfo:
     """Csv specific information about a file."""
 
     delimiters: list[str] = dataclasses.field(default_factory=lambda: [",", ";", "\\t", "|", "#"])
-    columns: list[ColumnInfo] = dataclasses.field(default_factory=list)
+    size_bytes: int | None = None
 
 
 FileInfo = SqliteFileInfo | CsvFileInfo
@@ -217,7 +230,7 @@ class FileInfoResponse[I, P]:
 
 CsvFileInfoResponse = FileInfoResponse[CsvFileInfo, CsvReadOptions]
 SqliteFileInfoResponse = FileInfoResponse[SqliteFileInfo, SqliteReadOptions]
-NoFileInfoResponse = FileInfoResponse[Empty, Empty]
+NoFileInfoResponse = FileInfoResponse[Empty, Empty, ]
 
 
 class FileInfoRouteHandler(BaseRouteHandler):
@@ -231,18 +244,21 @@ async def get(self, path: str) -> None:
         file_format = ff.FileFormat.from_filename(file)
 
         df = self.dataframe(path)
         schema = df.schema()
-        columns = [
-            ColumnInfo(name=field.name, dtype=str(field.type), nullable=field.nullable)
-            for field in schema
-        ]
+
+        try:
+            size_bytes = os.path.getsize(file)
+        except Exception:
+            size_bytes = None
+
 
         match file_format:
             case ff.FileFormat.Csv:
-                info = CsvFileInfo(columns=columns)
+                info = CsvFileInfo(size_bytes=size_bytes)
                 csv_response = CsvFileInfoResponse(
                     info=info,
                     default_options=CsvReadOptions(delimiter=info.delimiters[0]),
+                    size_bytes=size_bytes,
                 )
                 await self.finish(dataclasses.asdict(csv_response))
             case ff.FileFormat.Sqlite:
@@ -251,8 +267,9 @@ async def get(self, path: str) -> None:
 
                 table_names = adbc.SqliteDataFrame.get_table_names(file)
                 sqlite_response = SqliteFileInfoResponse(
-                    info=SqliteFileInfo(table_names=table_names, columns=columns),
+                    info=SqliteFileInfo(table_names=table_names, size_bytes=size_bytes),
                     default_options=SqliteReadOptions(table_name=table_names[0]),
+                    size_bytes=size_bytes,
                 )
                 await self.finish(dataclasses.asdict(sqlite_response))
diff --git a/src/model.ts b/src/model.ts
index d269003..bb64fc4 100644
--- a/src/model.ts
+++ b/src/model.ts
@@ -118,7 +118,9 @@ export class ArrowModel extends DataModel {
         // This is to showcase that we can put additional information in the column header but it
         // does not look good. HuggingFace dataset has some good inspiration.
         const field = this.schema.fields[column];
-        return `${field.name} (${field.type}${field.nullable ? " | null" : ""})`;
+        return `${field.name}
+          (${field.type}${field.nullable ? " | null" : ""})
+          Rows: ${this._numRows}`;
       }
       case "row-header":
         return row.toString();
diff --git a/src/requests.ts b/src/requests.ts
index d8e1061..88b4b90 100644
--- a/src/requests.ts
+++ b/src/requests.ts
@@ -90,7 +90,7 @@ export async function fetchStats(
     throw new Error(`Error communicating with the Arbalister server: ${response.status}`);
   }
   const data: StatsResponseRaw = await response.json();
-
+console.log('data', data);
   // Validate encoding and content type
   if (data.schema.encoding !== "base64") {
     throw new Error(`Unexpected schema encoding: ${data.schema.encoding}, expected "base64"`);
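Note (patch 4): parquet stores statistics per row group, so get_parquet_column_stats folds each row group's min/max/null_count into a single entry per column. A quick sanity check, assuming pyarrow is installed and the package is importable as arbalister.arrow; the file name and values are illustrative:

    import pyarrow as pa
    import pyarrow.parquet as pq

    from arbalister.arrow import get_parquet_column_stats

    # Two row groups force the per-row-group stats to be aggregated.
    table = pa.table({"x": [3, 1, None, 7]})
    pq.write_table(table, "demo.parquet", row_group_size=2)

    print(get_parquet_column_stats("demo.parquet"))
    # [{'name': 'x', 'min': 1, 'max': 7, 'null_count': 1}]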
From 9ef9c3d73b7dc4306610444f0a3db3143e0e27e1 Mon Sep 17 00:00:00 2001
From: Anastasiia Sliusar
Date: Fri, 2 Jan 2026 18:16:59 +0100
Subject: [PATCH 5/5] fix issues

---
 arbalister/arrow.py  | 10 ++++++++--
 arbalister/routes.py | 20 ++++++++-------------
 src/model.ts         |  8 ++++++--
 src/requests.ts      |  2 +-
 4 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/arbalister/arrow.py b/arbalister/arrow.py
index 9c86d4f..377308f 100644
--- a/arbalister/arrow.py
+++ b/arbalister/arrow.py
@@ -1,5 +1,6 @@
 import codecs
 import pathlib
+from pathlib import Path
 from typing import Any, Callable
 
 import datafusion as dn
@@ -154,8 +155,13 @@ def write_csv(
             out = adbc.write_sqlite
 
     return out
 
-def get_parquet_column_stats(path: str):
+def get_parquet_column_stats(path: str | Path) -> list[dict[str, Any]]:
+    """Get parquet column stats."""
     import pyarrow.parquet as pq
+
+    if isinstance(path, Path):
+        path = str(path)
+
     file = pq.ParquetFile(path)
     metadata = file.metadata
     schema = file.schema
@@ -173,7 +179,7 @@ def get_parquet_column_stats(path: str):
             if min_val is None or stats.min < min_val:
                 min_val = stats.min
             if max_val is None or stats.max > max_val:
-                max_val = stats.max 
+                max_val = stats.max
             null_count += stats.null_count if stats.null_count is not None else 0
         columns_stats.append({
diff --git a/arbalister/routes.py b/arbalister/routes.py
index 05f0510..86a80b1 100644
--- a/arbalister/routes.py
+++ b/arbalister/routes.py
@@ -128,15 +128,17 @@ class SchemaInfo:
     mimetype: str = "application/vnd.apache.arrow.stream"
     encoding: str = "base64"
+
 @dataclasses.dataclass(frozen=True, slots=True)
 class ColumnStats:
-    """Column stats for a parquet file"""
+    """Column stats for a parquet file."""
 
     name: str
     min: object | None = None
     max: object | None = None
     null_count: int | None = None
+
 @dataclasses.dataclass(frozen=True, slots=True)
 class StatsResponse:
     """File statistics returned in the stats route."""
@@ -153,17 +155,13 @@ class StatsRouteHandler:
     @tornado.web.authenticated
     async def get(self, path: str) -> None:
         """HTTP GET return statistics."""
-
         file = self.data_file(path)
         file_format = ff.FileFormat.from_filename(file)
 
         columns_stats = None
         if file_format == ff.FileFormat.Parquet:
             parquet_columns_stats = abw.get_parquet_column_stats(file)
-            columns_stats = [
-                ColumnStats(**stats)
-                for stats in parquet_columns_stats
-            ]
+            columns_stats = [ColumnStats(**stats) for stats in parquet_columns_stats]
         df = self.dataframe(path)
         # FIXME this is not optimal for ORC/CSV where we can read_metadata, but it is not read
         # via DataFusion.
@@ -225,12 +223,13 @@ class FileInfoResponse[I, P]:
 
     info: I
     default_options: P
+    size_bytes: int | None = None
 
 
 CsvFileInfoResponse = FileInfoResponse[CsvFileInfo, CsvReadOptions]
 SqliteFileInfoResponse = FileInfoResponse[SqliteFileInfo, SqliteReadOptions]
-NoFileInfoResponse = FileInfoResponse[Empty, Empty, ]
+NoFileInfoResponse = FileInfoResponse[Empty, Empty]
 
 
 class FileInfoRouteHandler(BaseRouteHandler):
@@ -243,14 +242,9 @@ async def get(self, path: str) -> None:
         file_format = ff.FileFormat.from_filename(file)
 
-        df = self.dataframe(path)
-        schema = df.schema()
-
         try:
             size_bytes = os.path.getsize(file)
         except Exception:
             size_bytes = None
-
-
         match file_format:
             case ff.FileFormat.Csv:
                 info = CsvFileInfo(size_bytes=size_bytes)
@@ -273,7 +267,7 @@ async def get(self, path: str) -> None:
                     default_options=SqliteReadOptions(table_name=table_names[0]),
                     size_bytes=size_bytes,
                 )
                 await self.finish(dataclasses.asdict(sqlite_response))
             case _:
-                no_response = NoFileInfoResponse(info=Empty(), default_options=Empty())
+                no_response = NoFileInfoResponse(info=Empty(), default_options=Empty(), size_bytes=size_bytes)
                 await self.finish(dataclasses.asdict(no_response))
diff --git a/src/model.ts b/src/model.ts
index bb64fc4..3a57753 100644
--- a/src/model.ts
+++ b/src/model.ts
@@ -24,11 +24,15 @@ export namespace ArrowModel {
 
 export class ArrowModel extends DataModel {
   static async fromRemoteFileInfo(loadingOptions: ArrowModel.LoadingOptions) {
-    const { info: fileInfo, default_options: fileOptions } = await fetchFileInfo({
+    const {
+      info: fileInfo,
+      default_options: fileOptions,
+      size_bytes,
+    } = await fetchFileInfo({
       path: loadingOptions.path,
     });
 
-    console.log("fileInfo", fileInfo);
+    console.log("size_bytes", size_bytes);
     return new ArrowModel(loadingOptions, fileOptions, fileInfo);
   }
diff --git a/src/requests.ts b/src/requests.ts
index 88b4b90..c3d76a0 100644
--- a/src/requests.ts
+++ b/src/requests.ts
@@ -22,6 +22,7 @@ export interface FileInfoResponseFor {
 export interface FileInfoResponse {
   info: FileInfo;
   default_options: FileReadOptions;
+  size_bytes?: number | null;
 }
 
 export async function fetchFileInfo(params: Readonly): Promise {
@@ -90,7 +91,6 @@ export async function fetchStats(
     throw new Error(`Error communicating with the Arbalister server: ${response.status}`);
   }
   const data: StatsResponseRaw = await response.json();
-console.log('data', data);
   // Validate encoding and content type
   if (data.schema.encoding !== "base64") {
     throw new Error(`Unexpected schema encoding: ${data.schema.encoding}, expected "base64"`);
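Note (patch 5): StatsRouteHandler can hand the whole response to finish() because dataclasses.asdict recurses into nested dataclasses, including the columns_stats list. A minimal sketch of that behaviour, using a stand-in for StatsResponse (values illustrative):

    import dataclasses

    @dataclasses.dataclass(frozen=True, slots=True)
    class ColumnStats:
        name: str
        min: object | None = None
        max: object | None = None
        null_count: int | None = None

    @dataclasses.dataclass(frozen=True, slots=True)
    class Response:  # stand-in for StatsResponse
        columns_stats: list[ColumnStats] | None = None

    resp = Response(columns_stats=[ColumnStats(name="x", min=1, max=7, null_count=1)])
    print(dataclasses.asdict(resp))
    # {'columns_stats': [{'name': 'x', 'min': 1, 'max': 7, 'null_count': 1}]}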