#44 Make Split and TableRead Serializable

chenghuichen · chenghuichen · commit f21f4294ac74 · 2025-02-23T22:39:02.000+08:00
diff --git a/dev/test_deps/paimon-python-java-bridge-0.9-SNAPSHOT.jar b/dev/test_deps/paimon-python-java-bridge-0.9-SNAPSHOT.jar
diff --git a/paimon-python-java-bridge/src/main/java/org/apache/paimon/python/SerializationUtil.java b/paimon-python-java-bridge/src/main/java/org/apache/paimon/python/SerializationUtil.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.python;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+
+/** Helpers for turning objects into byte arrays and back using standard Java serialization. */
+public class SerializationUtil {
+
+    /**
+     * Serializes {@code obj} with an {@link ObjectOutputStream}.
+     *
+     * @param obj the object to serialize
+     * @return the serialized form of {@code obj}
+     * @throws IOException if the object cannot be written (for example, it is not serializable)
+     */
+    public static byte[] serialize(Object obj) throws IOException {
+        ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
+        try (ObjectOutputStream objStream = new ObjectOutputStream(byteStream)) {
+            objStream.writeObject(obj);
+        }
+        // Take the snapshot only after the object stream is closed, so that anything it still
+        // buffered has been flushed into byteStream. Closing a ByteArrayOutputStream is a no-op,
+        // so it does not need to be a managed resource.
+        return byteStream.toByteArray();
+    }
+
+    /**
+     * Reads back a single object from bytes produced by Java serialization.
+     *
+     * <p>NOTE(review): Java deserialization can instantiate arbitrary classes found on the
+     * classpath; callers should only pass bytes that come from a trusted source.
+     *
+     * @param bytes the serialized form, as produced by {@link #serialize(Object)}
+     * @return the deserialized object
+     * @throws IOException if the bytes cannot be read as a serialization stream
+     * @throws ClassNotFoundException if the class of the serialized object cannot be found
+     */
+    public static Object deserialize(byte[] bytes) throws IOException, ClassNotFoundException {
+        try (ByteArrayInputStream byteStream = new ByteArrayInputStream(bytes);
+                ObjectInputStream objStream = new ObjectInputStream(byteStream)) {
+            return objStream.readObject();
+        }
+    }
+}
diff --git a/pypaimon/api/table_read.py b/pypaimon/api/table_read.py
@@ -31,14 +31,14 @@
 class TableRead(ABC):
     """To read data from data splits."""
 
-    @abstractmethod
-    def to_arrow(self, splits: List[Split]) -> pa.Table:
-        """Read data from splits and converted to pyarrow.Table format."""
-
     @abstractmethod
     def to_arrow_batch_reader(self, splits: List[Split]) -> pa.RecordBatchReader:
         """Read data from splits and converted to pyarrow.RecordBatchReader format."""
 
+    @abstractmethod
+    def to_arrow(self, splits: List[Split]) -> pa.Table:
+        """Read data from splits and converted to pyarrow.Table format."""
+
     @abstractmethod
     def to_pandas(self, splits: List[Split]) -> pd.DataFrame:
         """Read data from splits and converted to pandas.DataFrame format."""
diff --git a/pypaimon/py4j/java_implementation.py b/pypaimon/py4j/java_implementation.py
@@ -23,6 +23,7 @@
 
 from pypaimon.py4j.java_gateway import get_gateway
 from pypaimon.py4j.util import java_utils, constants
+from pypaimon.py4j.util.java_utils import serialize_java_object, deserialize_java_object
 from pypaimon.api import \
     (catalog, table, read_builder, table_scan, split, row_type,
      table_read, write_builder, table_write, commit_message,
@@ -109,8 +110,9 @@ def new_scan(self) -> 'TableScan':
         return TableScan(j_table_scan)
 
     def new_read(self) -> 'TableRead':
-        j_table_read = self._j_read_builder.newRead().executeFilter()
-        return TableRead(j_table_read, self._j_read_builder.readType(), self._catalog_options)
+        # Hand TableRead the Java-serialized bytes of the Java read and of its
+        # read type rather than the live Java objects.
+        j_table_read_bytes = serialize_java_object(self._j_read_builder.newRead().executeFilter())
+        j_read_type_bytes = serialize_java_object(self._j_read_builder.readType())
+        return TableRead(j_table_read_bytes, j_read_type_bytes, self._catalog_options)
 
     def new_predicate_builder(self) -> 'PredicateBuilder':
         return PredicateBuilder(self._j_row_type)
@@ -145,55 +147,66 @@ def __init__(self, j_splits):
         self._j_splits = j_splits
 
     def splits(self) -> List['Split']:
-        return list(map(lambda s: Split(s), self._j_splits))
+        # Build one Python Split per Java split held by this plan.
+        return list(map(lambda s: self._build_single_split(s), self._j_splits))
+
+    def _build_single_split(self, j_split) -> 'Split':
+        """Convert one Java split into a Python ``Split``.
+
+        The Java object is captured as serialized bytes, and its row count,
+        total raw-file length and raw-file paths are read eagerly so that the
+        resulting ``Split`` carries them as plain Python values.
+        """
+        serialized = serialize_java_object(j_split)
+        num_rows = j_split.rowCount()
+        raw_files = j_split.convertToRawFiles()
+        if raw_files.isPresent():
+            entries = raw_files.get()
+            total_size = sum(entry.length() for entry in entries)
+            paths = [entry.path() for entry in entries]
+        else:
+            # No raw-file view is available for this split: report no files.
+            total_size = 0
+            paths = []
+        return Split(serialized, num_rows, total_size, paths)
 
 
 class Split(split.Split):
+    """Split backed by the serialized bytes of a Java split plus precomputed metadata."""
 
-    def __init__(self, j_split):
-        self._j_split = j_split
+    def __init__(self, j_split_bytes, row_count: int, file_size: int, file_paths: List[str]):
+        # Java-serialized form of the underlying Java split; see to_j_split().
+        self._j_split_bytes = j_split_bytes
+        # Metadata supplied by the caller so the accessors below need no Java call.
+        self._row_count = row_count
+        self._file_size = file_size
+        self._file_paths = file_paths
 
     def to_j_split(self):
-        return self._j_split
+        # Deserializes the stored bytes on every call; the result is not cached.
+        return deserialize_java_object(self._j_split_bytes)
 
     def row_count(self) -> int:
-        return self._j_split.rowCount()
+        # Value supplied at construction time.
+        return self._row_count
 
     def file_size(self) -> int:
-        files_optional = self._j_split.convertToRawFiles()
-        if not files_optional.isPresent():
-            return 0
-        files = files_optional.get()
-        return sum(file.length() for file in files)
+        # Value supplied at construction time.
+        return self._file_size
 
     def file_paths(self) -> List[str]:
-        files_optional = self._j_split.convertToRawFiles()
-        if not files_optional.isPresent():
-            return []
-        files = files_optional.get()
-        return [file.path() for file in files]
+        # Value supplied at construction time.
+        return self._file_paths
 
 
 class TableRead(table_read.TableRead):
 
-    def __init__(self, j_table_read, j_read_type, catalog_options):
-        self._j_table_read = j_table_read
-        self._j_read_type = j_read_type
+    def __init__(self, j_table_read_bytes, j_read_type_bytes, catalog_options):
+        # Java-serialized forms of the Java table read and of its read type.
+        self._j_table_read_bytes = j_table_read_bytes
+        self._j_read_type_bytes = j_read_type_bytes
         self._catalog_options = catalog_options
-        self._j_bytes_reader = None
-        self._arrow_schema = java_utils.to_arrow_schema(j_read_type)
 
-    def to_arrow(self, splits):
-        record_batch_reader = self.to_arrow_batch_reader(splits)
-        return pa.Table.from_batches(record_batch_reader, schema=self._arrow_schema)
+        # Java-backed state starts empty and is filled in by _init() on first use.
+        self._j_table_read = None
+        self._j_read_type = None
+        self._arrow_schema = None
+        self._j_bytes_reader = None
 
-    def to_arrow_batch_reader(self, splits):
+    def to_arrow_batch_reader(self, splits) -> pa.RecordBatchReader:
+        """Return a pyarrow.RecordBatchReader over the data of ``splits``.
+
+        Each split is converted to its Java object with ``to_j_split()`` and
+        handed to the Java bytes reader before batches are generated.
+        """
         self._init()
         j_splits = list(map(lambda s: s.to_j_split(), splits))
         self._j_bytes_reader.setSplits(j_splits)
         batch_iterator = self._batch_generator()
         return pa.RecordBatchReader.from_batches(self._arrow_schema, batch_iterator)
 
+    def to_arrow(self, splits) -> pa.Table:
+        """Read ``splits`` and collect all record batches into one pyarrow.Table."""
+        # The reader must be created first: creating it is what populates
+        # self._arrow_schema.
+        batches = self.to_arrow_batch_reader(splits)
+        return pa.Table.from_batches(batches, schema=self._arrow_schema)
+
     def to_pandas(self, splits: List[Split]) -> pd.DataFrame:
         return self.to_arrow(splits).to_pandas()
 
@@ -214,6 +227,12 @@ def to_ray(self, splits: List[Split]) -> "ray.data.dataset.Dataset":
         return ray.data.from_arrow(self.to_arrow(splits))
 
     def _init(self):
+        if self._j_table_read is None:
+            self._j_table_read = deserialize_java_object(self._j_table_read_bytes)
+        if self._j_read_type is None:
+            self._j_read_type = deserialize_java_object(self._j_read_type_bytes)
+        if self._arrow_schema is None:
+            self._arrow_schema = java_utils.to_arrow_schema(self._j_read_type)
         if self._j_bytes_reader is None:
             # get thread num
             max_workers = self._catalog_options.get(constants.MAX_WORKERS)
diff --git a/pypaimon/py4j/util/java_utils.py b/pypaimon/py4j/util/java_utils.py
@@ -100,3 +100,28 @@ def to_arrow_schema(j_row_type):
     arrow_schema = schema_reader.schema
     schema_reader.close()
     return arrow_schema
+
+
+def serialize_java_object(java_obj) -> bytes:
+    """Serialize a Java object into Python ``bytes`` through the JVM.
+
+    Delegates to ``org.apache.paimon.python.SerializationUtil.serialize`` via
+    the Py4J gateway.
+
+    :param java_obj: Py4J reference to the Java object to serialize.
+    :return: the serialized form as ``bytes``.
+    :raises RuntimeError: if the call fails; the original error is chained
+        as ``__cause__``.
+    """
+    gateway = get_gateway()
+    util = gateway.jvm.org.apache.paimon.python.SerializationUtil
+    try:
+        return bytes(util.serialize(java_obj))
+    except Exception as e:
+        # Chain explicitly so the Java-side failure stays attached as the cause.
+        raise RuntimeError(f"Java serialization failed: {e}") from e
+
+
+def deserialize_java_object(bytes_data):
+    """Rebuild a Java object from bytes produced by ``serialize_java_object``.
+
+    Delegates to ``org.apache.paimon.python.SerializationUtil.deserialize`` via
+    the Py4J gateway.
+
+    :param bytes_data: the serialized form, normally ``bytes``; other
+        sequences of integer byte values are converted first.
+    :return: Py4J reference to the deserialized Java object.
+    :raises RuntimeError: if conversion or the call fails; the original error
+        is chained as ``__cause__``.
+    """
+    gateway = get_gateway()
+    util = gateway.jvm.org.apache.paimon.python.SerializationUtil
+    try:
+        if not isinstance(bytes_data, (bytes, bytearray)):
+            # Keep tolerating sequences that hold signed byte values
+            # (-128..127) by mapping each one to its unsigned equivalent.
+            bytes_data = bytes(b & 0xFF for b in bytes_data)
+        # Py4J passes bytes/bytearray arguments to Java as a byte[] by value,
+        # so the whole payload crosses the gateway in a single call instead of
+        # one ByteBuffer.put() round trip per byte.
+        return util.deserialize(bytes_data)
+    except Exception as e:
+        # Chain explicitly so the Java-side failure stays attached as the cause.
+        raise RuntimeError(f"Java deserialization failed: {e}") from e