
Commit cb9a2c5

postgresql loader: Add reorg-aware streaming support
1 parent cc5730f commit cb9a2c5

File tree

3 files changed: +315 −8 lines

pyproject.toml

Lines changed: 0 additions & 3 deletions

@@ -98,9 +98,6 @@ addopts = [
     "--tb=short",
     "--strict-markers",
 ]
-# Timeout configuration for longer-running integration tests
-timeout = 300 # 5 minutes per test
-timeout_method = "thread"
 
 markers = [
     "unit: Unit tests (fast, no external dependencies)",

src/amp/loaders/implementations/postgresql_loader.py

Lines changed: 91 additions & 5 deletions
@@ -1,9 +1,10 @@
 from dataclasses import dataclass
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Union
 
 import pyarrow as pa
 from psycopg2.pool import ThreadedConnectionPool
 
+from ...streaming.types import BlockRange
 from ..base import DataLoader, LoadMode
 from ._postgres_helpers import has_binary_columns, prepare_csv_data, prepare_insert_data

@@ -120,7 +121,8 @@ def _clear_table(self, table_name: str) -> None:
 
     def _copy_arrow_data(self, cursor: Any, data: Union[pa.RecordBatch, pa.Table], table_name: str) -> None:
         """Copy Arrow data to PostgreSQL using optimal method based on data types."""
-        if has_binary_columns(data.schema):
+        # Use INSERT for data with binary columns OR metadata columns (JSONB/range types need special handling)
+        if has_binary_columns(data.schema) or '_meta_block_ranges' in data.schema.names:
            self._insert_arrow_data(cursor, data, table_name)
         else:
            self._csv_copy_arrow_data(cursor, data, table_name)
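The split matters because COPY with CSV has no clean way to hand PostgreSQL a JSONB value, while a parameterized INSERT can adapt Python objects directly. A minimal sketch of that idea, assuming psycopg2's stock Json adapter; the actual _insert_arrow_data implementation is not shown in this diff, and the helper below is hypothetical:

from psycopg2.extras import Json, execute_values

def insert_rows_with_jsonb(cursor, table_name, columns, rows):
    """Hypothetical helper: parameterized INSERT that adapts dict/list values to JSONB."""
    adapted = [
        tuple(Json(v) if isinstance(v, (dict, list)) else v for v in row)
        for row in rows
    ]
    quoted_cols = ', '.join(f'"{c}"' for c in columns)
    # execute_values expands the single %s into an efficient multi-row VALUES clause
    execute_values(cursor, f'INSERT INTO {table_name} ({quoted_cols}) VALUES %s', adapted)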
@@ -160,7 +162,7 @@ def _create_table_from_schema(self, schema: pa.Schema, table_name: str) -> None:
         # Check if table already exists to avoid unnecessary work
         cursor.execute(
             """
-            SELECT 1 FROM information_schema.tables 
+            SELECT 1 FROM information_schema.tables
             WHERE table_name = %s AND table_schema = 'public'
             """,
             (table_name,),
@@ -205,9 +207,18 @@ def _create_table_from_schema(self, schema: pa.Schema, table_name: str) -> None:
 
         # Build CREATE TABLE statement
         columns = []
+        # Check if this is streaming data with metadata columns
+        has_metadata = any(field.name.startswith('_meta_') for field in schema)
+
         for field in schema:
+            # Skip generic metadata columns - we'll use _meta_block_ranges instead
+            if field.name in ('_meta_range_start', '_meta_range_end'):
+                continue
+            # Special handling for the JSONB metadata column
+            elif field.name == '_meta_block_ranges':
+                pg_type = 'JSONB'
             # Handle complex types
-            if pa.types.is_timestamp(field.type):
+            elif pa.types.is_timestamp(field.type):
                 # Handle timezone-aware timestamps
                 if field.type.tz is not None:
                     pg_type = 'TIMESTAMPTZ'
@@ -246,6 +257,14 @@ def _create_table_from_schema(self, schema: pa.Schema, table_name: str) -> None:
             # Quote column name for safety (important for blockchain field names)
             columns.append(f'"{field.name}" {pg_type}{nullable}')
 
+        # Add metadata columns for streaming/reorg support if this is streaming data,
+        # but only if they don't already exist in the schema
+        if has_metadata:
+            schema_field_names = [field.name for field in schema]
+            if '_meta_block_ranges' not in schema_field_names:
+                # Use JSONB for multi-network block ranges with GIN index support
+                columns.append('"_meta_block_ranges" JSONB')
+
         # Create the table - Fixed: use proper identifier quoting
         create_sql = f"""
             CREATE TABLE IF NOT EXISTS {table_name} (
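Concretely, for a streaming batch like the ones in the tests below, the assembled DDL would come out roughly as follows. The table and column names are hypothetical, and the GIN index is not created by this diff; it is the index type the comment above alludes to for querying the JSONB ranges:

# Roughly what _create_table_from_schema assembles for a streaming batch (assumed type mapping)
create_sql = """
    CREATE TABLE IF NOT EXISTS eth_transfers (
        "block_number" BIGINT,
        "transaction_hash" TEXT,
        "value" DOUBLE PRECISION,
        "_meta_block_ranges" JSONB
    )
"""

# Assumed follow-up (not part of this diff): a GIN index so the reorg EXISTS queries
# over _meta_block_ranges don't have to scan every row.
index_sql = 'CREATE INDEX IF NOT EXISTS eth_transfers_meta_ranges_gin ON eth_transfers USING GIN ("_meta_block_ranges")'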
@@ -272,7 +291,7 @@ def get_table_schema(self, table_name: str) -> Optional[pa.Schema]:
         cur.execute(
             """
             SELECT column_name, data_type, is_nullable
-            FROM information_schema.columns 
+            FROM information_schema.columns
             WHERE table_name = %s
             ORDER BY ordinal_position
             """,
@@ -328,3 +347,70 @@ def _pg_type_to_arrow(self, pg_type: str) -> pa.DataType:
             return pa.decimal128(18, 6)  # Default precision/scale
 
         return type_mapping.get(pg_type, pa.string())  # Default to string
+
+    def _handle_reorg(self, invalidation_ranges: List[BlockRange], table_name: str) -> None:
+        """
+        Handle a blockchain reorganization by deleting affected rows using PostgreSQL JSONB operations.
+
+        In blockchain reorgs, if block N gets reorganized, ALL blocks >= N become invalid
+        because the chain has forked from that point. This method deletes all data
+        from the reorg point forward for each affected network, including ranges that
+        overlap the reorg point.
+
+        Args:
+            invalidation_ranges: List of block ranges to invalidate (reorg points)
+            table_name: The table containing the data to invalidate
+        """
+        if not invalidation_ranges:
+            return
+
+        conn = self.pool.getconn()
+        try:
+            with conn.cursor() as cur:
+                # Build the WHERE clause using JSONB operators for multi-network support.
+                # For blockchain reorgs: if a reorg starts at block N, delete all data that
+                # either starts >= N or overlaps N (range_end >= N).
+                where_conditions = []
+                params = []
+
+                for range_obj in invalidation_ranges:
+                    # Delete all data from the reorg point forward for this network.
+                    # Match rows whose JSONB array contains any range where:
+                    #   1. the network matches, and
+                    #   2. range end >= reorg start (catches both overlap and forward cases)
+                    where_conditions.append("""
+                        EXISTS (
+                            SELECT 1 FROM jsonb_array_elements("_meta_block_ranges") AS range_elem
+                            WHERE range_elem->>'network' = %s
+                            AND (range_elem->>'end')::int >= %s
+                        )
+                    """)
+                    params.extend(
+                        [
+                            range_obj.network,
+                            range_obj.start,  # Delete everything where range_end >= reorg_start
+                        ]
+                    )
+
+                # Combine conditions with OR (if any network has a reorg, delete the row)
+                where_clause = ' OR '.join(where_conditions)
+
+                # Execute the deletion
+                delete_sql = f'DELETE FROM {table_name} WHERE {where_clause}'
+
+                self.logger.info(
+                    f'Executing blockchain reorg deletion for {len(invalidation_ranges)} networks '
+                    f"in table '{table_name}'"
+                )
+                self.logger.debug(f'Delete SQL: {delete_sql} with params: {params}')
+
+                cur.execute(delete_sql, params)
+                deleted_rows = cur.rowcount
+                conn.commit()
+
+                self.logger.info(f"Blockchain reorg deleted {deleted_rows} rows from table '{table_name}'")
+
+        except Exception as e:
+            self.logger.error(f"Failed to handle blockchain reorg for table '{table_name}': {str(e)}")
+            raise
+        finally:
+            self.pool.putconn(conn)
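The tests below call loader._add_metadata_columns, which is defined elsewhere and not part of this diff. A plausible sketch of what it does, assuming BlockRange exposes network/start/end and that every row carries the batch's full range list as a JSON string for PostgreSQL to cast into the JSONB column; the helper name is real, everything else here is an assumption:

import json
from typing import List

import pyarrow as pa

from src.amp.streaming.types import BlockRange

def add_metadata_columns(batch: pa.RecordBatch, ranges: List[BlockRange]) -> pa.RecordBatch:
    """Sketch: append a _meta_block_ranges column holding the batch's block ranges."""
    ranges_json = json.dumps(
        [{'network': r.network, 'start': r.start, 'end': r.end} for r in ranges]
    )
    # Every row in the batch gets the same JSON payload; on load, PostgreSQL casts
    # the string into the JSONB column created by _create_table_from_schema.
    meta_col = pa.array([ranges_json] * batch.num_rows, type=pa.string())
    return pa.RecordBatch.from_arrays(
        batch.columns + [meta_col],
        names=batch.schema.names + ['_meta_block_ranges'],
    )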

tests/integration/test_postgresql_loader.py

Lines changed: 224 additions & 0 deletions
@@ -425,3 +425,227 @@ def test_large_data_loading(self, postgresql_test_config, test_table_name, cleanup_tables):
                     assert count == 50000
             finally:
                 loader.pool.putconn(conn)
+
+
+@pytest.mark.integration
+@pytest.mark.postgresql
+class TestPostgreSQLLoaderStreaming:
+    """Integration tests for PostgreSQL loader streaming functionality"""
+
+    def test_streaming_metadata_columns(self, postgresql_test_config, test_table_name, cleanup_tables):
+        """Test that streaming data creates tables with metadata columns"""
+        cleanup_tables.append(test_table_name)
+
+        # Import streaming types
+        from src.amp.streaming.types import BlockRange
+
+        # Create test data with metadata
+        data = {
+            'block_number': [100, 101, 102],
+            'transaction_hash': ['0xabc', '0xdef', '0x123'],
+            'value': [1.0, 2.0, 3.0],
+        }
+        batch = pa.RecordBatch.from_pydict(data)
+
+        # Create metadata with block ranges
+        block_ranges = [BlockRange(network='ethereum', start=100, end=102)]
+
+        loader = PostgreSQLLoader(postgresql_test_config)
+
+        with loader:
+            # Add metadata columns (simulating what load_stream_continuous does)
+            batch_with_metadata = loader._add_metadata_columns(batch, block_ranges)
+
+            # Load the batch
+            result = loader.load_batch(batch_with_metadata, test_table_name, create_table=True)
+            assert result.success == True
+            assert result.rows_loaded == 3
+
+            # Verify metadata columns were created in the table
+            conn = loader.pool.getconn()
+            try:
+                with conn.cursor() as cur:
+                    # Check table schema includes metadata columns
+                    cur.execute(
+                        """
+                        SELECT column_name, data_type
+                        FROM information_schema.columns
+                        WHERE table_name = %s
+                        ORDER BY ordinal_position
+                        """,
+                        (test_table_name,),
+                    )
+
+                    columns = cur.fetchall()
+                    column_names = [col[0] for col in columns]
+
+                    # Should have original columns plus metadata columns
+                    assert '_meta_block_ranges' in column_names
+
+                    # Verify metadata column types
+                    column_types = {col[0]: col[1] for col in columns}
+                    assert 'jsonb' in column_types['_meta_block_ranges'].lower()
+
+                    # Verify data was stored correctly
+                    cur.execute(f'SELECT "_meta_block_ranges" FROM {test_table_name} LIMIT 1')
+                    meta_row = cur.fetchone()
+
+                    # PostgreSQL JSONB automatically parses to Python objects
+                    ranges_data = meta_row[0]  # Already parsed by psycopg2
+                    assert len(ranges_data) == 1
+                    assert ranges_data[0]['network'] == 'ethereum'
+                    assert ranges_data[0]['start'] == 100
+                    assert ranges_data[0]['end'] == 102
+
+            finally:
+                loader.pool.putconn(conn)
+
+    def test_handle_reorg_deletion(self, postgresql_test_config, test_table_name, cleanup_tables):
+        """Test that _handle_reorg correctly deletes invalidated ranges"""
+        cleanup_tables.append(test_table_name)
+
+        from src.amp.streaming.types import BlockRange
+
+        loader = PostgreSQLLoader(postgresql_test_config)
+
+        with loader:
+            # Create table and load test data with multiple block ranges
+            data_batch1 = {
+                'tx_hash': ['0x100', '0x101', '0x102'],
+                'block_num': [100, 101, 102],
+                'value': [10.0, 11.0, 12.0],
+            }
+            batch1 = pa.RecordBatch.from_pydict(data_batch1)
+            ranges1 = [BlockRange(network='ethereum', start=100, end=102)]
+            batch1_with_meta = loader._add_metadata_columns(batch1, ranges1)
+
+            data_batch2 = {'tx_hash': ['0x200', '0x201'], 'block_num': [103, 104], 'value': [12.0, 33.0]}
+            batch2 = pa.RecordBatch.from_pydict(data_batch2)
+            ranges2 = [BlockRange(network='ethereum', start=103, end=104)]
+            batch2_with_meta = loader._add_metadata_columns(batch2, ranges2)
+
+            data_batch3 = {'tx_hash': ['0x300', '0x301'], 'block_num': [105, 106], 'value': [7.0, 9.0]}
+            batch3 = pa.RecordBatch.from_pydict(data_batch3)
+            ranges3 = [BlockRange(network='ethereum', start=105, end=106)]
+            batch3_with_meta = loader._add_metadata_columns(batch3, ranges3)
+
+            data_batch4 = {'tx_hash': ['0x400', '0x401'], 'block_num': [107, 108], 'value': [6.0, 73.0]}
+            batch4 = pa.RecordBatch.from_pydict(data_batch4)
+            ranges4 = [BlockRange(network='ethereum', start=107, end=108)]
+            batch4_with_meta = loader._add_metadata_columns(batch4, ranges4)
+
+            # Load all batches
+            result1 = loader.load_batch(batch1_with_meta, test_table_name, create_table=True)
+            result2 = loader.load_batch(batch2_with_meta, test_table_name, create_table=False)
+            result3 = loader.load_batch(batch3_with_meta, test_table_name, create_table=False)
+            result4 = loader.load_batch(batch4_with_meta, test_table_name, create_table=False)
+
+            assert all([result1.success, result2.success, result3.success, result4.success])
+
+            # Verify initial data count
+            conn = loader.pool.getconn()
+            try:
+                with conn.cursor() as cur:
+                    cur.execute(f'SELECT COUNT(*) FROM {test_table_name}')
+                    initial_count = cur.fetchone()[0]
+                    assert initial_count == 9  # 3 + 2 + 2 + 2
+
+                    # Test reorg deletion - invalidate blocks 104-108 on ethereum
+                    invalidation_ranges = [BlockRange(network='ethereum', start=104, end=108)]
+                    loader._handle_reorg(invalidation_ranges, test_table_name)
+
+                    # Should delete batch2, batch3 and batch4, leaving only the 3 rows from batch1
+                    cur.execute(f'SELECT COUNT(*) FROM {test_table_name}')
+                    after_reorg_count = cur.fetchone()[0]
+                    assert after_reorg_count == 3
+
+            finally:
+                loader.pool.putconn(conn)
+
+    def test_reorg_with_overlapping_ranges(self, postgresql_test_config, test_table_name, cleanup_tables):
+        """Test reorg deletion with overlapping block ranges"""
+        cleanup_tables.append(test_table_name)
+
+        from src.amp.streaming.types import BlockRange
+
+        loader = PostgreSQLLoader(postgresql_test_config)
+
+        with loader:
+            # Load data with overlapping ranges that should be invalidated
+            data = {'tx_hash': ['0x150', '0x175', '0x250'], 'block_num': [150, 175, 250], 'value': [15.0, 17.5, 25.0]}
+            batch = pa.RecordBatch.from_pydict(data)
+            ranges = [BlockRange(network='ethereum', start=150, end=175)]
+            batch_with_meta = loader._add_metadata_columns(batch, ranges)
+
+            result = loader.load_batch(batch_with_meta, test_table_name, create_table=True)
+            assert result.success == True
+
+            conn = loader.pool.getconn()
+            try:
+                with conn.cursor() as cur:
+                    # Verify initial data
+                    cur.execute(f'SELECT COUNT(*) FROM {test_table_name}')
+                    assert cur.fetchone()[0] == 3
+
+                    # Test partial overlap invalidation (160-180)
+                    # This should invalidate our range [150, 175] because they overlap
+                    invalidation_ranges = [BlockRange(network='ethereum', start=160, end=180)]
+                    loader._handle_reorg(invalidation_ranges, test_table_name)
+
+                    # All data should be deleted due to overlap
+                    cur.execute(f'SELECT COUNT(*) FROM {test_table_name}')
+                    assert cur.fetchone()[0] == 0
+
+            finally:
+                loader.pool.putconn(conn)
+
+    def test_reorg_preserves_different_networks(self, postgresql_test_config, test_table_name, cleanup_tables):
+        """Test that reorg only affects the specified network"""
+        cleanup_tables.append(test_table_name)
+
+        from src.amp.streaming.types import BlockRange
+
+        loader = PostgreSQLLoader(postgresql_test_config)
+
+        with loader:
+            # Load data from multiple networks with the same block ranges
+            data_eth = {'tx_hash': ['0x100_eth'], 'network_id': ['ethereum'], 'block_num': [100], 'value': [10.0]}
+            batch_eth = pa.RecordBatch.from_pydict(data_eth)
+            ranges_eth = [BlockRange(network='ethereum', start=100, end=100)]
+            batch_eth_with_meta = loader._add_metadata_columns(batch_eth, ranges_eth)
+
+            data_poly = {'tx_hash': ['0x100_poly'], 'network_id': ['polygon'], 'block_num': [100], 'value': [10.0]}
+            batch_poly = pa.RecordBatch.from_pydict(data_poly)
+            ranges_poly = [BlockRange(network='polygon', start=100, end=100)]
+            batch_poly_with_meta = loader._add_metadata_columns(batch_poly, ranges_poly)
+
+            # Load both batches
+            result1 = loader.load_batch(batch_eth_with_meta, test_table_name, create_table=True)
+            result2 = loader.load_batch(batch_poly_with_meta, test_table_name, create_table=False)
+
+            assert result1.success and result2.success
+
+            conn = loader.pool.getconn()
+            try:
+                with conn.cursor() as cur:
+                    # Verify both networks' data exists
+                    cur.execute(f'SELECT COUNT(*) FROM {test_table_name}')
+                    assert cur.fetchone()[0] == 2
+
+                    # Invalidate only the ethereum network
+                    invalidation_ranges = [BlockRange(network='ethereum', start=100, end=100)]
+                    loader._handle_reorg(invalidation_ranges, test_table_name)
+
+                    # Should only delete ethereum data; polygon should remain
+                    cur.execute(f'SELECT COUNT(*) FROM {test_table_name}')
+                    assert cur.fetchone()[0] == 1
+
+                    # Verify remaining data is from polygon
+                    cur.execute(f'SELECT "_meta_block_ranges" FROM {test_table_name}')
+                    # PostgreSQL JSONB is already parsed to Python objects by psycopg2
+                    ranges_data = cur.fetchone()[0]
+                    assert ranges_data[0]['network'] == 'polygon'
+
+            finally:
+                loader.pool.putconn(conn)
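To make the deletion semantics concrete: for the single invalidation range in test_handle_reorg_deletion (ethereum, reorg at block 104), _handle_reorg builds one EXISTS condition, so the statement it executes is effectively the following. The table name is hypothetical and the bound parameters are inlined as literals for readability:

# Effective SQL for invalidation_ranges = [BlockRange(network='ethereum', start=104, end=108)].
# The real code binds 'ethereum' and 104 as parameters; note the range's `end` is unused,
# since a reorg at block N invalidates everything from N forward.
delete_sql = """
    DELETE FROM test_table WHERE
        EXISTS (
            SELECT 1 FROM jsonb_array_elements("_meta_block_ranges") AS range_elem
            WHERE range_elem->>'network' = 'ethereum'
            AND (range_elem->>'end')::int >= 104
        )
"""
# Rows tagged [100, 102] survive (102 < 104); rows whose tagged ranges end at or after
# block 104 are deleted, which is why only batch1's three rows remain.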
