diff --git a/.github/workflows/ci-monorepo.yml b/.github/workflows/ci-monorepo.yml index a37ecd2..9c30edb 100644 --- a/.github/workflows/ci-monorepo.yml +++ b/.github/workflows/ci-monorepo.yml @@ -209,7 +209,7 @@ jobs: - name: "BDD Tests" command: "pytest tests/bdd -v" - name: "Example App" - command: "cd ../../examples/fastapi_app && pytest tests/ -v" + command: "cd examples/fastapi_app && pytest tests/ -v" services: cassandra: diff --git a/libs/async-cassandra-bulk/examples/Makefile b/libs/async-cassandra-bulk/examples/Makefile deleted file mode 100644 index 2f2a0e7..0000000 --- a/libs/async-cassandra-bulk/examples/Makefile +++ /dev/null @@ -1,121 +0,0 @@ -.PHONY: help install dev-install test test-unit test-integration lint format type-check clean docker-up docker-down run-example - -# Default target -.DEFAULT_GOAL := help - -help: ## Show this help message - @echo "Available commands:" - @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-20s\033[0m %s\n", $$1, $$2}' - -install: ## Install production dependencies - pip install -e . - -dev-install: ## Install development dependencies - pip install -e ".[dev]" - -test: ## Run all tests - pytest -v - -test-unit: ## Run unit tests only - pytest -v -m unit - -test-integration: ## Run integration tests (requires Cassandra cluster) - ./run_integration_tests.sh - -test-integration-only: ## Run integration tests without managing cluster - pytest -v -m integration - -test-slow: ## Run slow tests - pytest -v -m slow - -lint: ## Run linting checks - ruff check . - black --check . - -format: ## Format code - black . - ruff check --fix . - -type-check: ## Run type checking - mypy bulk_operations tests - -clean: ## Clean up generated files - rm -rf build/ dist/ *.egg-info/ - rm -rf .pytest_cache/ .coverage htmlcov/ - rm -rf iceberg_warehouse/ - find . -type d -name __pycache__ -exec rm -rf {} + - find . -type f -name "*.pyc" -delete - -# Container runtime detection -CONTAINER_RUNTIME ?= $(shell which docker >/dev/null 2>&1 && echo docker || which podman >/dev/null 2>&1 && echo podman) -ifeq ($(CONTAINER_RUNTIME),podman) - COMPOSE_CMD = podman-compose -else - COMPOSE_CMD = docker-compose -endif - -docker-up: ## Start 3-node Cassandra cluster - $(COMPOSE_CMD) up -d - @echo "Waiting for Cassandra cluster to be ready..." - @sleep 30 - @$(CONTAINER_RUNTIME) exec cassandra-1 cqlsh -e "DESCRIBE CLUSTER" || (echo "Cluster not ready, waiting more..." && sleep 30) - @echo "Cassandra cluster is ready!" - -docker-down: ## Stop and remove Cassandra cluster - $(COMPOSE_CMD) down -v - -docker-logs: ## Show Cassandra logs - $(COMPOSE_CMD) logs -f - -# Cassandra cluster management -cassandra-up: ## Start 3-node Cassandra cluster - $(COMPOSE_CMD) up -d - -cassandra-down: ## Stop and remove Cassandra cluster - $(COMPOSE_CMD) down -v - -cassandra-wait: ## Wait for Cassandra to be ready - @echo "Waiting for Cassandra cluster to be ready..." - @for i in {1..30}; do \ - if $(CONTAINER_RUNTIME) exec bulk-cassandra-1 cqlsh -e "SELECT now() FROM system.local" >/dev/null 2>&1; then \ - echo "Cassandra is ready!"; \ - break; \ - fi; \ - echo "Waiting for Cassandra... ($$i/30)"; \ - sleep 5; \ - done - -cassandra-logs: ## Show Cassandra logs - $(COMPOSE_CMD) logs -f - -# Example commands -example-count: ## Run bulk count example - @echo "Running bulk count example..." 
- python example_count.py - -example-export: ## Run export to Iceberg example (not yet implemented) - @echo "Export example not yet implemented" - # python example_export.py - -example-import: ## Run import from Iceberg example (not yet implemented) - @echo "Import example not yet implemented" - # python example_import.py - -# Quick demo -demo: cassandra-up cassandra-wait example-count ## Run quick demo with count example - -# Development workflow -dev-setup: dev-install docker-up ## Complete development setup - -ci: lint type-check test-unit ## Run CI checks (no integration tests) - -# Vnode validation -validate-vnodes: cassandra-up cassandra-wait ## Validate vnode token distribution - @echo "Checking vnode configuration..." - @$(CONTAINER_RUNTIME) exec bulk-cassandra-1 nodetool info | grep "Token" - @echo "" - @echo "Token ownership by node:" - @$(CONTAINER_RUNTIME) exec bulk-cassandra-1 nodetool ring | grep "^[0-9]" | awk '{print $$8}' | sort | uniq -c - @echo "" - @echo "Sample token ranges (first 10):" - @$(CONTAINER_RUNTIME) exec bulk-cassandra-1 nodetool describering test 2>/dev/null | grep "TokenRange" | head -10 || echo "Create test keyspace first" diff --git a/libs/async-cassandra-bulk/examples/README.md b/libs/async-cassandra-bulk/examples/README.md deleted file mode 100644 index 8399851..0000000 --- a/libs/async-cassandra-bulk/examples/README.md +++ /dev/null @@ -1,225 +0,0 @@ -# Token-Aware Bulk Operations Example - -This example demonstrates how to perform efficient bulk operations on Apache Cassandra using token-aware parallel processing, similar to DataStax Bulk Loader (DSBulk). - -## ๐Ÿš€ Features - -- **Token-aware operations**: Leverages Cassandra's token ring for parallel processing -- **Streaming exports**: Memory-efficient data export using async generators -- **Progress tracking**: Real-time progress updates during operations -- **Multi-node support**: Automatically distributes work across cluster nodes -- **Multiple export formats**: CSV, JSON, and Parquet with compression support โœ… -- **Apache Iceberg integration**: Export Cassandra data to the modern lakehouse format (coming in Phase 3) - -## ๐Ÿ“‹ Prerequisites - -- Python 3.12+ -- Docker or Podman (for running Cassandra) -- 30GB+ free disk space (for 3-node cluster) -- 32GB+ RAM recommended - -## ๐Ÿ› ๏ธ Installation - -1. **Install the example with dependencies:** - ```bash - pip install -e . - ``` - -2. **Install development dependencies (optional):** - ```bash - make dev-install - ``` - -## ๐ŸŽฏ Quick Start - -1. **Start a 3-node Cassandra cluster:** - ```bash - make cassandra-up - make cassandra-wait - ``` - -2. **Run the bulk count demo:** - ```bash - make demo - ``` - -3. 
**Stop the cluster when done:** - ```bash - make cassandra-down - ``` - -## ๐Ÿ“– Examples - -### Basic Bulk Count - -Count all rows in a table using token-aware parallel processing: - -```python -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - -async with AsyncCluster(['localhost']) as cluster: - async with cluster.connect() as session: - operator = TokenAwareBulkOperator(session) - - # Count with automatic parallelism - count = await operator.count_by_token_ranges( - keyspace="my_keyspace", - table="my_table" - ) - print(f"Total rows: {count:,}") -``` - -### Count with Progress Tracking - -```python -def progress_callback(stats): - print(f"Progress: {stats.progress_percentage:.1f}% " - f"({stats.rows_processed:,} rows, " - f"{stats.rows_per_second:,.0f} rows/sec)") - -count, stats = await operator.count_by_token_ranges_with_stats( - keyspace="my_keyspace", - table="my_table", - split_count=32, # Use 32 parallel ranges - progress_callback=progress_callback -) -``` - -### Streaming Export - -Export large tables without loading everything into memory: - -```python -async for row in operator.export_by_token_ranges( - keyspace="my_keyspace", - table="my_table", - split_count=16 -): - # Process each row as it arrives - process_row(row) -``` - -## ๐Ÿ—๏ธ Architecture - -### Token Range Discovery -The operator discovers natural token ranges from the cluster topology and can further split them for increased parallelism. - -### Parallel Execution -Multiple token ranges are queried concurrently, with configurable parallelism limits to prevent overwhelming the cluster. - -### Streaming Results -Data is streamed using async generators, ensuring constant memory usage regardless of dataset size. - -## ๐Ÿงช Testing - -Run the test suite: - -```bash -# Unit tests only -make test-unit - -# All tests (requires running Cassandra) -make test - -# With coverage report -pytest --cov=bulk_operations --cov-report=html -``` - -## ๐Ÿ”ง Configuration - -### Split Count -Controls the number of token ranges to process in parallel: -- **Default**: 4 ร— number of nodes -- **Higher values**: More parallelism, higher resource usage -- **Lower values**: Less parallelism, more stable - -### Parallelism -Controls concurrent query execution: -- **Default**: 2 ร— number of nodes -- **Adjust based on**: Cluster capacity, network bandwidth - -## ๐Ÿ“Š Performance - -Example performance on a 3-node cluster: - -| Operation | Rows | Split Count | Time | Rate | -|-----------|------|-------------|------|------| -| Count | 1M | 1 | 45s | 22K/s | -| Count | 1M | 8 | 12s | 83K/s | -| Count | 1M | 32 | 6s | 167K/s | -| Export | 10M | 16 | 120s | 83K/s | - -## ๐ŸŽ“ How It Works - -1. **Token Range Discovery** - - Query cluster metadata for natural token ranges - - Each range has start/end tokens and replica nodes - - With vnodes (256 per node), expect ~768 ranges in a 3-node cluster - -2. **Range Splitting** - - Split ranges proportionally based on size - - Larger ranges get more splits for balance - - Small vnode ranges may not split further - -3. **Parallel Execution** - - Execute queries for each range concurrently - - Use semaphore to limit parallelism - - Queries use `token()` function: `WHERE token(pk) > X AND token(pk) <= Y` - -4. **Result Aggregation** - - Stream results as they arrive - - Track progress and statistics - - No duplicates due to exclusive range boundaries - -## ๐Ÿ” Understanding Vnodes - -Our test cluster uses 256 virtual nodes (vnodes) per physical node. 
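As a rough sanity check, the "~768 ranges" figure quoted above falls straight out of the cluster shape. The sketch below is purely illustrative; the node and vnode counts are the ones assumed by this example setup, not values read from a live cluster:

```python
# Back-of-the-envelope estimate of natural token ranges in the demo cluster.
nodes = 3                # docker-compose cluster size used by these examples
vnodes_per_node = 256    # num_tokens assumed in cassandra.yaml
natural_ranges = nodes * vnodes_per_node
print(natural_ranges)    # 768 natural ranges before split_count adds further splits
```

The 256-vnode-per-node setting is what drives that count.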
This means: - -- Each physical node owns 256 non-contiguous token ranges -- Token ownership is distributed evenly across the ring -- Smaller ranges mean better load distribution but more metadata - -To visualize token distribution: -```bash -python visualize_tokens.py -``` - -To validate vnodes configuration: -```bash -make validate-vnodes -``` - -## ๐Ÿงช Integration Testing - -The integration tests validate our token handling against a real Cassandra cluster: - -```bash -# Run all integration tests with cluster management -make test-integration - -# Run integration tests only (cluster must be running) -make test-integration-only -``` - -Key integration tests: -- **Token range discovery**: Validates all vnodes are discovered -- **Nodetool comparison**: Compares with `nodetool describering` output -- **Data coverage**: Ensures no rows are missed or duplicated -- **Performance scaling**: Verifies parallel execution benefits - -## ๐Ÿ“š References - -- [DataStax Bulk Loader (DSBulk)](https://docs.datastax.com/en/dsbulk/docs/) -- [Cassandra Token Ranges](https://cassandra.apache.org/doc/latest/cassandra/architecture/dynamo.html#consistent-hashing-using-a-token-ring) -- [Apache Iceberg](https://iceberg.apache.org/) - -## โš ๏ธ Important Notes - -1. **Memory Usage**: While streaming reduces memory usage, the thread pool and connection pool still consume resources - -2. **Network Bandwidth**: Bulk operations can saturate network links. Monitor and adjust parallelism accordingly. - -3. **Cluster Impact**: High parallelism can impact cluster performance. Test in non-production first. - -4. **Token Ranges**: The implementation assumes Murmur3Partitioner (Cassandra default). diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/__init__.py b/libs/async-cassandra-bulk/examples/bulk_operations/__init__.py deleted file mode 100644 index 467d6d5..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -""" -Token-aware bulk operations for Apache Cassandra using async-cassandra. - -This package provides efficient, parallel bulk operations by leveraging -Cassandra's token ranges for data distribution. -""" - -__version__ = "0.1.0" - -from .bulk_operator import BulkOperationStats, TokenAwareBulkOperator -from .token_utils import TokenRange, TokenRangeSplitter - -__all__ = [ - "TokenAwareBulkOperator", - "BulkOperationStats", - "TokenRange", - "TokenRangeSplitter", -] diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/bulk_operator.py b/libs/async-cassandra-bulk/examples/bulk_operations/bulk_operator.py deleted file mode 100644 index 2d502cb..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/bulk_operator.py +++ /dev/null @@ -1,566 +0,0 @@ -""" -Token-aware bulk operator for parallel Cassandra operations. 
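A minimal sketch of the two helpers this module is built on, `discover_token_ranges` and `TokenRangeSplitter.split_proportionally`; the import path and the `session` argument (a connected `AsyncCassandraSession`) are assumptions based on how this package is laid out:

```python
# Hedged sketch of the range-discovery step that every bulk operation starts with.
from bulk_operations.token_utils import TokenRangeSplitter, discover_token_ranges


async def preview_splits(session, keyspace: str, split_count: int = 32) -> None:
    ranges = await discover_token_ranges(session, keyspace)          # natural vnode ranges
    splits = TokenRangeSplitter().split_proportionally(ranges, split_count)
    print(f"{len(ranges)} natural ranges -> {len(splits)} query splits")
```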
-""" - -import asyncio -import time -from collections.abc import AsyncIterator, Callable -from pathlib import Path -from typing import Any - -from cassandra import ConsistencyLevel - -from async_cassandra import AsyncCassandraSession - -from .parallel_export import export_by_token_ranges_parallel -from .stats import BulkOperationStats -from .token_utils import TokenRange, TokenRangeSplitter, discover_token_ranges - - -class BulkOperationError(Exception): - """Error during bulk operation.""" - - def __init__( - self, message: str, partial_result: Any = None, errors: list[Exception] | None = None - ): - super().__init__(message) - self.partial_result = partial_result - self.errors = errors or [] - - -class TokenAwareBulkOperator: - """Performs bulk operations using token ranges for parallelism. - - This class uses prepared statements for all token range queries to: - - Improve performance through query plan caching - - Provide protection against injection attacks - - Ensure type safety and validation - - Follow Cassandra best practices - - Token range boundaries are passed as parameters to prepared statements, - not embedded in the query string. - """ - - def __init__(self, session: AsyncCassandraSession): - self.session = session - self.splitter = TokenRangeSplitter() - self._prepared_statements: dict[str, dict[str, Any]] = {} - - async def _get_prepared_statements( - self, keyspace: str, table: str, partition_keys: list[str] - ) -> dict[str, Any]: - """Get or prepare statements for token range queries.""" - pk_list = ", ".join(partition_keys) - key = f"{keyspace}.{table}" - - if key not in self._prepared_statements: - # Prepare all the statements we need for this table - self._prepared_statements[key] = { - "count_range": await self.session.prepare( - f""" - SELECT COUNT(*) FROM {keyspace}.{table} - WHERE token({pk_list}) > ? - AND token({pk_list}) <= ? - """ - ), - "count_wraparound_gt": await self.session.prepare( - f""" - SELECT COUNT(*) FROM {keyspace}.{table} - WHERE token({pk_list}) > ? - """ - ), - "count_wraparound_lte": await self.session.prepare( - f""" - SELECT COUNT(*) FROM {keyspace}.{table} - WHERE token({pk_list}) <= ? - """ - ), - "select_range": await self.session.prepare( - f""" - SELECT * FROM {keyspace}.{table} - WHERE token({pk_list}) > ? - AND token({pk_list}) <= ? - """ - ), - "select_wraparound_gt": await self.session.prepare( - f""" - SELECT * FROM {keyspace}.{table} - WHERE token({pk_list}) > ? - """ - ), - "select_wraparound_lte": await self.session.prepare( - f""" - SELECT * FROM {keyspace}.{table} - WHERE token({pk_list}) <= ? - """ - ), - } - - return self._prepared_statements[key] - - async def count_by_token_ranges( - self, - keyspace: str, - table: str, - split_count: int | None = None, - parallelism: int | None = None, - progress_callback: Callable[[BulkOperationStats], None] | None = None, - consistency_level: ConsistencyLevel | None = None, - ) -> int: - """Count all rows in a table using parallel token range queries. - - Args: - keyspace: The keyspace name. - table: The table name. - split_count: Number of token range splits (default: 4 * number of nodes). - parallelism: Max concurrent operations (default: 2 * number of nodes). - progress_callback: Optional callback for progress updates. - consistency_level: Consistency level for queries (default: None, uses driver default). - - Returns: - Total row count. 
- """ - count, _ = await self.count_by_token_ranges_with_stats( - keyspace=keyspace, - table=table, - split_count=split_count, - parallelism=parallelism, - progress_callback=progress_callback, - consistency_level=consistency_level, - ) - return count - - async def count_by_token_ranges_with_stats( - self, - keyspace: str, - table: str, - split_count: int | None = None, - parallelism: int | None = None, - progress_callback: Callable[[BulkOperationStats], None] | None = None, - consistency_level: ConsistencyLevel | None = None, - ) -> tuple[int, BulkOperationStats]: - """Count all rows and return statistics.""" - # Get table metadata - table_meta = await self._get_table_metadata(keyspace, table) - partition_keys = [col.name for col in table_meta.partition_key] - - # Discover and split token ranges - ranges = await discover_token_ranges(self.session, keyspace) - - if split_count is None: - # Default: 4 splits per node - split_count = len(self.session._session.cluster.contact_points) * 4 - - splits = self.splitter.split_proportionally(ranges, split_count) - - # Initialize stats - stats = BulkOperationStats(total_ranges=len(splits)) - - # Determine parallelism - if parallelism is None: - parallelism = min(len(splits), len(self.session._session.cluster.contact_points) * 2) - - # Get prepared statements for this table - prepared_stmts = await self._get_prepared_statements(keyspace, table, partition_keys) - - # Create count tasks - semaphore = asyncio.Semaphore(parallelism) - tasks = [] - - for split in splits: - task = self._count_range( - keyspace, - table, - partition_keys, - split, - semaphore, - stats, - progress_callback, - prepared_stmts, - consistency_level, - ) - tasks.append(task) - - # Execute all tasks - results = await asyncio.gather(*tasks, return_exceptions=True) - - # Process results - total_count = 0 - for result in results: - if isinstance(result, Exception): - stats.errors.append(result) - else: - total_count += int(result) - - stats.end_time = time.time() - - if stats.errors: - raise BulkOperationError( - f"Failed to count all ranges: {len(stats.errors)} errors", - partial_result=total_count, - errors=stats.errors, - ) - - return total_count, stats - - async def _count_range( - self, - keyspace: str, - table: str, - partition_keys: list[str], - token_range: TokenRange, - semaphore: asyncio.Semaphore, - stats: BulkOperationStats, - progress_callback: Callable[[BulkOperationStats], None] | None, - prepared_stmts: dict[str, Any], - consistency_level: ConsistencyLevel | None, - ) -> int: - """Count rows in a single token range.""" - async with semaphore: - # Check if this is a wraparound range - if token_range.end < token_range.start: - # Wraparound range needs to be split into two queries - # First part: from start to MAX_TOKEN - stmt = prepared_stmts["count_wraparound_gt"] - if consistency_level is not None: - stmt.consistency_level = consistency_level - result1 = await self.session.execute(stmt, (token_range.start,)) - row1 = result1.one() - count1 = row1.count if row1 else 0 - - # Second part: from MIN_TOKEN to end - stmt = prepared_stmts["count_wraparound_lte"] - if consistency_level is not None: - stmt.consistency_level = consistency_level - result2 = await self.session.execute(stmt, (token_range.end,)) - row2 = result2.one() - count2 = row2.count if row2 else 0 - - count = count1 + count2 - else: - # Normal range - use prepared statement - stmt = prepared_stmts["count_range"] - if consistency_level is not None: - stmt.consistency_level = consistency_level - result = await 
self.session.execute(stmt, (token_range.start, token_range.end)) - row = result.one() - count = row.count if row else 0 - - # Update stats - stats.rows_processed += count - stats.ranges_completed += 1 - - # Call progress callback if provided - if progress_callback: - progress_callback(stats) - - return int(count) - - async def export_by_token_ranges( - self, - keyspace: str, - table: str, - split_count: int | None = None, - parallelism: int | None = None, - progress_callback: Callable[[BulkOperationStats], None] | None = None, - consistency_level: ConsistencyLevel | None = None, - ) -> AsyncIterator[Any]: - """Export all rows from a table by streaming token ranges in parallel. - - This method uses parallel queries to stream data from multiple token ranges - concurrently, providing high performance for large table exports. - - Args: - keyspace: The keyspace name. - table: The table name. - split_count: Number of token range splits (default: 4 * number of nodes). - parallelism: Max concurrent queries (default: 2 * number of nodes). - progress_callback: Optional callback for progress updates. - consistency_level: Consistency level for queries (default: None, uses driver default). - - Yields: - Row data from the table, streamed as results arrive from parallel queries. - """ - # Get table metadata - table_meta = await self._get_table_metadata(keyspace, table) - partition_keys = [col.name for col in table_meta.partition_key] - - # Discover and split token ranges - ranges = await discover_token_ranges(self.session, keyspace) - - if split_count is None: - split_count = len(self.session._session.cluster.contact_points) * 4 - - splits = self.splitter.split_proportionally(ranges, split_count) - - # Determine parallelism - if parallelism is None: - parallelism = min(len(splits), len(self.session._session.cluster.contact_points) * 2) - - # Initialize stats - stats = BulkOperationStats(total_ranges=len(splits)) - - # Get prepared statements for this table - prepared_stmts = await self._get_prepared_statements(keyspace, table, partition_keys) - - # Use parallel export - async for row in export_by_token_ranges_parallel( - operator=self, - keyspace=keyspace, - table=table, - splits=splits, - prepared_stmts=prepared_stmts, - parallelism=parallelism, - consistency_level=consistency_level, - stats=stats, - progress_callback=progress_callback, - ): - yield row - - stats.end_time = time.time() - - async def import_from_iceberg( - self, - iceberg_warehouse_path: str, - iceberg_table: str, - target_keyspace: str, - target_table: str, - parallelism: int | None = None, - batch_size: int = 1000, - progress_callback: Callable[[BulkOperationStats], None] | None = None, - ) -> BulkOperationStats: - """Import data from Iceberg to Cassandra.""" - # This will be implemented when we add Iceberg integration - raise NotImplementedError("Iceberg import will be implemented in next phase") - - async def _get_table_metadata(self, keyspace: str, table: str) -> Any: - """Get table metadata from cluster.""" - metadata = self.session._session.cluster.metadata - - if keyspace not in metadata.keyspaces: - raise ValueError(f"Keyspace '{keyspace}' not found") - - keyspace_meta = metadata.keyspaces[keyspace] - - if table not in keyspace_meta.tables: - raise ValueError(f"Table '{table}' not found in keyspace '{keyspace}'") - - return keyspace_meta.tables[table] - - async def export_to_csv( - self, - keyspace: str, - table: str, - output_path: str | Path, - columns: list[str] | None = None, - delimiter: str = ",", - null_string: str = "", 
- compression: str | None = None, - split_count: int | None = None, - parallelism: int | None = None, - progress_callback: Callable[[Any], Any] | None = None, - consistency_level: ConsistencyLevel | None = None, - ) -> Any: - """Export table to CSV format. - - Args: - keyspace: Keyspace name - table: Table name - output_path: Output file path - columns: Columns to export (None for all) - delimiter: CSV delimiter - null_string: String to represent NULL values - compression: Compression type (gzip, bz2, lz4) - split_count: Number of token range splits - parallelism: Max concurrent operations - progress_callback: Progress callback function - consistency_level: Consistency level for queries - - Returns: - ExportProgress object - """ - from .exporters import CSVExporter - - exporter = CSVExporter( - self, - delimiter=delimiter, - null_string=null_string, - compression=compression, - ) - - return await exporter.export( - keyspace=keyspace, - table=table, - output_path=Path(output_path), - columns=columns, - split_count=split_count, - parallelism=parallelism, - progress_callback=progress_callback, - consistency_level=consistency_level, - ) - - async def export_to_json( - self, - keyspace: str, - table: str, - output_path: str | Path, - columns: list[str] | None = None, - format_mode: str = "jsonl", - indent: int | None = None, - compression: str | None = None, - split_count: int | None = None, - parallelism: int | None = None, - progress_callback: Callable[[Any], Any] | None = None, - consistency_level: ConsistencyLevel | None = None, - ) -> Any: - """Export table to JSON format. - - Args: - keyspace: Keyspace name - table: Table name - output_path: Output file path - columns: Columns to export (None for all) - format_mode: 'jsonl' (line-delimited) or 'array' - indent: JSON indentation - compression: Compression type (gzip, bz2, lz4) - split_count: Number of token range splits - parallelism: Max concurrent operations - progress_callback: Progress callback function - consistency_level: Consistency level for queries - - Returns: - ExportProgress object - """ - from .exporters import JSONExporter - - exporter = JSONExporter( - self, - format_mode=format_mode, - indent=indent, - compression=compression, - ) - - return await exporter.export( - keyspace=keyspace, - table=table, - output_path=Path(output_path), - columns=columns, - split_count=split_count, - parallelism=parallelism, - progress_callback=progress_callback, - consistency_level=consistency_level, - ) - - async def export_to_parquet( - self, - keyspace: str, - table: str, - output_path: str | Path, - columns: list[str] | None = None, - compression: str = "snappy", - row_group_size: int = 50000, - split_count: int | None = None, - parallelism: int | None = None, - progress_callback: Callable[[Any], Any] | None = None, - consistency_level: ConsistencyLevel | None = None, - ) -> Any: - """Export table to Parquet format. 
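A minimal usage sketch covering the three file-format entry points defined here; the keyspace, table, and output paths are placeholders, and `operator` is assumed to be a `TokenAwareBulkOperator` bound to a connected session:

```python
# Hedged sketch: one table exported to CSV, JSON Lines, and Parquet.
async def export_all(operator) -> None:
    await operator.export_to_csv(
        keyspace="my_keyspace", table="my_table",
        output_path="exports/my_table.csv", compression="gzip",
    )
    await operator.export_to_json(
        keyspace="my_keyspace", table="my_table",
        output_path="exports/my_table.jsonl", format_mode="jsonl",
    )
    progress = await operator.export_to_parquet(
        keyspace="my_keyspace", table="my_table",
        output_path="exports/my_table.parquet",
        compression="snappy", row_group_size=50000,
    )
    print(f"{progress.rows_exported:,} rows / {progress.bytes_written:,} bytes written")
```

The parameters accepted by the Parquet variant are listed below.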
- - Args: - keyspace: Keyspace name - table: Table name - output_path: Output file path - columns: Columns to export (None for all) - compression: Parquet compression (snappy, gzip, brotli, lz4, zstd) - row_group_size: Rows per row group - split_count: Number of token range splits - parallelism: Max concurrent operations - progress_callback: Progress callback function - - Returns: - ExportProgress object - """ - from .exporters import ParquetExporter - - exporter = ParquetExporter( - self, - compression=compression, - row_group_size=row_group_size, - ) - - return await exporter.export( - keyspace=keyspace, - table=table, - output_path=Path(output_path), - columns=columns, - split_count=split_count, - parallelism=parallelism, - progress_callback=progress_callback, - consistency_level=consistency_level, - ) - - async def export_to_iceberg( - self, - keyspace: str, - table: str, - namespace: str | None = None, - table_name: str | None = None, - catalog: Any | None = None, - catalog_config: dict[str, Any] | None = None, - warehouse_path: str | Path | None = None, - partition_spec: Any | None = None, - table_properties: dict[str, str] | None = None, - compression: str = "snappy", - row_group_size: int = 100000, - columns: list[str] | None = None, - split_count: int | None = None, - parallelism: int | None = None, - progress_callback: Any | None = None, - ) -> Any: - """Export table data to Apache Iceberg format. - - This enables modern data lakehouse features like ACID transactions, - time travel, and schema evolution. - - Args: - keyspace: Cassandra keyspace to export from - table: Cassandra table to export - namespace: Iceberg namespace (default: keyspace name) - table_name: Iceberg table name (default: Cassandra table name) - catalog: Pre-configured Iceberg catalog (optional) - catalog_config: Custom catalog configuration (optional) - warehouse_path: Path to Iceberg warehouse (for filesystem catalog) - partition_spec: Iceberg partition specification - table_properties: Additional Iceberg table properties - compression: Parquet compression (default: snappy) - row_group_size: Rows per Parquet file (default: 100000) - columns: Columns to export (default: all) - split_count: Number of token range splits - parallelism: Max concurrent operations - progress_callback: Progress callback function - - Returns: - ExportProgress with Iceberg metadata - """ - from .iceberg import IcebergExporter - - exporter = IcebergExporter( - self, - catalog=catalog, - catalog_config=catalog_config, - warehouse_path=warehouse_path, - compression=compression, - row_group_size=row_group_size, - ) - return await exporter.export( - keyspace=keyspace, - table=table, - namespace=namespace, - table_name=table_name, - partition_spec=partition_spec, - table_properties=table_properties, - columns=columns, - split_count=split_count, - parallelism=parallelism, - progress_callback=progress_callback, - ) diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/__init__.py b/libs/async-cassandra-bulk/examples/bulk_operations/exporters/__init__.py deleted file mode 100644 index 6053593..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Export format implementations for bulk operations.""" - -from .base import Exporter, ExportFormat, ExportProgress -from .csv_exporter import CSVExporter -from .json_exporter import JSONExporter -from .parquet_exporter import ParquetExporter - -__all__ = [ - "ExportFormat", - "Exporter", - "ExportProgress", - 
"CSVExporter", - "JSONExporter", - "ParquetExporter", -] diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/base.py b/libs/async-cassandra-bulk/examples/bulk_operations/exporters/base.py deleted file mode 100644 index 015d629..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/base.py +++ /dev/null @@ -1,229 +0,0 @@ -"""Base classes for export format implementations.""" - -import asyncio -import json -from abc import ABC, abstractmethod -from dataclasses import dataclass, field -from datetime import datetime -from enum import Enum -from pathlib import Path -from typing import Any - -from cassandra.util import OrderedMap, OrderedMapSerializedKey - -from bulk_operations.bulk_operator import TokenAwareBulkOperator - - -class ExportFormat(Enum): - """Supported export formats.""" - - CSV = "csv" - JSON = "json" - PARQUET = "parquet" - ICEBERG = "iceberg" - - -@dataclass -class ExportProgress: - """Tracks export progress for resume capability.""" - - export_id: str - keyspace: str - table: str - format: ExportFormat - output_path: str - started_at: datetime - completed_at: datetime | None = None - total_ranges: int = 0 - completed_ranges: list[tuple[int, int]] = field(default_factory=list) - rows_exported: int = 0 - bytes_written: int = 0 - errors: list[dict[str, Any]] = field(default_factory=list) - metadata: dict[str, Any] = field(default_factory=dict) - - def to_json(self) -> str: - """Serialize progress to JSON.""" - data = { - "export_id": self.export_id, - "keyspace": self.keyspace, - "table": self.table, - "format": self.format.value, - "output_path": self.output_path, - "started_at": self.started_at.isoformat(), - "completed_at": self.completed_at.isoformat() if self.completed_at else None, - "total_ranges": self.total_ranges, - "completed_ranges": self.completed_ranges, - "rows_exported": self.rows_exported, - "bytes_written": self.bytes_written, - "errors": self.errors, - "metadata": self.metadata, - } - return json.dumps(data, indent=2) - - @classmethod - def from_json(cls, json_str: str) -> "ExportProgress": - """Deserialize progress from JSON.""" - data = json.loads(json_str) - return cls( - export_id=data["export_id"], - keyspace=data["keyspace"], - table=data["table"], - format=ExportFormat(data["format"]), - output_path=data["output_path"], - started_at=datetime.fromisoformat(data["started_at"]), - completed_at=( - datetime.fromisoformat(data["completed_at"]) if data["completed_at"] else None - ), - total_ranges=data["total_ranges"], - completed_ranges=[(r[0], r[1]) for r in data["completed_ranges"]], - rows_exported=data["rows_exported"], - bytes_written=data["bytes_written"], - errors=data["errors"], - metadata=data["metadata"], - ) - - def save(self, progress_file: Path | None = None) -> Path: - """Save progress to file.""" - if progress_file is None: - progress_file = Path(f"{self.output_path}.progress") - progress_file.write_text(self.to_json()) - return progress_file - - @classmethod - def load(cls, progress_file: Path) -> "ExportProgress": - """Load progress from file.""" - return cls.from_json(progress_file.read_text()) - - def is_range_completed(self, start: int, end: int) -> bool: - """Check if a token range has been completed.""" - return (start, end) in self.completed_ranges - - def mark_range_completed(self, start: int, end: int, rows: int) -> None: - """Mark a token range as completed.""" - if not self.is_range_completed(start, end): - self.completed_ranges.append((start, end)) - self.rows_exported += rows - - @property 
- def is_complete(self) -> bool: - """Check if export is complete.""" - return len(self.completed_ranges) == self.total_ranges - - @property - def progress_percentage(self) -> float: - """Calculate progress percentage.""" - if self.total_ranges == 0: - return 0.0 - return (len(self.completed_ranges) / self.total_ranges) * 100 - - -class Exporter(ABC): - """Base class for export format implementations.""" - - def __init__( - self, - operator: TokenAwareBulkOperator, - compression: str | None = None, - buffer_size: int = 8192, - ): - """Initialize exporter. - - Args: - operator: Token-aware bulk operator instance - compression: Compression type (gzip, bz2, lz4, etc.) - buffer_size: Buffer size for file operations - """ - self.operator = operator - self.compression = compression - self.buffer_size = buffer_size - self._write_lock = asyncio.Lock() - - @abstractmethod - async def export( - self, - keyspace: str, - table: str, - output_path: Path, - columns: list[str] | None = None, - split_count: int | None = None, - parallelism: int | None = None, - progress: ExportProgress | None = None, - progress_callback: Any | None = None, - consistency_level: Any | None = None, - ) -> ExportProgress: - """Export table data to the specified format. - - Args: - keyspace: Keyspace name - table: Table name - output_path: Output file path - columns: Columns to export (None for all) - split_count: Number of token range splits - parallelism: Max concurrent operations - progress: Resume from previous progress - progress_callback: Callback for progress updates - - Returns: - ExportProgress with final statistics - """ - pass - - @abstractmethod - async def write_header(self, file_handle: Any, columns: list[str]) -> None: - """Write file header if applicable.""" - pass - - @abstractmethod - async def write_row(self, file_handle: Any, row: Any) -> int: - """Write a single row and return bytes written.""" - pass - - @abstractmethod - async def write_footer(self, file_handle: Any) -> None: - """Write file footer if applicable.""" - pass - - def _serialize_value(self, value: Any) -> Any: - """Serialize Cassandra types to exportable format.""" - if value is None: - return None - elif isinstance(value, list | set): - return [self._serialize_value(v) for v in value] - elif isinstance(value, dict | OrderedMap | OrderedMapSerializedKey): - # Handle Cassandra map types - return {str(k): self._serialize_value(v) for k, v in value.items()} - elif isinstance(value, bytes): - # Convert bytes to base64 for JSON compatibility - import base64 - - return base64.b64encode(value).decode("ascii") - elif isinstance(value, datetime): - return value.isoformat() - else: - return value - - async def _open_output_file(self, output_path: Path, mode: str = "w") -> Any: - """Open output file with optional compression.""" - if self.compression == "gzip": - import gzip - - return gzip.open(output_path, mode + "t", encoding="utf-8") - elif self.compression == "bz2": - import bz2 - - return bz2.open(output_path, mode + "t", encoding="utf-8") - elif self.compression == "lz4": - try: - import lz4.frame - - return lz4.frame.open(output_path, mode + "t", encoding="utf-8") - except ImportError: - raise ImportError("lz4 compression requires 'pip install lz4'") from None - else: - return open(output_path, mode, encoding="utf-8", buffering=self.buffer_size) - - def _get_output_path_with_compression(self, output_path: Path) -> Path: - """Add compression extension to output path if needed.""" - if self.compression: - return 
output_path.with_suffix(output_path.suffix + f".{self.compression}") - return output_path diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/csv_exporter.py b/libs/async-cassandra-bulk/examples/bulk_operations/exporters/csv_exporter.py deleted file mode 100644 index 56e6f80..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/csv_exporter.py +++ /dev/null @@ -1,221 +0,0 @@ -"""CSV export implementation.""" - -import asyncio -import csv -import io -import uuid -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -from bulk_operations.exporters.base import Exporter, ExportFormat, ExportProgress - - -class CSVExporter(Exporter): - """Export Cassandra data to CSV format with streaming support.""" - - def __init__( - self, - operator, - delimiter: str = ",", - quoting: int = csv.QUOTE_MINIMAL, - null_string: str = "", - compression: str | None = None, - buffer_size: int = 8192, - ): - """Initialize CSV exporter. - - Args: - operator: Token-aware bulk operator instance - delimiter: Field delimiter (default: comma) - quoting: CSV quoting style (default: QUOTE_MINIMAL) - null_string: String to represent NULL values (default: empty string) - compression: Compression type (gzip, bz2, lz4) - buffer_size: Buffer size for file operations - """ - super().__init__(operator, compression, buffer_size) - self.delimiter = delimiter - self.quoting = quoting - self.null_string = null_string - - async def export( # noqa: C901 - self, - keyspace: str, - table: str, - output_path: Path, - columns: list[str] | None = None, - split_count: int | None = None, - parallelism: int | None = None, - progress: ExportProgress | None = None, - progress_callback: Any | None = None, - consistency_level: Any | None = None, - ) -> ExportProgress: - """Export table data to CSV format. - - What this does: - -------------- - 1. Discovers table schema if columns not specified - 2. Creates/resumes progress tracking - 3. Streams data by token ranges - 4. Writes CSV with proper escaping - 5. 
Supports compression and resume - - Why this matters: - ---------------- - - Memory efficient for large tables - - Maintains data fidelity - - Resume capability for long exports - - Compatible with standard tools - """ - # Get table metadata if columns not specified - if columns is None: - metadata = self.operator.session._session.cluster.metadata - keyspace_metadata = metadata.keyspaces.get(keyspace) - if not keyspace_metadata: - raise ValueError(f"Keyspace '{keyspace}' not found") - table_metadata = keyspace_metadata.tables.get(table) - if not table_metadata: - raise ValueError(f"Table '{keyspace}.{table}' not found") - columns = list(table_metadata.columns.keys()) - - # Initialize or resume progress - if progress is None: - progress = ExportProgress( - export_id=str(uuid.uuid4()), - keyspace=keyspace, - table=table, - format=ExportFormat.CSV, - output_path=str(output_path), - started_at=datetime.now(UTC), - ) - - # Get actual output path with compression extension - actual_output_path = self._get_output_path_with_compression(output_path) - - # Open output file (append mode if resuming) - mode = "a" if progress.completed_ranges else "w" - file_handle = await self._open_output_file(actual_output_path, mode) - - try: - # Write header for new exports - if mode == "w": - await self.write_header(file_handle, columns) - - # Store columns for row filtering - self._export_columns = columns - - # Track bytes written - file_handle.tell() if hasattr(file_handle, "tell") else 0 - - # Export by token ranges - async for row in self.operator.export_by_token_ranges( - keyspace=keyspace, - table=table, - split_count=split_count, - parallelism=parallelism, - consistency_level=consistency_level, - ): - # Check if we need to track a new range - # (This is simplified - in real implementation we'd track actual ranges) - bytes_written = await self.write_row(file_handle, row) - progress.rows_exported += 1 - progress.bytes_written += bytes_written - - # Periodic progress callback - if progress_callback and progress.rows_exported % 1000 == 0: - if asyncio.iscoroutinefunction(progress_callback): - await progress_callback(progress) - else: - progress_callback(progress) - - # Mark completion - progress.completed_at = datetime.now(UTC) - - # Final callback - if progress_callback: - if asyncio.iscoroutinefunction(progress_callback): - await progress_callback(progress) - else: - progress_callback(progress) - - finally: - if hasattr(file_handle, "close"): - file_handle.close() - - # Save final progress - progress.save() - return progress - - async def write_header(self, file_handle: Any, columns: list[str]) -> None: - """Write CSV header row.""" - writer = csv.writer(file_handle, delimiter=self.delimiter, quoting=self.quoting) - writer.writerow(columns) - - async def write_row(self, file_handle: Any, row: Any) -> int: - """Write a single row to CSV.""" - # Convert row to list of values in column order - # Row objects from Cassandra driver have _fields attribute - values = [] - if hasattr(row, "_fields"): - # If we have specific columns, only export those - if hasattr(self, "_export_columns") and self._export_columns: - for col in self._export_columns: - if hasattr(row, col): - value = getattr(row, col) - values.append(self._serialize_csv_value(value)) - else: - values.append(self._serialize_csv_value(None)) - else: - # Export all fields - for field in row._fields: - value = getattr(row, field) - values.append(self._serialize_csv_value(value)) - else: - # Fallback for other row types - for i in range(len(row)): - 
values.append(self._serialize_csv_value(row[i])) - - # Write to string buffer first to calculate bytes - buffer = io.StringIO() - writer = csv.writer(buffer, delimiter=self.delimiter, quoting=self.quoting) - writer.writerow(values) - row_data = buffer.getvalue() - - # Write to actual file - async with self._write_lock: - file_handle.write(row_data) - if hasattr(file_handle, "flush"): - file_handle.flush() - - return len(row_data.encode("utf-8")) - - async def write_footer(self, file_handle: Any) -> None: - """CSV files don't have footers.""" - pass - - def _serialize_csv_value(self, value: Any) -> str: - """Serialize value for CSV output.""" - if value is None: - return self.null_string - elif isinstance(value, bool): - return "true" if value else "false" - elif isinstance(value, list | set): - # Format collections as [item1, item2, ...] - items = [self._serialize_csv_value(v) for v in value] - return f"[{', '.join(items)}]" - elif isinstance(value, dict): - # Format maps as {key1: value1, key2: value2} - items = [ - f"{self._serialize_csv_value(k)}: {self._serialize_csv_value(v)}" - for k, v in value.items() - ] - return f"{{{', '.join(items)}}}" - elif isinstance(value, bytes): - # Hex encode bytes - return value.hex() - elif isinstance(value, datetime): - return value.isoformat() - elif isinstance(value, uuid.UUID): - return str(value) - else: - return str(value) diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/json_exporter.py b/libs/async-cassandra-bulk/examples/bulk_operations/exporters/json_exporter.py deleted file mode 100644 index 6067a6c..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/json_exporter.py +++ /dev/null @@ -1,221 +0,0 @@ -"""JSON export implementation.""" - -import asyncio -import json -import uuid -from datetime import UTC, datetime -from decimal import Decimal -from pathlib import Path -from typing import Any - -from bulk_operations.exporters.base import Exporter, ExportFormat, ExportProgress - - -class JSONExporter(Exporter): - """Export Cassandra data to JSON format (line-delimited by default).""" - - def __init__( - self, - operator, - format_mode: str = "jsonl", # jsonl (line-delimited) or array - indent: int | None = None, - compression: str | None = None, - buffer_size: int = 8192, - ): - """Initialize JSON exporter. - - Args: - operator: Token-aware bulk operator instance - format_mode: Output format - 'jsonl' (line-delimited) or 'array' - indent: JSON indentation (None for compact) - compression: Compression type (gzip, bz2, lz4) - buffer_size: Buffer size for file operations - """ - super().__init__(operator, compression, buffer_size) - self.format_mode = format_mode - self.indent = indent - self._first_row = True - - async def export( # noqa: C901 - self, - keyspace: str, - table: str, - output_path: Path, - columns: list[str] | None = None, - split_count: int | None = None, - parallelism: int | None = None, - progress: ExportProgress | None = None, - progress_callback: Any | None = None, - consistency_level: Any | None = None, - ) -> ExportProgress: - """Export table data to JSON format. - - What this does: - -------------- - 1. Exports as line-delimited JSON (default) or JSON array - 2. Handles all Cassandra data types with proper serialization - 3. Supports compression for smaller files - 4. 
Maintains streaming for memory efficiency - - Why this matters: - ---------------- - - JSONL works well with streaming tools - - JSON arrays are compatible with many APIs - - Preserves type information better than CSV - - Standard format for data pipelines - """ - # Get table metadata if columns not specified - if columns is None: - metadata = self.operator.session._session.cluster.metadata - keyspace_metadata = metadata.keyspaces.get(keyspace) - if not keyspace_metadata: - raise ValueError(f"Keyspace '{keyspace}' not found") - table_metadata = keyspace_metadata.tables.get(table) - if not table_metadata: - raise ValueError(f"Table '{keyspace}.{table}' not found") - columns = list(table_metadata.columns.keys()) - - # Initialize or resume progress - if progress is None: - progress = ExportProgress( - export_id=str(uuid.uuid4()), - keyspace=keyspace, - table=table, - format=ExportFormat.JSON, - output_path=str(output_path), - started_at=datetime.now(UTC), - metadata={"format_mode": self.format_mode}, - ) - - # Get actual output path with compression extension - actual_output_path = self._get_output_path_with_compression(output_path) - - # Open output file - mode = "a" if progress.completed_ranges else "w" - file_handle = await self._open_output_file(actual_output_path, mode) - - try: - # Write header for array mode - if mode == "w" and self.format_mode == "array": - await self.write_header(file_handle, columns) - - # Store columns for row filtering - self._export_columns = columns - - # Export by token ranges - async for row in self.operator.export_by_token_ranges( - keyspace=keyspace, - table=table, - split_count=split_count, - parallelism=parallelism, - consistency_level=consistency_level, - ): - bytes_written = await self.write_row(file_handle, row) - progress.rows_exported += 1 - progress.bytes_written += bytes_written - - # Progress callback - if progress_callback and progress.rows_exported % 1000 == 0: - if asyncio.iscoroutinefunction(progress_callback): - await progress_callback(progress) - else: - progress_callback(progress) - - # Write footer for array mode - if self.format_mode == "array": - await self.write_footer(file_handle) - - # Mark completion - progress.completed_at = datetime.now(UTC) - - # Final callback - if progress_callback: - if asyncio.iscoroutinefunction(progress_callback): - await progress_callback(progress) - else: - progress_callback(progress) - - finally: - if hasattr(file_handle, "close"): - file_handle.close() - - # Save progress - progress.save() - return progress - - async def write_header(self, file_handle: Any, columns: list[str]) -> None: - """Write JSON array opening bracket.""" - if self.format_mode == "array": - file_handle.write("[\n") - self._first_row = True - - async def write_row(self, file_handle: Any, row: Any) -> int: # noqa: C901 - """Write a single row as JSON.""" - # Convert row to dictionary - row_dict = {} - if hasattr(row, "_fields"): - # If we have specific columns, only export those - if hasattr(self, "_export_columns") and self._export_columns: - for col in self._export_columns: - if hasattr(row, col): - value = getattr(row, col) - row_dict[col] = self._serialize_value(value) - else: - row_dict[col] = None - else: - # Export all fields - for field in row._fields: - value = getattr(row, field) - row_dict[field] = self._serialize_value(value) - else: - # Handle other row types - for i, value in enumerate(row): - row_dict[f"column_{i}"] = self._serialize_value(value) - - # Format as JSON - if self.format_mode == "jsonl": - # Line-delimited 
JSON - json_str = json.dumps(row_dict, separators=(",", ":")) - json_str += "\n" - else: - # Array mode - if not self._first_row: - json_str = ",\n" - else: - json_str = "" - self._first_row = False - - if self.indent: - json_str += json.dumps(row_dict, indent=self.indent) - else: - json_str += json.dumps(row_dict, separators=(",", ":")) - - # Write to file - async with self._write_lock: - file_handle.write(json_str) - if hasattr(file_handle, "flush"): - file_handle.flush() - - return len(json_str.encode("utf-8")) - - async def write_footer(self, file_handle: Any) -> None: - """Write JSON array closing bracket.""" - if self.format_mode == "array": - file_handle.write("\n]") - - def _serialize_value(self, value: Any) -> Any: - """Override to handle UUID and other types.""" - if isinstance(value, uuid.UUID): - return str(value) - elif isinstance(value, set | frozenset): - # JSON doesn't have sets, convert to list - return [self._serialize_value(v) for v in sorted(value)] - elif hasattr(value, "__class__") and "SortedSet" in value.__class__.__name__: - # Handle SortedSet specifically - return [self._serialize_value(v) for v in value] - elif isinstance(value, Decimal): - # Convert Decimal to float for JSON - return float(value) - else: - # Use parent class serialization - return super()._serialize_value(value) diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/parquet_exporter.py b/libs/async-cassandra-bulk/examples/bulk_operations/exporters/parquet_exporter.py deleted file mode 100644 index f9835bc..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/exporters/parquet_exporter.py +++ /dev/null @@ -1,311 +0,0 @@ -"""Parquet export implementation using PyArrow.""" - -import asyncio -import uuid -from datetime import UTC, datetime -from decimal import Decimal -from pathlib import Path -from typing import Any - -try: - import pyarrow as pa - import pyarrow.parquet as pq -except ImportError: - raise ImportError( - "PyArrow is required for Parquet export. Install with: pip install pyarrow" - ) from None - -from cassandra.util import OrderedMap, OrderedMapSerializedKey - -from bulk_operations.exporters.base import Exporter, ExportFormat, ExportProgress - - -class ParquetExporter(Exporter): - """Export Cassandra data to Parquet format - the foundation for Iceberg.""" - - def __init__( - self, - operator, - compression: str = "snappy", - row_group_size: int = 50000, - use_dictionary: bool = True, - buffer_size: int = 8192, - ): - """Initialize Parquet exporter. - - Args: - operator: Token-aware bulk operator instance - compression: Compression codec (snappy, gzip, brotli, lz4, zstd) - row_group_size: Number of rows per row group - use_dictionary: Enable dictionary encoding for strings - buffer_size: Buffer size for file operations - """ - super().__init__(operator, compression, buffer_size) - self.row_group_size = row_group_size - self.use_dictionary = use_dictionary - self._batch_rows = [] - self._schema = None - self._writer = None - - async def export( # noqa: C901 - self, - keyspace: str, - table: str, - output_path: Path, - columns: list[str] | None = None, - split_count: int | None = None, - parallelism: int | None = None, - progress: ExportProgress | None = None, - progress_callback: Any | None = None, - consistency_level: Any | None = None, - ) -> ExportProgress: - """Export table data to Parquet format. - - What this does: - -------------- - 1. Converts Cassandra schema to Arrow schema - 2. Batches rows into row groups for efficiency - 3. 
Applies columnar compression - 4. Creates Parquet files ready for Iceberg - - Why this matters: - ---------------- - - Parquet is the storage format for Iceberg - - Columnar format enables analytics - - Excellent compression ratios - - Schema evolution support - """ - # Get table metadata - metadata = self.operator.session._session.cluster.metadata - keyspace_metadata = metadata.keyspaces.get(keyspace) - if not keyspace_metadata: - raise ValueError(f"Keyspace '{keyspace}' not found") - table_metadata = keyspace_metadata.tables.get(table) - if not table_metadata: - raise ValueError(f"Table '{keyspace}.{table}' not found") - - # Get columns - if columns is None: - columns = list(table_metadata.columns.keys()) - - # Build Arrow schema from Cassandra schema - self._schema = self._build_arrow_schema(table_metadata, columns) - - # Initialize progress - if progress is None: - progress = ExportProgress( - export_id=str(uuid.uuid4()), - keyspace=keyspace, - table=table, - format=ExportFormat.PARQUET, - output_path=str(output_path), - started_at=datetime.now(UTC), - metadata={ - "compression": self.compression, - "row_group_size": self.row_group_size, - }, - ) - - # Note: Parquet doesn't use compression extension in filename - # Compression is internal to the format - - try: - # Open Parquet writer - self._writer = pq.ParquetWriter( - output_path, - self._schema, - compression=self.compression, - use_dictionary=self.use_dictionary, - ) - - # Export by token ranges - async for row in self.operator.export_by_token_ranges( - keyspace=keyspace, - table=table, - split_count=split_count, - parallelism=parallelism, - consistency_level=consistency_level, - ): - # Add row to batch - row_data = self._convert_row_to_dict(row, columns) - self._batch_rows.append(row_data) - - # Write batch when full - if len(self._batch_rows) >= self.row_group_size: - await self._write_batch() - progress.bytes_written = output_path.stat().st_size - - progress.rows_exported += 1 - - # Progress callback - if progress_callback and progress.rows_exported % 1000 == 0: - if asyncio.iscoroutinefunction(progress_callback): - await progress_callback(progress) - else: - progress_callback(progress) - - # Write final batch - if self._batch_rows: - await self._write_batch() - - # Close writer - self._writer.close() - - # Final stats - progress.bytes_written = output_path.stat().st_size - progress.completed_at = datetime.now(UTC) - - # Final callback - if progress_callback: - if asyncio.iscoroutinefunction(progress_callback): - await progress_callback(progress) - else: - progress_callback(progress) - - except Exception: - # Ensure writer is closed on error - if self._writer: - self._writer.close() - raise - - # Save progress - progress.save() - return progress - - def _build_arrow_schema(self, table_metadata, columns): - """Build PyArrow schema from Cassandra table metadata.""" - fields = [] - - for col_name in columns: - col_meta = table_metadata.columns.get(col_name) - if not col_meta: - continue - - # Map Cassandra types to Arrow types - arrow_type = self._cassandra_to_arrow_type(col_meta.cql_type) - fields.append(pa.field(col_name, arrow_type, nullable=True)) - - return pa.schema(fields) - - def _cassandra_to_arrow_type(self, cql_type: str) -> pa.DataType: - """Map Cassandra types to PyArrow types.""" - # Handle parameterized types - base_type = cql_type.split("<")[0].lower() - - type_mapping = { - "ascii": pa.string(), - "bigint": pa.int64(), - "blob": pa.binary(), - "boolean": pa.bool_(), - "counter": pa.int64(), - "date": pa.date32(), 
- "decimal": pa.decimal128(38, 10), # Max precision - "double": pa.float64(), - "float": pa.float32(), - "inet": pa.string(), - "int": pa.int32(), - "smallint": pa.int16(), - "text": pa.string(), - "time": pa.int64(), # Nanoseconds since midnight - "timestamp": pa.timestamp("us"), # Microsecond precision - "timeuuid": pa.string(), - "tinyint": pa.int8(), - "uuid": pa.string(), - "varchar": pa.string(), - "varint": pa.string(), # Store as string for arbitrary precision - } - - # Handle collections - if base_type == "list" or base_type == "set": - element_type = self._extract_collection_type(cql_type) - return pa.list_(self._cassandra_to_arrow_type(element_type)) - elif base_type == "map": - key_type, value_type = self._extract_map_types(cql_type) - return pa.map_( - self._cassandra_to_arrow_type(key_type), - self._cassandra_to_arrow_type(value_type), - ) - - return type_mapping.get(base_type, pa.string()) # Default to string - - def _extract_collection_type(self, cql_type: str) -> str: - """Extract element type from list or set.""" - start = cql_type.index("<") + 1 - end = cql_type.rindex(">") - return cql_type[start:end].strip() - - def _extract_map_types(self, cql_type: str) -> tuple[str, str]: - """Extract key and value types from map.""" - start = cql_type.index("<") + 1 - end = cql_type.rindex(">") - types = cql_type[start:end].split(",", 1) - return types[0].strip(), types[1].strip() - - def _convert_row_to_dict(self, row: Any, columns: list[str]) -> dict[str, Any]: - """Convert Cassandra row to dictionary with proper type conversion.""" - row_dict = {} - - if hasattr(row, "_fields"): - for field in row._fields: - value = getattr(row, field) - row_dict[field] = self._convert_value_for_arrow(value) - else: - for i, col in enumerate(columns): - if i < len(row): - row_dict[col] = self._convert_value_for_arrow(row[i]) - - return row_dict - - def _convert_value_for_arrow(self, value: Any) -> Any: - """Convert Cassandra value to Arrow-compatible format.""" - if value is None: - return None - elif isinstance(value, uuid.UUID): - return str(value) - elif isinstance(value, Decimal): - # Keep as Decimal for Arrow's decimal128 type - return value - elif isinstance(value, set): - # Convert sets to lists - return list(value) - elif isinstance(value, OrderedMap | OrderedMapSerializedKey): - # Convert Cassandra map types to dict - return dict(value) - elif isinstance(value, bytes): - # Keep as bytes for binary columns - return value - elif isinstance(value, datetime): - # Ensure timezone aware - if value.tzinfo is None: - return value.replace(tzinfo=UTC) - return value - else: - return value - - async def _write_batch(self): - """Write accumulated batch to Parquet file.""" - if not self._batch_rows: - return - - # Convert to Arrow Table - table = pa.Table.from_pylist(self._batch_rows, schema=self._schema) - - # Write to file - async with self._write_lock: - self._writer.write_table(table) - - # Clear batch - self._batch_rows = [] - - async def write_header(self, file_handle: Any, columns: list[str]) -> None: - """Parquet handles headers internally.""" - pass - - async def write_row(self, file_handle: Any, row: Any) -> int: - """Parquet uses batch writing, not row-by-row.""" - # This is handled in export() method - return 0 - - async def write_footer(self, file_handle: Any) -> None: - """Parquet handles footers internally.""" - pass diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/__init__.py b/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/__init__.py deleted file 
mode 100644 index 83d5ba1..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Apache Iceberg integration for Cassandra bulk operations. - -This module provides functionality to export Cassandra data to Apache Iceberg tables, -enabling modern data lakehouse capabilities including: -- ACID transactions -- Schema evolution -- Time travel -- Hidden partitioning -- Efficient analytics -""" - -from bulk_operations.iceberg.exporter import IcebergExporter -from bulk_operations.iceberg.schema_mapper import CassandraToIcebergSchemaMapper - -__all__ = ["IcebergExporter", "CassandraToIcebergSchemaMapper"] diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/catalog.py b/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/catalog.py deleted file mode 100644 index 2275142..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/catalog.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Iceberg catalog configuration for filesystem-based tables.""" - -from pathlib import Path -from typing import Any - -from pyiceberg.catalog import Catalog, load_catalog -from pyiceberg.catalog.sql import SqlCatalog - - -def create_filesystem_catalog( - name: str = "cassandra_export", - warehouse_path: str | Path | None = None, -) -> Catalog: - """Create a filesystem-based Iceberg catalog. - - What this does: - -------------- - 1. Creates a local filesystem catalog using SQLite - 2. Stores table metadata in SQLite database - 3. Stores actual data files in warehouse directory - 4. No external dependencies (S3, Hive, etc.) - - Why this matters: - ---------------- - - Simple setup for development and testing - - No cloud dependencies - - Easy to inspect and debug - - Can be migrated to production catalogs later - - Args: - name: Catalog name - warehouse_path: Path to warehouse directory (default: ./iceberg_warehouse) - - Returns: - Iceberg catalog instance - """ - if warehouse_path is None: - warehouse_path = Path.cwd() / "iceberg_warehouse" - else: - warehouse_path = Path(warehouse_path) - - # Create warehouse directory if it doesn't exist - warehouse_path.mkdir(parents=True, exist_ok=True) - - # SQLite catalog configuration - catalog_config = { - "type": "sql", - "uri": f"sqlite:///{warehouse_path / 'catalog.db'}", - "warehouse": str(warehouse_path), - } - - # Create catalog - catalog = SqlCatalog(name, **catalog_config) - - return catalog - - -def get_or_create_catalog( - catalog_name: str = "cassandra_export", - warehouse_path: str | Path | None = None, - config: dict[str, Any] | None = None, -) -> Catalog: - """Get existing catalog or create a new one. - - This allows for custom catalog configurations while providing - sensible defaults for filesystem-based catalogs. 
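The catalog wiring above is simply a SQLite-backed `SqlCatalog` pointed at a local warehouse directory. A minimal sketch of the same setup, assuming pyiceberg is installed; the namespace name is illustrative:

```python
from pathlib import Path

from pyiceberg.catalog.sql import SqlCatalog

warehouse = Path("iceberg_warehouse")
warehouse.mkdir(parents=True, exist_ok=True)

catalog = SqlCatalog(
    "cassandra_export",
    uri=f"sqlite:///{warehouse / 'catalog.db'}",  # table metadata lives in SQLite
    warehouse=str(warehouse),                     # data and manifests on local disk
)

catalog.create_namespace("bulk_demo")  # raises if it already exists; guard omitted for brevity
print(catalog.list_namespaces())
```

Because the metadata is a single SQLite file next to the data, the warehouse can be inspected or deleted wholesale during development.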
- - Args: - catalog_name: Name of the catalog - warehouse_path: Path to warehouse (for filesystem catalogs) - config: Custom catalog configuration (overrides defaults) - - Returns: - Iceberg catalog instance - """ - if config is not None: - # Use custom configuration - return load_catalog(catalog_name, **config) - else: - # Use filesystem catalog - return create_filesystem_catalog(catalog_name, warehouse_path) diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/exporter.py b/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/exporter.py deleted file mode 100644 index cd6cb7a..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/exporter.py +++ /dev/null @@ -1,376 +0,0 @@ -"""Export Cassandra data to Apache Iceberg tables.""" - -import asyncio -import contextlib -import uuid -from datetime import UTC, datetime -from pathlib import Path -from typing import Any - -import pyarrow as pa -import pyarrow.parquet as pq -from pyiceberg.catalog import Catalog -from pyiceberg.exceptions import NoSuchTableError -from pyiceberg.partitioning import PartitionSpec -from pyiceberg.schema import Schema -from pyiceberg.table import Table - -from bulk_operations.exporters.base import ExportFormat, ExportProgress -from bulk_operations.exporters.parquet_exporter import ParquetExporter -from bulk_operations.iceberg.catalog import get_or_create_catalog -from bulk_operations.iceberg.schema_mapper import CassandraToIcebergSchemaMapper - - -class IcebergExporter(ParquetExporter): - """Export Cassandra data to Apache Iceberg tables. - - This exporter extends the Parquet exporter to write data in Iceberg format, - enabling advanced data lakehouse features like ACID transactions, time travel, - and schema evolution. - - What this does: - -------------- - 1. Creates Iceberg tables from Cassandra schemas - 2. Writes data as Parquet files in Iceberg format - 3. Updates Iceberg metadata and manifests - 4. Supports partitioning strategies - 5. Enables time travel and version history - - Why this matters: - ---------------- - - ACID transactions on exported data - - Schema evolution without rewriting data - - Time travel queries ("SELECT * FROM table AS OF timestamp") - - Hidden partitioning for better performance - - Integration with modern data tools (Spark, Trino, etc.) - """ - - def __init__( - self, - operator, - catalog: Catalog | None = None, - catalog_config: dict[str, Any] | None = None, - warehouse_path: str | Path | None = None, - compression: str = "snappy", - row_group_size: int = 100000, - buffer_size: int = 8192, - ): - """Initialize Iceberg exporter. 
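Driving the exporter end to end looks roughly like the sketch below. This is a hedged illustration only: the cluster/operator setup mirrors the other example scripts in this directory, and the keyspace and table names are placeholders.

```python
# Sketch: wire an IcebergExporter to a bulk operator and run one export.
import asyncio

from async_cassandra import AsyncCluster
from bulk_operations.bulk_operator import TokenAwareBulkOperator
from bulk_operations.iceberg import IcebergExporter

async def main() -> None:
    async with AsyncCluster(contact_points=["localhost"]) as cluster:
        session = await cluster.connect()
        operator = TokenAwareBulkOperator(session)

        exporter = IcebergExporter(
            operator,
            warehouse_path="iceberg_warehouse",  # filesystem catalog by default
            compression="snappy",
        )
        progress = await exporter.export(keyspace="bulk_demo", table="products")
        print(progress.rows_exported, progress.metadata.get("iceberg_table"))

asyncio.run(main())
```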
- - Args: - operator: Token-aware bulk operator instance - catalog: Pre-configured Iceberg catalog (optional) - catalog_config: Custom catalog configuration (optional) - warehouse_path: Path to Iceberg warehouse (for filesystem catalog) - compression: Parquet compression codec - row_group_size: Rows per Parquet row group - buffer_size: Buffer size for file operations - """ - super().__init__( - operator=operator, - compression=compression, - row_group_size=row_group_size, - use_dictionary=True, - buffer_size=buffer_size, - ) - - # Set up catalog - if catalog is not None: - self.catalog = catalog - else: - self.catalog = get_or_create_catalog( - catalog_name="cassandra_export", - warehouse_path=warehouse_path, - config=catalog_config, - ) - - self.schema_mapper = CassandraToIcebergSchemaMapper() - self._current_table: Table | None = None - self._data_files: list[str] = [] - - async def export( - self, - keyspace: str, - table: str, - output_path: Path | None = None, # Not used, Iceberg manages paths - namespace: str | None = None, - table_name: str | None = None, - partition_spec: PartitionSpec | None = None, - table_properties: dict[str, str] | None = None, - columns: list[str] | None = None, - split_count: int | None = None, - parallelism: int | None = None, - progress: ExportProgress | None = None, - progress_callback: Any | None = None, - ) -> ExportProgress: - """Export Cassandra table to Iceberg format. - - Args: - keyspace: Cassandra keyspace - table: Cassandra table name - output_path: Not used - Iceberg manages file paths - namespace: Iceberg namespace (default: cassandra keyspace) - table_name: Iceberg table name (default: cassandra table name) - partition_spec: Iceberg partition specification - table_properties: Additional Iceberg table properties - columns: Columns to export (default: all) - split_count: Number of token range splits - parallelism: Max concurrent operations - progress: Resume progress (optional) - progress_callback: Progress callback function - - Returns: - Export progress with Iceberg-specific metadata - """ - # Use Cassandra names as defaults - if namespace is None: - namespace = keyspace - if table_name is None: - table_name = table - - # Get Cassandra table metadata - metadata = self.operator.session._session.cluster.metadata - keyspace_metadata = metadata.keyspaces.get(keyspace) - if not keyspace_metadata: - raise ValueError(f"Keyspace '{keyspace}' not found") - table_metadata = keyspace_metadata.tables.get(table) - if not table_metadata: - raise ValueError(f"Table '{keyspace}.{table}' not found") - - # Create or get Iceberg table - iceberg_schema = self.schema_mapper.map_table_schema(table_metadata) - self._current_table = await self._get_or_create_iceberg_table( - namespace=namespace, - table_name=table_name, - schema=iceberg_schema, - partition_spec=partition_spec, - table_properties=table_properties, - ) - - # Initialize progress - if progress is None: - progress = ExportProgress( - export_id=str(uuid.uuid4()), - keyspace=keyspace, - table=table, - format=ExportFormat.PARQUET, # Iceberg uses Parquet format - output_path=f"iceberg://{namespace}.{table_name}", - started_at=datetime.now(UTC), - metadata={ - "iceberg_namespace": namespace, - "iceberg_table": table_name, - "catalog": self.catalog.name, - "compression": self.compression, - "row_group_size": self.row_group_size, - }, - ) - - # Reset data files list - self._data_files = [] - - try: - # Export data using token ranges - await self._export_by_ranges( - keyspace=keyspace, - table=table, - 
columns=columns, - split_count=split_count, - parallelism=parallelism, - progress=progress, - progress_callback=progress_callback, - ) - - # Commit data files to Iceberg table - if self._data_files: - await self._commit_data_files() - - # Update progress - progress.completed_at = datetime.now(UTC) - progress.metadata["data_files"] = len(self._data_files) - progress.metadata["iceberg_snapshot"] = ( - self._current_table.current_snapshot().snapshot_id - if self._current_table.current_snapshot() - else None - ) - - # Final callback - if progress_callback: - if asyncio.iscoroutinefunction(progress_callback): - await progress_callback(progress) - else: - progress_callback(progress) - - except Exception as e: - progress.errors.append(str(e)) - raise - - # Save progress - progress.save() - return progress - - async def _get_or_create_iceberg_table( - self, - namespace: str, - table_name: str, - schema: Schema, - partition_spec: PartitionSpec | None = None, - table_properties: dict[str, str] | None = None, - ) -> Table: - """Get existing Iceberg table or create a new one. - - Args: - namespace: Iceberg namespace - table_name: Table name - schema: Iceberg schema - partition_spec: Partition specification (optional) - table_properties: Table properties (optional) - - Returns: - Iceberg Table instance - """ - table_identifier = f"{namespace}.{table_name}" - - try: - # Try to load existing table - table = self.catalog.load_table(table_identifier) - - # TODO: Implement schema evolution check - # For now, we'll append to existing tables - - return table - - except NoSuchTableError: - # Create new table - if table_properties is None: - table_properties = {} - - # Add default properties - table_properties.setdefault("write.format.default", "parquet") - table_properties.setdefault("write.parquet.compression-codec", self.compression) - - # Create namespace if it doesn't exist - with contextlib.suppress(Exception): - self.catalog.create_namespace(namespace) - - # Create table - table = self.catalog.create_table( - identifier=table_identifier, - schema=schema, - partition_spec=partition_spec, - properties=table_properties, - ) - - return table - - async def _export_by_ranges( - self, - keyspace: str, - table: str, - columns: list[str] | None, - split_count: int | None, - parallelism: int | None, - progress: ExportProgress, - progress_callback: Any | None, - ) -> None: - """Export data by token ranges to multiple Parquet files.""" - # Build Arrow schema for the data - table_meta = await self._get_table_metadata(keyspace, table) - - if columns is None: - columns = list(table_meta.columns.keys()) - - self._schema = self._build_arrow_schema(table_meta, columns) - - # Export each token range to a separate file - file_index = 0 - - async for row in self.operator.export_by_token_ranges( - keyspace=keyspace, - table=table, - split_count=split_count, - parallelism=parallelism, - ): - # Add row to batch - row_data = self._convert_row_to_dict(row, columns) - self._batch_rows.append(row_data) - - # Write batch when full - if len(self._batch_rows) >= self.row_group_size: - file_path = await self._write_data_file(file_index) - self._data_files.append(str(file_path)) - file_index += 1 - - progress.rows_exported += 1 - - # Progress callback - if progress_callback and progress.rows_exported % 1000 == 0: - if asyncio.iscoroutinefunction(progress_callback): - await progress_callback(progress) - else: - progress_callback(progress) - - # Write final batch - if self._batch_rows: - file_path = await self._write_data_file(file_index) 
- self._data_files.append(str(file_path)) - - async def _write_data_file(self, file_index: int) -> Path: - """Write a batch of rows to a Parquet data file. - - Args: - file_index: Index for file naming - - Returns: - Path to the written file - """ - if not self._batch_rows: - raise ValueError("No data to write") - - # Generate file path in Iceberg data directory - # Format: data/part-{index}-{uuid}.parquet - file_name = f"part-{file_index:05d}-{uuid.uuid4()}.parquet" - file_path = Path(self._current_table.location()) / "data" / file_name - - # Ensure directory exists - file_path.parent.mkdir(parents=True, exist_ok=True) - - # Convert to Arrow table - table = pa.Table.from_pylist(self._batch_rows, schema=self._schema) - - # Write Parquet file - pq.write_table( - table, - file_path, - compression=self.compression, - use_dictionary=self.use_dictionary, - ) - - # Clear batch - self._batch_rows = [] - - return file_path - - async def _commit_data_files(self) -> None: - """Commit data files to Iceberg table as a new snapshot.""" - # This is a simplified version - in production, you'd use - # proper Iceberg APIs to add data files with statistics - - # For now, we'll just note that files were written - # The full implementation would: - # 1. Collect file statistics (row count, column bounds, etc.) - # 2. Create DataFile objects - # 3. Append files to table using transaction API - - # TODO: Implement proper Iceberg commit - pass - - async def _get_table_metadata(self, keyspace: str, table: str): - """Get Cassandra table metadata.""" - metadata = self.operator.session._session.cluster.metadata - keyspace_metadata = metadata.keyspaces.get(keyspace) - if not keyspace_metadata: - raise ValueError(f"Keyspace '{keyspace}' not found") - table_metadata = keyspace_metadata.tables.get(table) - if not table_metadata: - raise ValueError(f"Table '{keyspace}.{table}' not found") - return table_metadata diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/schema_mapper.py b/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/schema_mapper.py deleted file mode 100644 index b9c42e3..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/iceberg/schema_mapper.py +++ /dev/null @@ -1,196 +0,0 @@ -"""Maps Cassandra table schemas to Iceberg schemas.""" - -from cassandra.metadata import ColumnMetadata, TableMetadata -from pyiceberg.schema import Schema -from pyiceberg.types import ( - BinaryType, - BooleanType, - DateType, - DecimalType, - DoubleType, - FloatType, - IcebergType, - IntegerType, - ListType, - LongType, - MapType, - NestedField, - StringType, - TimestamptzType, -) - - -class CassandraToIcebergSchemaMapper: - """Maps Cassandra table schemas to Apache Iceberg schemas. - - What this does: - -------------- - 1. Converts CQL types to Iceberg types - 2. Preserves column nullability - 3. Handles complex types (lists, sets, maps) - 4. Assigns unique field IDs for schema evolution - - Why this matters: - ---------------- - - Enables seamless data migration from Cassandra to Iceberg - - Preserves type information for analytics - - Supports schema evolution in Iceberg - - Maintains data integrity during export - """ - - def __init__(self): - """Initialize the schema mapper.""" - self._field_id_counter = 1 - - def map_table_schema(self, table_metadata: TableMetadata) -> Schema: - """Map a Cassandra table schema to an Iceberg schema. 
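In practice the mapper is fed driver metadata and returns a pyiceberg `Schema`. A small sketch, assuming a reachable cluster as in the other examples; the keyspace/table names are illustrative and the private `_session` access follows the same pattern the exporter uses above:

```python
import asyncio

from async_cassandra import AsyncCluster
from bulk_operations.iceberg.schema_mapper import CassandraToIcebergSchemaMapper

async def main() -> None:
    async with AsyncCluster(contact_points=["localhost"]) as cluster:
        session = await cluster.connect()

        # Driver metadata access mirrors the exporter code above.
        meta = session._session.cluster.metadata
        table_meta = meta.keyspaces["bulk_demo"].tables["products"]

        schema = CassandraToIcebergSchemaMapper().map_table_schema(table_meta)
        for field in schema.fields:
            print(field.field_id, field.name, field.field_type, field.required)

asyncio.run(main())
```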
- - Args: - table_metadata: Cassandra table metadata - - Returns: - Iceberg Schema object - """ - fields = [] - - # Map each column - for column_name, column_meta in table_metadata.columns.items(): - field = self._map_column(column_name, column_meta) - fields.append(field) - - return Schema(*fields) - - def _map_column(self, name: str, column_meta: ColumnMetadata) -> NestedField: - """Map a single Cassandra column to an Iceberg field. - - Args: - name: Column name - column_meta: Cassandra column metadata - - Returns: - Iceberg NestedField - """ - # Get the Iceberg type - iceberg_type = self._map_cql_type(column_meta.cql_type) - - # Create field with unique ID - field_id = self._get_next_field_id() - - # In Cassandra, primary key columns are required (not null) - # All other columns are nullable - is_required = column_meta.is_primary_key - - return NestedField( - field_id=field_id, - name=name, - field_type=iceberg_type, - required=is_required, - ) - - def _map_cql_type(self, cql_type: str) -> IcebergType: - """Map a CQL type string to an Iceberg type. - - Args: - cql_type: CQL type string (e.g., "text", "int", "list") - - Returns: - Iceberg Type - """ - # Handle parameterized types - base_type = cql_type.split("<")[0].lower() - - # Simple type mappings - type_mapping = { - # String types - "ascii": StringType(), - "text": StringType(), - "varchar": StringType(), - # Numeric types - "tinyint": IntegerType(), # 8-bit in Cassandra, 32-bit in Iceberg - "smallint": IntegerType(), # 16-bit in Cassandra, 32-bit in Iceberg - "int": IntegerType(), - "bigint": LongType(), - "counter": LongType(), - "varint": DecimalType(38, 0), # Arbitrary precision integer - "decimal": DecimalType(38, 10), # Default precision/scale - "float": FloatType(), - "double": DoubleType(), - # Boolean - "boolean": BooleanType(), - # Date/Time types - "date": DateType(), - "timestamp": TimestamptzType(), # Cassandra timestamps have timezone - "time": LongType(), # Time as nanoseconds since midnight - # Binary - "blob": BinaryType(), - # UUID types - "uuid": StringType(), # Store as string for compatibility - "timeuuid": StringType(), - # Network - "inet": StringType(), # IP address as string - } - - # Handle simple types - if base_type in type_mapping: - return type_mapping[base_type] - - # Handle collection types - if base_type == "list": - element_type = self._extract_collection_type(cql_type) - return ListType( - element_id=self._get_next_field_id(), - element_type=self._map_cql_type(element_type), - element_required=False, # Cassandra allows null elements - ) - elif base_type == "set": - # Sets become lists in Iceberg (no native set type) - element_type = self._extract_collection_type(cql_type) - return ListType( - element_id=self._get_next_field_id(), - element_type=self._map_cql_type(element_type), - element_required=False, - ) - elif base_type == "map": - key_type, value_type = self._extract_map_types(cql_type) - return MapType( - key_id=self._get_next_field_id(), - key_type=self._map_cql_type(key_type), - value_id=self._get_next_field_id(), - value_type=self._map_cql_type(value_type), - value_required=False, # Cassandra allows null values - ) - elif base_type == "tuple": - # Tuples become structs in Iceberg - # For now, we'll use a string representation - # TODO: Implement proper tuple parsing - return StringType() - elif base_type == "frozen": - # Frozen collections - strip "frozen" and process inner type - inner_type = cql_type[7:-1] # Remove "frozen<" and ">" - return self._map_cql_type(inner_type) - else: - # 
Default to string for unknown types - return StringType() - - def _extract_collection_type(self, cql_type: str) -> str: - """Extract element type from list or set.""" - start = cql_type.index("<") + 1 - end = cql_type.rindex(">") - return cql_type[start:end].strip() - - def _extract_map_types(self, cql_type: str) -> tuple[str, str]: - """Extract key and value types from map.""" - start = cql_type.index("<") + 1 - end = cql_type.rindex(">") - types = cql_type[start:end].split(",", 1) - return types[0].strip(), types[1].strip() - - def _get_next_field_id(self) -> int: - """Get the next available field ID.""" - field_id = self._field_id_counter - self._field_id_counter += 1 - return field_id - - def reset_field_ids(self) -> None: - """Reset field ID counter (useful for testing).""" - self._field_id_counter = 1 diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/parallel_export.py b/libs/async-cassandra-bulk/examples/bulk_operations/parallel_export.py deleted file mode 100644 index 22f0e1c..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/parallel_export.py +++ /dev/null @@ -1,203 +0,0 @@ -""" -Parallel export implementation for production-grade bulk operations. - -This module provides a truly parallel export capability that streams data -from multiple token ranges concurrently, similar to DSBulk. -""" - -import asyncio -from collections.abc import AsyncIterator, Callable -from typing import Any - -from cassandra import ConsistencyLevel - -from .stats import BulkOperationStats -from .token_utils import TokenRange - - -class ParallelExportIterator: - """ - Parallel export iterator that manages concurrent token range queries. - - This implementation uses asyncio queues to coordinate between multiple - worker tasks that query different token ranges in parallel. 
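The coordination described here is a standard asyncio fan-in: worker tasks push rows onto a bounded queue and signal completion with end markers, while a single consumer drains the queue. A stripped-down, generic sketch of that pattern (not the actual iterator internals, which also handle semaphores, stats, and wraparound ranges):

```python
import asyncio

async def worker(worker_id: int, queue: asyncio.Queue) -> None:
    for i in range(3):                  # stand-in for rows streamed from one token range
        await queue.put((f"range-{worker_id}", i))
    await queue.put(None)               # end marker for this worker

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue(maxsize=100)  # bounded to apply backpressure
    n_workers = 4
    tasks = [asyncio.create_task(worker(i, queue)) for i in range(n_workers)]

    finished = 0
    while finished < n_workers:
        item = await queue.get()
        if item is None:
            finished += 1               # one worker has drained its range
            continue
        print("row:", item)             # consumer sees rows as they arrive, interleaved

    await asyncio.gather(*tasks)

asyncio.run(main())
```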
- """ - - def __init__( - self, - operator: Any, - keyspace: str, - table: str, - splits: list[TokenRange], - prepared_stmts: dict[str, Any], - parallelism: int, - consistency_level: ConsistencyLevel | None, - stats: BulkOperationStats, - progress_callback: Callable[[BulkOperationStats], None] | None, - ): - self.operator = operator - self.keyspace = keyspace - self.table = table - self.splits = splits - self.prepared_stmts = prepared_stmts - self.parallelism = parallelism - self.consistency_level = consistency_level - self.stats = stats - self.progress_callback = progress_callback - - # Queue for results from parallel workers - self.result_queue: asyncio.Queue[tuple[Any, bool]] = asyncio.Queue(maxsize=parallelism * 10) - self.workers_done = False - self.worker_tasks: list[asyncio.Task] = [] - - async def __aiter__(self) -> AsyncIterator[Any]: - """Start parallel workers and yield results as they come in.""" - # Start worker tasks - await self._start_workers() - - # Yield results from the queue - while True: - try: - # Wait for results with a timeout to check if workers are done - row, is_end_marker = await asyncio.wait_for(self.result_queue.get(), timeout=0.1) - - if is_end_marker: - # This was an end marker from a worker - continue - - yield row - - except TimeoutError: - # Check if all workers are done - if self.workers_done and self.result_queue.empty(): - break - continue - except Exception: - # Cancel all workers on error - await self._cancel_workers() - raise - - async def _start_workers(self) -> None: - """Start parallel worker tasks to process token ranges.""" - # Create a semaphore to limit concurrent queries - semaphore = asyncio.Semaphore(self.parallelism) - - # Create worker tasks for each split - for split in self.splits: - task = asyncio.create_task(self._process_split(split, semaphore)) - self.worker_tasks.append(task) - - # Create a task to monitor when all workers are done - asyncio.create_task(self._monitor_workers()) - - async def _monitor_workers(self) -> None: - """Monitor worker tasks and signal when all are complete.""" - try: - # Wait for all workers to complete - await asyncio.gather(*self.worker_tasks, return_exceptions=True) - finally: - self.workers_done = True - # Put a final marker to unblock the iterator if needed - await self.result_queue.put((None, True)) - - async def _cancel_workers(self) -> None: - """Cancel all worker tasks.""" - for task in self.worker_tasks: - if not task.done(): - task.cancel() - - # Wait for cancellation to complete - await asyncio.gather(*self.worker_tasks, return_exceptions=True) - - async def _process_split(self, split: TokenRange, semaphore: asyncio.Semaphore) -> None: - """Process a single token range split.""" - async with semaphore: - try: - if split.end < split.start: - # Wraparound range - process in two parts - await self._query_and_queue( - self.prepared_stmts["select_wraparound_gt"], (split.start,) - ) - await self._query_and_queue( - self.prepared_stmts["select_wraparound_lte"], (split.end,) - ) - else: - # Normal range - await self._query_and_queue( - self.prepared_stmts["select_range"], (split.start, split.end) - ) - - # Update stats - self.stats.ranges_completed += 1 - if self.progress_callback: - self.progress_callback(self.stats) - - except Exception as e: - # Add error to stats but don't fail the whole export - self.stats.errors.append(e) - # Put an end marker to signal this worker is done - await self.result_queue.put((None, True)) - raise - - # Signal this worker is done - await self.result_queue.put((None, 
True)) - - async def _query_and_queue(self, stmt: Any, params: tuple) -> None: - """Execute a query and queue all results.""" - # Set consistency level if provided - if self.consistency_level is not None: - stmt.consistency_level = self.consistency_level - - # Execute streaming query - async with await self.operator.session.execute_stream(stmt, params) as result: - async for row in result: - self.stats.rows_processed += 1 - # Queue the row for the main iterator - await self.result_queue.put((row, False)) - - -async def export_by_token_ranges_parallel( - operator: Any, - keyspace: str, - table: str, - splits: list[TokenRange], - prepared_stmts: dict[str, Any], - parallelism: int, - consistency_level: ConsistencyLevel | None, - stats: BulkOperationStats, - progress_callback: Callable[[BulkOperationStats], None] | None, -) -> AsyncIterator[Any]: - """ - Export rows from token ranges in parallel. - - This function creates a parallel export iterator that manages multiple - concurrent queries to different token ranges, similar to how DSBulk works. - - Args: - operator: The bulk operator instance - keyspace: Keyspace name - table: Table name - splits: List of token ranges to query - prepared_stmts: Prepared statements for queries - parallelism: Maximum concurrent queries - consistency_level: Consistency level for queries - stats: Statistics object to update - progress_callback: Optional progress callback - - Yields: - Rows from the table, streamed as they arrive from parallel queries - """ - iterator = ParallelExportIterator( - operator=operator, - keyspace=keyspace, - table=table, - splits=splits, - prepared_stmts=prepared_stmts, - parallelism=parallelism, - consistency_level=consistency_level, - stats=stats, - progress_callback=progress_callback, - ) - - async for row in iterator: - yield row diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/stats.py b/libs/async-cassandra-bulk/examples/bulk_operations/stats.py deleted file mode 100644 index 6f576d0..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/stats.py +++ /dev/null @@ -1,43 +0,0 @@ -"""Statistics tracking for bulk operations.""" - -import time -from dataclasses import dataclass, field - - -@dataclass -class BulkOperationStats: - """Statistics for bulk operations.""" - - rows_processed: int = 0 - ranges_completed: int = 0 - total_ranges: int = 0 - start_time: float = field(default_factory=time.time) - end_time: float | None = None - errors: list[Exception] = field(default_factory=list) - - @property - def duration_seconds(self) -> float: - """Calculate operation duration.""" - if self.end_time: - return self.end_time - self.start_time - return time.time() - self.start_time - - @property - def rows_per_second(self) -> float: - """Calculate processing rate.""" - duration = self.duration_seconds - if duration > 0: - return self.rows_processed / duration - return 0 - - @property - def progress_percentage(self) -> float: - """Calculate progress as percentage.""" - if self.total_ranges > 0: - return (self.ranges_completed / self.total_ranges) * 100 - return 0 - - @property - def is_complete(self) -> bool: - """Check if operation is complete.""" - return self.ranges_completed == self.total_ranges diff --git a/libs/async-cassandra-bulk/examples/bulk_operations/token_utils.py b/libs/async-cassandra-bulk/examples/bulk_operations/token_utils.py deleted file mode 100644 index 29c0c1a..0000000 --- a/libs/async-cassandra-bulk/examples/bulk_operations/token_utils.py +++ /dev/null @@ -1,185 +0,0 @@ -""" -Token range utilities 
for bulk operations. - -Handles token range discovery, splitting, and query generation. -""" - -from dataclasses import dataclass - -from async_cassandra import AsyncCassandraSession - -# Murmur3 token range boundaries -MIN_TOKEN = -(2**63) # -9223372036854775808 -MAX_TOKEN = 2**63 - 1 # 9223372036854775807 -TOTAL_TOKEN_RANGE = 2**64 - 1 # Total range size - - -@dataclass -class TokenRange: - """Represents a token range with replica information.""" - - start: int - end: int - replicas: list[str] - - @property - def size(self) -> int: - """Calculate the size of this token range.""" - if self.end >= self.start: - return self.end - self.start - else: - # Handle wraparound (e.g., 9223372036854775800 to -9223372036854775800) - return (MAX_TOKEN - self.start) + (self.end - MIN_TOKEN) + 1 - - @property - def fraction(self) -> float: - """Calculate what fraction of the total ring this range represents.""" - return self.size / TOTAL_TOKEN_RANGE - - -class TokenRangeSplitter: - """Splits token ranges for parallel processing.""" - - def split_single_range(self, token_range: TokenRange, split_count: int) -> list[TokenRange]: - """Split a single token range into approximately equal parts.""" - if split_count <= 1: - return [token_range] - - # Calculate split size - split_size = token_range.size // split_count - if split_size < 1: - # Range too small to split further - return [token_range] - - splits = [] - current_start = token_range.start - - for i in range(split_count): - if i == split_count - 1: - # Last split gets any remainder - current_end = token_range.end - else: - current_end = current_start + split_size - # Handle potential overflow - if current_end > MAX_TOKEN: - current_end = current_end - TOTAL_TOKEN_RANGE - - splits.append( - TokenRange(start=current_start, end=current_end, replicas=token_range.replicas) - ) - - current_start = current_end - - return splits - - def split_proportionally( - self, ranges: list[TokenRange], target_splits: int - ) -> list[TokenRange]: - """Split ranges proportionally based on their size.""" - if not ranges: - return [] - - # Calculate total size - total_size = sum(r.size for r in ranges) - - all_splits = [] - for token_range in ranges: - # Calculate number of splits for this range - range_fraction = token_range.size / total_size - range_splits = max(1, round(range_fraction * target_splits)) - - # Split the range - splits = self.split_single_range(token_range, range_splits) - all_splits.extend(splits) - - return all_splits - - def cluster_by_replicas( - self, ranges: list[TokenRange] - ) -> dict[tuple[str, ...], list[TokenRange]]: - """Group ranges by their replica sets.""" - clusters: dict[tuple[str, ...], list[TokenRange]] = {} - - for token_range in ranges: - # Use sorted tuple as key for consistency - replica_key = tuple(sorted(token_range.replicas)) - if replica_key not in clusters: - clusters[replica_key] = [] - clusters[replica_key].append(token_range) - - return clusters - - -async def discover_token_ranges(session: AsyncCassandraSession, keyspace: str) -> list[TokenRange]: - """Discover token ranges from cluster metadata.""" - # Access cluster through the underlying sync session - cluster = session._session.cluster - metadata = cluster.metadata - token_map = metadata.token_map - - if not token_map: - raise RuntimeError("Token map not available") - - # Get all tokens from the ring - all_tokens = sorted(token_map.ring) - if not all_tokens: - raise RuntimeError("No tokens found in ring") - - ranges = [] - - # Create ranges from consecutive tokens - for i 
in range(len(all_tokens)): - start_token = all_tokens[i] - # Wrap around to first token for the last range - end_token = all_tokens[(i + 1) % len(all_tokens)] - - # Handle wraparound - last range goes from last token to first token - if i == len(all_tokens) - 1: - # This is the wraparound range - start = start_token.value - end = all_tokens[0].value - else: - start = start_token.value - end = end_token.value - - # Get replicas for this token - replicas = token_map.get_replicas(keyspace, start_token) - replica_addresses = [str(r.address) for r in replicas] - - ranges.append(TokenRange(start=start, end=end, replicas=replica_addresses)) - - return ranges - - -def generate_token_range_query( - keyspace: str, - table: str, - partition_keys: list[str], - token_range: TokenRange, - columns: list[str] | None = None, -) -> str: - """Generate a CQL query for a specific token range. - - Note: This function assumes non-wraparound ranges. Wraparound ranges - (where end < start) should be handled by the caller by splitting them - into two separate queries. - """ - # Column selection - column_list = ", ".join(columns) if columns else "*" - - # Partition key list for token function - pk_list = ", ".join(partition_keys) - - # Generate token condition - if token_range.start == MIN_TOKEN: - # First range uses >= to include minimum token - token_condition = ( - f"token({pk_list}) >= {token_range.start} AND token({pk_list}) <= {token_range.end}" - ) - else: - # All other ranges use > to avoid duplicates - token_condition = ( - f"token({pk_list}) > {token_range.start} AND token({pk_list}) <= {token_range.end}" - ) - - return f"SELECT {column_list} FROM {keyspace}.{table} WHERE {token_condition}" diff --git a/libs/async-cassandra-bulk/examples/debug_coverage.py b/libs/async-cassandra-bulk/examples/debug_coverage.py deleted file mode 100644 index ca8c781..0000000 --- a/libs/async-cassandra-bulk/examples/debug_coverage.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python3 -"""Debug token range coverage issue.""" - -import asyncio - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator -from bulk_operations.token_utils import MIN_TOKEN, discover_token_ranges, generate_token_range_query - - -async def debug_coverage(): - """Debug why we're missing rows.""" - print("Debugging token range coverage...") - - async with AsyncCluster(contact_points=["localhost"]) as cluster: - session = await cluster.connect() - - # First, let's see what tokens our test data actually has - print("\nChecking token distribution of test data...") - - # Get a sample of tokens - result = await session.execute( - """ - SELECT id, token(id) as token_value - FROM bulk_test.test_data - LIMIT 20 - """ - ) - - print("Sample tokens:") - for row in result: - print(f" ID {row.id}: token = {row.token_value}") - - # Get min and max tokens in our data - result = await session.execute( - """ - SELECT MIN(token(id)) as min_token, MAX(token(id)) as max_token - FROM bulk_test.test_data - """ - ) - row = result.one() - print(f"\nActual token range in data: {row.min_token} to {row.max_token}") - print(f"MIN_TOKEN constant: {MIN_TOKEN}") - - # Now let's see our token ranges - ranges = await discover_token_ranges(session, "bulk_test") - sorted_ranges = sorted(ranges, key=lambda r: r.start) - - print("\nFirst 5 token ranges:") - for i, r in enumerate(sorted_ranges[:5]): - print(f" Range {i}: {r.start} to {r.end}") - - # Check if any of our data falls outside the discovered ranges - print("\nChecking for 
data outside discovered ranges...") - - # Find the range that should contain MIN_TOKEN - min_token_range = None - for r in sorted_ranges: - if r.start <= row.min_token <= r.end: - min_token_range = r - break - - if min_token_range: - print( - f"Range containing minimum data token: {min_token_range.start} to {min_token_range.end}" - ) - else: - print("WARNING: No range found containing minimum data token!") - - # Let's also check if we have the wraparound issue - print(f"\nLast range: {sorted_ranges[-1].start} to {sorted_ranges[-1].end}") - print(f"First range: {sorted_ranges[0].start} to {sorted_ranges[0].end}") - - # The issue might be with how we handle the wraparound - # In Cassandra's token ring, the last range wraps to the first - # Let's verify this - if sorted_ranges[-1].end != sorted_ranges[0].start: - print( - f"WARNING: Ring not properly closed! Last end: {sorted_ranges[-1].end}, First start: {sorted_ranges[0].start}" - ) - - # Test the actual queries - print("\nTesting actual token range queries...") - operator = TokenAwareBulkOperator(session) - - # Get table metadata - table_meta = await operator._get_table_metadata("bulk_test", "test_data") - partition_keys = [col.name for col in table_meta.partition_key] - - # Test first range query - first_query = generate_token_range_query( - "bulk_test", "test_data", partition_keys, sorted_ranges[0] - ) - print(f"\nFirst range query: {first_query}") - count_query = first_query.replace("SELECT *", "SELECT COUNT(*)") - result = await session.execute(count_query) - print(f"Rows in first range: {result.one()[0]}") - - # Test last range query - last_query = generate_token_range_query( - "bulk_test", "test_data", partition_keys, sorted_ranges[-1] - ) - print(f"\nLast range query: {last_query}") - count_query = last_query.replace("SELECT *", "SELECT COUNT(*)") - result = await session.execute(count_query) - print(f"Rows in last range: {result.one()[0]}") - - -if __name__ == "__main__": - try: - asyncio.run(debug_coverage()) - except Exception as e: - print(f"Error: {e}") - import traceback - - traceback.print_exc() diff --git a/libs/async-cassandra-bulk/examples/docker-compose-single.yml b/libs/async-cassandra-bulk/examples/docker-compose-single.yml deleted file mode 100644 index 073b12d..0000000 --- a/libs/async-cassandra-bulk/examples/docker-compose-single.yml +++ /dev/null @@ -1,46 +0,0 @@ -version: '3.8' - -# Single node Cassandra for testing with limited resources - -services: - cassandra-1: - image: cassandra:5.0 - container_name: bulk-cassandra-1 - hostname: cassandra-1 - environment: - - CASSANDRA_CLUSTER_NAME=BulkOpsCluster - - CASSANDRA_DC=datacenter1 - - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch - - CASSANDRA_NUM_TOKENS=256 - - MAX_HEAP_SIZE=1G - - JVM_OPTS=-XX:+UseG1GC -XX:G1RSetUpdatingPauseTimePercent=5 -XX:MaxGCPauseMillis=300 - - ports: - - "9042:9042" - volumes: - - cassandra1-data:/var/lib/cassandra - - deploy: - resources: - limits: - memory: 2G - reservations: - memory: 1G - - healthcheck: - test: ["CMD-SHELL", "nodetool info | grep -q 'Native Transport active: true' && cqlsh -e 'SELECT now() FROM system.local'"] - interval: 30s - timeout: 10s - retries: 15 - start_period: 90s - - networks: - - cassandra-net - -networks: - cassandra-net: - driver: bridge - -volumes: - cassandra1-data: - driver: local diff --git a/libs/async-cassandra-bulk/examples/docker-compose.yml b/libs/async-cassandra-bulk/examples/docker-compose.yml deleted file mode 100644 index 82e571c..0000000 --- 
a/libs/async-cassandra-bulk/examples/docker-compose.yml +++ /dev/null @@ -1,160 +0,0 @@ -version: '3.8' - -# Bulk Operations Example - 3-node Cassandra cluster -# Optimized for token-aware bulk operations testing - -services: - # First Cassandra node (seed) - cassandra-1: - image: cassandra:5.0 - container_name: bulk-cassandra-1 - hostname: cassandra-1 - environment: - # Cluster configuration - - CASSANDRA_CLUSTER_NAME=BulkOpsCluster - - CASSANDRA_SEEDS=cassandra-1 - - CASSANDRA_DC=datacenter1 - - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch - - CASSANDRA_NUM_TOKENS=256 - - # Memory settings (reduced for development) - - MAX_HEAP_SIZE=2G - - JVM_OPTS=-XX:+UseG1GC -XX:G1RSetUpdatingPauseTimePercent=5 -XX:MaxGCPauseMillis=300 - - ports: - - "9042:9042" - volumes: - - cassandra1-data:/var/lib/cassandra - - # Resource limits for stability - deploy: - resources: - limits: - memory: 3G - reservations: - memory: 2G - - healthcheck: - test: ["CMD-SHELL", "nodetool info | grep -q 'Native Transport active: true' && cqlsh -e 'SELECT now() FROM system.local'"] - interval: 30s - timeout: 10s - retries: 15 - start_period: 120s - - networks: - - cassandra-net - - # Second Cassandra node - cassandra-2: - image: cassandra:5.0 - container_name: bulk-cassandra-2 - hostname: cassandra-2 - environment: - - CASSANDRA_CLUSTER_NAME=BulkOpsCluster - - CASSANDRA_SEEDS=cassandra-1 - - CASSANDRA_DC=datacenter1 - - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch - - CASSANDRA_NUM_TOKENS=256 - - MAX_HEAP_SIZE=2G - - JVM_OPTS=-XX:+UseG1GC -XX:G1RSetUpdatingPauseTimePercent=5 -XX:MaxGCPauseMillis=300 - - ports: - - "9043:9042" - volumes: - - cassandra2-data:/var/lib/cassandra - depends_on: - cassandra-1: - condition: service_healthy - - deploy: - resources: - limits: - memory: 3G - reservations: - memory: 2G - - healthcheck: - test: ["CMD-SHELL", "nodetool info | grep -q 'Native Transport active: true' && nodetool status | grep -c UN | grep -q 2"] - interval: 30s - timeout: 10s - retries: 15 - start_period: 120s - - networks: - - cassandra-net - - # Third Cassandra node - starts after cassandra-2 to avoid overwhelming the system - cassandra-3: - image: cassandra:5.0 - container_name: bulk-cassandra-3 - hostname: cassandra-3 - environment: - - CASSANDRA_CLUSTER_NAME=BulkOpsCluster - - CASSANDRA_SEEDS=cassandra-1 - - CASSANDRA_DC=datacenter1 - - CASSANDRA_ENDPOINT_SNITCH=GossipingPropertyFileSnitch - - CASSANDRA_NUM_TOKENS=256 - - MAX_HEAP_SIZE=2G - - JVM_OPTS=-XX:+UseG1GC -XX:G1RSetUpdatingPauseTimePercent=5 -XX:MaxGCPauseMillis=300 - - ports: - - "9044:9042" - volumes: - - cassandra3-data:/var/lib/cassandra - depends_on: - cassandra-2: - condition: service_healthy - - deploy: - resources: - limits: - memory: 3G - reservations: - memory: 2G - - healthcheck: - test: ["CMD-SHELL", "nodetool info | grep -q 'Native Transport active: true' && nodetool status | grep -c UN | grep -q 3"] - interval: 30s - timeout: 10s - retries: 15 - start_period: 120s - - networks: - - cassandra-net - - # Initialization container - creates keyspace and tables - init-cassandra: - image: cassandra:5.0 - container_name: bulk-init - depends_on: - cassandra-3: - condition: service_healthy - volumes: - - ./scripts/init.cql:/init.cql:ro - command: > - bash -c " - echo 'Waiting for cluster to stabilize...'; - sleep 15; - echo 'Checking cluster status...'; - until cqlsh cassandra-1 -e 'SELECT now() FROM system.local'; do - echo 'Waiting for Cassandra to be ready...'; - sleep 5; - done; - echo 'Creating keyspace and tables...'; - cqlsh 
cassandra-1 -f /init.cql || echo 'Init script may have already run'; - echo 'Initialization complete!'; - " - networks: - - cassandra-net - -networks: - cassandra-net: - driver: bridge - -volumes: - cassandra1-data: - driver: local - cassandra2-data: - driver: local - cassandra3-data: - driver: local diff --git a/libs/async-cassandra-bulk/examples/example_count.py b/libs/async-cassandra-bulk/examples/example_count.py deleted file mode 100644 index f8b7b77..0000000 --- a/libs/async-cassandra-bulk/examples/example_count.py +++ /dev/null @@ -1,207 +0,0 @@ -#!/usr/bin/env python3 -""" -Example: Token-aware bulk count operation. - -This example demonstrates how to count all rows in a table -using token-aware parallel processing for maximum performance. -""" - -import asyncio -import logging -import time - -from rich.console import Console -from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn -from rich.table import Table - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - -# Configure logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# Rich console for pretty output -console = Console() - - -async def count_table_example(): - """Demonstrate token-aware counting of a large table.""" - - # Connect to cluster - console.print("[cyan]Connecting to Cassandra cluster...[/cyan]") - - async with AsyncCluster(contact_points=["localhost", "127.0.0.1"], port=9042) as cluster: - session = await cluster.connect() - # Create test data if needed - console.print("[yellow]Setting up test keyspace and table...[/yellow]") - - # Create keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS bulk_demo - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 3 - } - """ - ) - - # Create table - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_demo.large_table ( - partition_key INT, - clustering_key INT, - data TEXT, - value DOUBLE, - PRIMARY KEY (partition_key, clustering_key) - ) - """ - ) - - # Check if we need to insert test data - result = await session.execute("SELECT COUNT(*) FROM bulk_demo.large_table LIMIT 1") - current_count = result.one().count - - if current_count < 10000: - console.print( - f"[yellow]Table has {current_count} rows. " f"Inserting test data...[/yellow]" - ) - - # Insert some test data using prepared statement - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_demo.large_table - (partition_key, clustering_key, data, value) - VALUES (?, ?, ?, ?) 
- """ - ) - - with Progress( - SpinnerColumn(), - *Progress.get_default_columns(), - TimeElapsedColumn(), - console=console, - ) as progress: - task = progress.add_task("[green]Inserting test data...", total=10000) - - for pk in range(100): - for ck in range(100): - await session.execute( - insert_stmt, (pk, ck, f"data-{pk}-{ck}", pk * ck * 0.1) - ) - progress.update(task, advance=1) - - # Now demonstrate bulk counting - console.print("\n[bold cyan]Token-Aware Bulk Count Demo[/bold cyan]\n") - - operator = TokenAwareBulkOperator(session) - - # Progress tracking - stats_list = [] - - def progress_callback(stats): - """Track progress during operation.""" - stats_list.append( - { - "rows": stats.rows_processed, - "ranges": stats.ranges_completed, - "total_ranges": stats.total_ranges, - "progress": stats.progress_percentage, - "rate": stats.rows_per_second, - } - ) - - # Perform count with different split counts - table = Table(title="Bulk Count Performance Comparison") - table.add_column("Split Count", style="cyan") - table.add_column("Total Rows", style="green") - table.add_column("Duration (s)", style="yellow") - table.add_column("Rows/Second", style="magenta") - table.add_column("Ranges Processed", style="blue") - - for split_count in [1, 4, 8, 16, 32]: - console.print(f"\n[cyan]Counting with {split_count} splits...[/cyan]") - - start_time = time.time() - - try: - with Progress( - SpinnerColumn(), - *Progress.get_default_columns(), - TimeElapsedColumn(), - console=console, - ) as progress: - current_task = progress.add_task( - f"[green]Counting with {split_count} splits...", total=100 - ) - - # Track progress - last_progress = 0 - - def update_progress(stats, task=current_task): - nonlocal last_progress - progress.update(task, completed=int(stats.progress_percentage)) - last_progress = stats.progress_percentage - progress_callback(stats) - - count, final_stats = await operator.count_by_token_ranges_with_stats( - keyspace="bulk_demo", - table="large_table", - split_count=split_count, - progress_callback=update_progress, - ) - - duration = time.time() - start_time - - table.add_row( - str(split_count), - f"{count:,}", - f"{duration:.2f}", - f"{final_stats.rows_per_second:,.0f}", - str(final_stats.ranges_completed), - ) - - except Exception as e: - console.print(f"[red]Error: {e}[/red]") - continue - - # Display results - console.print("\n") - console.print(table) - - # Show token range distribution - console.print("\n[bold]Token Range Analysis:[/bold]") - - from bulk_operations.token_utils import discover_token_ranges - - ranges = await discover_token_ranges(session, "bulk_demo") - - range_table = Table(title="Natural Token Ranges") - range_table.add_column("Range #", style="cyan") - range_table.add_column("Start Token", style="green") - range_table.add_column("End Token", style="yellow") - range_table.add_column("Size", style="magenta") - range_table.add_column("Replicas", style="blue") - - for i, r in enumerate(ranges[:5]): # Show first 5 - range_table.add_row( - str(i + 1), str(r.start), str(r.end), f"{r.size:,}", ", ".join(r.replicas) - ) - - if len(ranges) > 5: - range_table.add_row("...", "...", "...", "...", "...") - - console.print(range_table) - console.print(f"\nTotal natural ranges: {len(ranges)}") - - -if __name__ == "__main__": - try: - asyncio.run(count_table_example()) - except KeyboardInterrupt: - console.print("\n[yellow]Operation cancelled by user[/yellow]") - except Exception as e: - console.print(f"\n[red]Error: {e}[/red]") - logger.exception("Unexpected error") diff 
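The script above wraps the core call in Rich progress UI; stripped to essentials, the count API amounts to the following sketch (split count and keyspace/table names are illustrative):

```python
# Minimal sketch of the core call used in example_count.py, without the Rich UI.
import asyncio

from async_cassandra import AsyncCluster
from bulk_operations.bulk_operator import TokenAwareBulkOperator

async def main() -> None:
    async with AsyncCluster(contact_points=["localhost"]) as cluster:
        session = await cluster.connect()
        operator = TokenAwareBulkOperator(session)

        count, stats = await operator.count_by_token_ranges_with_stats(
            keyspace="bulk_demo",
            table="large_table",
            split_count=8,
            progress_callback=lambda s: print(f"{s.progress_percentage:.0f}%"),
        )
        print(f"rows={count:,} rate={stats.rows_per_second:,.0f}/s")

asyncio.run(main())
```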
--git a/libs/async-cassandra-bulk/examples/example_csv_export.py b/libs/async-cassandra-bulk/examples/example_csv_export.py deleted file mode 100755 index 1d3ceda..0000000 --- a/libs/async-cassandra-bulk/examples/example_csv_export.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python3 -""" -Example: Export Cassandra table to CSV format. - -This demonstrates: -- Basic CSV export -- Compressed CSV export -- Custom delimiters and NULL handling -- Progress tracking -- Resume capability -""" - -import asyncio -import logging -from pathlib import Path - -from rich.console import Console -from rich.logging import RichHandler -from rich.progress import Progress, SpinnerColumn, TextColumn -from rich.table import Table - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(message)s", - handlers=[RichHandler(console=Console(stderr=True))], -) -logger = logging.getLogger(__name__) - - -async def export_examples(): - """Run various CSV export examples.""" - console = Console() - - # Connect to Cassandra - console.print("\n[bold blue]Connecting to Cassandra...[/bold blue]") - cluster = AsyncCluster(["localhost"]) - session = await cluster.connect() - - try: - # Ensure test data exists - await setup_test_data(session) - - # Create bulk operator - operator = TokenAwareBulkOperator(session) - - # Example 1: Basic CSV export - console.print("\n[bold green]Example 1: Basic CSV Export[/bold green]") - output_path = Path("exports/products.csv") - output_path.parent.mkdir(exist_ok=True) - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - console=console, - ) as progress: - task = progress.add_task("Exporting to CSV...", total=None) - - def progress_callback(export_progress): - progress.update( - task, - description=f"Exported {export_progress.rows_exported:,} rows " - f"({export_progress.progress_percentage:.1f}%)", - ) - - result = await operator.export_to_csv( - keyspace="bulk_demo", - table="products", - output_path=output_path, - progress_callback=progress_callback, - ) - - console.print(f"โœ“ Exported {result.rows_exported:,} rows to {output_path}") - console.print(f" File size: {result.bytes_written:,} bytes") - - # Example 2: Compressed CSV with custom delimiter - console.print("\n[bold green]Example 2: Compressed Tab-Delimited Export[/bold green]") - output_path = Path("exports/products_tab.csv") - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - console=console, - ) as progress: - task = progress.add_task("Exporting compressed CSV...", total=None) - - def progress_callback(export_progress): - progress.update( - task, - description=f"Exported {export_progress.rows_exported:,} rows", - ) - - result = await operator.export_to_csv( - keyspace="bulk_demo", - table="products", - output_path=output_path, - delimiter="\t", - compression="gzip", - progress_callback=progress_callback, - ) - - console.print(f"โœ“ Exported to {output_path}.gzip") - console.print(f" Compressed size: {result.bytes_written:,} bytes") - - # Example 3: Export with specific columns and NULL handling - console.print("\n[bold green]Example 3: Selective Column Export[/bold green]") - output_path = Path("exports/products_summary.csv") - - result = await operator.export_to_csv( - keyspace="bulk_demo", - table="products", - output_path=output_path, - columns=["id", "name", "price", "category"], - 
null_string="NULL", - ) - - console.print(f"โœ“ Exported {result.rows_exported:,} rows (selected columns)") - - # Show export summary - console.print("\n[bold cyan]Export Summary:[/bold cyan]") - summary_table = Table(show_header=True, header_style="bold magenta") - summary_table.add_column("Export", style="cyan") - summary_table.add_column("Format", style="green") - summary_table.add_column("Rows", justify="right") - summary_table.add_column("Size", justify="right") - summary_table.add_column("Compression") - - summary_table.add_row( - "products.csv", - "CSV", - "10,000", - "~500 KB", - "None", - ) - summary_table.add_row( - "products_tab.csv.gzip", - "TSV", - "10,000", - "~150 KB", - "gzip", - ) - summary_table.add_row( - "products_summary.csv", - "CSV", - "10,000", - "~300 KB", - "None", - ) - - console.print(summary_table) - - # Example 4: Demonstrate resume capability - console.print("\n[bold green]Example 4: Resume Capability[/bold green]") - console.print("Progress files saved at:") - for csv_file in Path("exports").glob("*.csv"): - progress_file = csv_file.with_suffix(".csv.progress") - if progress_file.exists(): - console.print(f" โ€ข {progress_file}") - - finally: - await session.close() - await cluster.shutdown() - - -async def setup_test_data(session): - """Create test keyspace and data if not exists.""" - # Create keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS bulk_demo - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - # Create table - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_demo.products ( - id INT PRIMARY KEY, - name TEXT, - description TEXT, - price DECIMAL, - category TEXT, - in_stock BOOLEAN, - tags SET, - attributes MAP, - created_at TIMESTAMP - ) - """ - ) - - # Check if data exists - result = await session.execute("SELECT COUNT(*) FROM bulk_demo.products") - count = result.one().count - - if count < 10000: - logger.info("Inserting test data...") - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_demo.products - (id, name, description, price, category, in_stock, tags, attributes, created_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, toTimestamp(now())) - """ - ) - - # Insert in batches - for i in range(10000): - await session.execute( - insert_stmt, - ( - i, - f"Product {i}", - f"Description for product {i}" if i % 3 != 0 else None, - float(10 + (i % 1000) * 0.1), - ["Electronics", "Books", "Clothing", "Food"][i % 4], - i % 5 != 0, # 80% in stock - {"tag1", f"tag{i % 10}"} if i % 2 == 0 else None, - {"color": ["red", "blue", "green"][i % 3], "size": "M"} if i % 4 == 0 else {}, - ), - ) - - -if __name__ == "__main__": - asyncio.run(export_examples()) diff --git a/libs/async-cassandra-bulk/examples/example_export_formats.py b/libs/async-cassandra-bulk/examples/example_export_formats.py deleted file mode 100755 index f6ca15f..0000000 --- a/libs/async-cassandra-bulk/examples/example_export_formats.py +++ /dev/null @@ -1,283 +0,0 @@ -#!/usr/bin/env python3 -""" -Example: Export Cassandra data to multiple formats. - -This demonstrates exporting to: -- CSV (with compression) -- JSON (line-delimited and array) -- Parquet (foundation for Iceberg) - -Shows why Parquet is critical for the Iceberg integration. 
-""" - -import asyncio -import logging -from pathlib import Path - -from rich.console import Console -from rich.logging import RichHandler -from rich.panel import Panel -from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeRemainingColumn -from rich.table import Table - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(message)s", - handlers=[RichHandler(console=Console(stderr=True))], -) -logger = logging.getLogger(__name__) - - -async def export_format_examples(): - """Demonstrate all export formats.""" - console = Console() - - # Header - console.print( - Panel.fit( - "[bold cyan]Cassandra Bulk Export Examples[/bold cyan]\n" - "Exporting to CSV, JSON, and Parquet formats", - border_style="cyan", - ) - ) - - # Connect to Cassandra - console.print("\n[bold blue]Connecting to Cassandra...[/bold blue]") - cluster = AsyncCluster(["localhost"]) - session = await cluster.connect() - - try: - # Setup test data - await setup_test_data(session) - - # Create bulk operator - operator = TokenAwareBulkOperator(session) - - # Create exports directory - exports_dir = Path("exports") - exports_dir.mkdir(exist_ok=True) - - # Export to different formats - results = {} - - # 1. CSV Export - console.print("\n[bold green]1. CSV Export (Universal Format)[/bold green]") - console.print(" โ€ข Human readable") - console.print(" โ€ข Compatible with Excel, databases, etc.") - console.print(" โ€ข Good for data exchange") - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TimeRemainingColumn(), - console=console, - ) as progress: - task = progress.add_task("Exporting to CSV...", total=100) - - def csv_progress(export_progress): - progress.update( - task, - completed=export_progress.progress_percentage, - description=f"CSV: {export_progress.rows_exported:,} rows", - ) - - results["csv"] = await operator.export_to_csv( - keyspace="export_demo", - table="events", - output_path=exports_dir / "events.csv", - compression="gzip", - progress_callback=csv_progress, - ) - - # 2. JSON Export (Line-delimited) - console.print("\n[bold green]2. JSON Export (Streaming Format)[/bold green]") - console.print(" โ€ข Preserves data types") - console.print(" โ€ข Works with streaming tools") - console.print(" โ€ข Good for data pipelines") - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TimeRemainingColumn(), - console=console, - ) as progress: - task = progress.add_task("Exporting to JSONL...", total=100) - - def json_progress(export_progress): - progress.update( - task, - completed=export_progress.progress_percentage, - description=f"JSON: {export_progress.rows_exported:,} rows", - ) - - results["json"] = await operator.export_to_json( - keyspace="export_demo", - table="events", - output_path=exports_dir / "events.jsonl", - format_mode="jsonl", - compression="gzip", - progress_callback=json_progress, - ) - - # 3. Parquet Export (Foundation for Iceberg) - console.print("\n[bold yellow]3. 
Parquet Export (CRITICAL for Iceberg)[/bold yellow]") - console.print(" โ€ข Columnar format for analytics") - console.print(" โ€ข Excellent compression") - console.print(" โ€ข Schema included in file") - console.print(" โ€ข [bold red]This is what Iceberg uses![/bold red]") - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TimeRemainingColumn(), - console=console, - ) as progress: - task = progress.add_task("Exporting to Parquet...", total=100) - - def parquet_progress(export_progress): - progress.update( - task, - completed=export_progress.progress_percentage, - description=f"Parquet: {export_progress.rows_exported:,} rows", - ) - - results["parquet"] = await operator.export_to_parquet( - keyspace="export_demo", - table="events", - output_path=exports_dir / "events.parquet", - compression="snappy", - row_group_size=10000, - progress_callback=parquet_progress, - ) - - # Show results comparison - console.print("\n[bold cyan]Export Results Comparison:[/bold cyan]") - comparison = Table(show_header=True, header_style="bold magenta") - comparison.add_column("Format", style="cyan") - comparison.add_column("File", style="green") - comparison.add_column("Size", justify="right") - comparison.add_column("Rows", justify="right") - comparison.add_column("Time", justify="right") - - for format_name, result in results.items(): - file_path = Path(result.output_path) - if format_name != "parquet" and result.metadata.get("compression"): - file_path = file_path.with_suffix( - file_path.suffix + f".{result.metadata['compression']}" - ) - - size_mb = result.bytes_written / (1024 * 1024) - duration = (result.completed_at - result.started_at).total_seconds() - - comparison.add_row( - format_name.upper(), - file_path.name, - f"{size_mb:.1f} MB", - f"{result.rows_exported:,}", - f"{duration:.1f}s", - ) - - console.print(comparison) - - # Explain Parquet importance - console.print( - Panel( - "[bold yellow]Why Parquet Matters for Iceberg:[/bold yellow]\n\n" - "โ€ข Iceberg tables store data in Parquet files\n" - "โ€ข Columnar format enables fast analytics queries\n" - "โ€ข Built-in schema makes evolution easier\n" - "โ€ข Compression reduces storage costs\n" - "โ€ข Row groups enable efficient filtering\n\n" - "[bold cyan]Next Phase:[/bold cyan] These Parquet files will become " - "Iceberg table data files!", - title="[bold red]The Path to Iceberg[/bold red]", - border_style="yellow", - ) - ) - - finally: - await session.close() - await cluster.shutdown() - - -async def setup_test_data(session): - """Create test keyspace and data.""" - # Create keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS export_demo - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - # Create events table with various data types - await session.execute( - """ - CREATE TABLE IF NOT EXISTS export_demo.events ( - event_id UUID PRIMARY KEY, - event_type TEXT, - user_id INT, - timestamp TIMESTAMP, - properties MAP, - tags SET, - metrics LIST, - is_processed BOOLEAN, - processing_time DECIMAL - ) - """ - ) - - # Check if data exists - result = await session.execute("SELECT COUNT(*) FROM export_demo.events") - count = result.one().count - - if count < 50000: - logger.info("Inserting test events...") - insert_stmt = await session.prepare( - """ - INSERT INTO export_demo.events - (event_id, event_type, user_id, timestamp, properties, - tags, metrics, is_processed, processing_time) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """ - ) - - # Insert test events - import uuid - from datetime import datetime, timedelta - from decimal import Decimal - - base_time = datetime.now() - timedelta(days=30) - event_types = ["login", "purchase", "view", "click", "logout"] - - for i in range(50000): - event_time = base_time + timedelta(seconds=i * 60) - - await session.execute( - insert_stmt, - ( - uuid.uuid4(), - event_types[i % len(event_types)], - i % 1000, # user_id - event_time, - {"source": "web", "version": "2.0"} if i % 3 == 0 else {}, - {f"tag{i % 5}", f"cat{i % 3}"} if i % 2 == 0 else None, - [float(i), float(i * 0.1), float(i * 0.01)] if i % 4 == 0 else None, - i % 10 != 0, # 90% processed - Decimal(str(0.001 * (i % 1000))), - ), - ) - - -if __name__ == "__main__": - asyncio.run(export_format_examples()) diff --git a/libs/async-cassandra-bulk/examples/example_iceberg_export.py b/libs/async-cassandra-bulk/examples/example_iceberg_export.py deleted file mode 100644 index 1a08f1b..0000000 --- a/libs/async-cassandra-bulk/examples/example_iceberg_export.py +++ /dev/null @@ -1,302 +0,0 @@ -#!/usr/bin/env python3 -"""Example: Export Cassandra data to Apache Iceberg tables. - -This demonstrates the power of Apache Iceberg: -- ACID transactions on data lakes -- Schema evolution -- Time travel queries -- Hidden partitioning -- Integration with modern analytics tools -""" - -import asyncio -import logging -from datetime import datetime, timedelta -from pathlib import Path - -from pyiceberg.partitioning import PartitionField, PartitionSpec -from pyiceberg.transforms import DayTransform -from rich.console import Console -from rich.logging import RichHandler -from rich.panel import Panel -from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn, TimeRemainingColumn -from rich.table import Table as RichTable - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator -from bulk_operations.iceberg import IcebergExporter - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(message)s", - handlers=[RichHandler(console=Console(stderr=True))], -) -logger = logging.getLogger(__name__) - - -async def iceberg_export_demo(): - """Demonstrate Cassandra to Iceberg export with advanced features.""" - console = Console() - - # Header - console.print( - Panel.fit( - "[bold cyan]Apache Iceberg Export Demo[/bold cyan]\n" - "Exporting Cassandra data to modern data lakehouse format", - border_style="cyan", - ) - ) - - # Connect to Cassandra - console.print("\n[bold blue]1. Connecting to Cassandra...[/bold blue]") - cluster = AsyncCluster(["localhost"]) - session = await cluster.connect() - - try: - # Setup test data - await setup_demo_data(session, console) - - # Create bulk operator - operator = TokenAwareBulkOperator(session) - - # Configure Iceberg export - warehouse_path = Path("iceberg_warehouse") - console.print( - f"\n[bold blue]2. 
Setting up Iceberg warehouse at:[/bold blue] {warehouse_path}" - ) - - # Create Iceberg exporter - exporter = IcebergExporter( - operator=operator, - warehouse_path=warehouse_path, - compression="snappy", - row_group_size=10000, - ) - - # Example 1: Basic export - console.print("\n[bold green]Example 1: Basic Iceberg Export[/bold green]") - console.print(" โ€ข Creates Iceberg table from Cassandra schema") - console.print(" โ€ข Writes data in Parquet format") - console.print(" โ€ข Enables ACID transactions") - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TimeRemainingColumn(), - console=console, - ) as progress: - task = progress.add_task("Exporting to Iceberg...", total=100) - - def iceberg_progress(export_progress): - progress.update( - task, - completed=export_progress.progress_percentage, - description=f"Iceberg: {export_progress.rows_exported:,} rows", - ) - - result = await exporter.export( - keyspace="iceberg_demo", - table="user_events", - namespace="cassandra_export", - table_name="user_events", - progress_callback=iceberg_progress, - ) - - console.print(f"โœ“ Exported {result.rows_exported:,} rows to Iceberg") - console.print(" Table: iceberg://cassandra_export.user_events") - - # Example 2: Partitioned export - console.print("\n[bold green]Example 2: Partitioned Iceberg Table[/bold green]") - console.print(" โ€ข Partitions by day for efficient queries") - console.print(" โ€ข Hidden partitioning (no query changes needed)") - console.print(" โ€ข Automatic partition pruning") - - # Create partition spec (partition by day) - partition_spec = PartitionSpec( - PartitionField( - source_id=4, # event_time field ID - field_id=1000, - transform=DayTransform(), - name="event_day", - ) - ) - - with Progress( - SpinnerColumn(), - TextColumn("[progress.description]{task.description}"), - BarColumn(), - TimeRemainingColumn(), - console=console, - ) as progress: - task = progress.add_task("Exporting with partitions...", total=100) - - def partition_progress(export_progress): - progress.update( - task, - completed=export_progress.progress_percentage, - description=f"Partitioned: {export_progress.rows_exported:,} rows", - ) - - result = await exporter.export( - keyspace="iceberg_demo", - table="user_events", - namespace="cassandra_export", - table_name="user_events_partitioned", - partition_spec=partition_spec, - progress_callback=partition_progress, - ) - - console.print("โœ“ Created partitioned Iceberg table") - console.print(" Partitioned by: event_day (daily partitions)") - - # Show Iceberg features - console.print("\n[bold cyan]Iceberg Features Enabled:[/bold cyan]") - features = RichTable(show_header=True, header_style="bold magenta") - features.add_column("Feature", style="cyan") - features.add_column("Description", style="green") - features.add_column("Example Query") - - features.add_row( - "Time Travel", - "Query data at any point in time", - "SELECT * FROM table AS OF '2025-01-01'", - ) - features.add_row( - "Schema Evolution", - "Add/drop/rename columns safely", - "ALTER TABLE table ADD COLUMN new_field STRING", - ) - features.add_row( - "Hidden Partitioning", - "Partition pruning without query changes", - "WHERE event_time > '2025-01-01' -- uses partitions", - ) - features.add_row( - "ACID Transactions", - "Atomic commits and rollbacks", - "Multiple concurrent writers supported", - ) - features.add_row( - "Incremental Processing", - "Process only new data", - "Read incrementally from snapshot N to M", - ) - - 
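- # NOTE: the "Example Query" column above shows SQL for engines that read
- # Iceberg tables (Spark, Trino, Flink, ...); these are not CQL statements
- # and will not run against Cassandra itself.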
console.print(features) - - # Explain the power of Iceberg - console.print( - Panel( - "[bold yellow]Why Apache Iceberg Matters:[/bold yellow]\n\n" - "โ€ข [cyan]Netflix Scale:[/cyan] Created by Netflix to handle petabytes\n" - "โ€ข [cyan]Open Format:[/cyan] Works with Spark, Trino, Flink, and more\n" - "โ€ข [cyan]Cloud Native:[/cyan] Designed for S3, GCS, Azure storage\n" - "โ€ข [cyan]Performance:[/cyan] Faster than traditional data lakes\n" - "โ€ข [cyan]Reliability:[/cyan] ACID guarantees prevent data corruption\n\n" - "[bold green]Your Cassandra data is now ready for:[/bold green]\n" - "โ€ข Analytics with Spark or Trino\n" - "โ€ข Machine learning pipelines\n" - "โ€ข Data warehousing with Snowflake/BigQuery\n" - "โ€ข Real-time processing with Flink", - title="[bold red]The Modern Data Lakehouse[/bold red]", - border_style="yellow", - ) - ) - - # Show next steps - console.print("\n[bold blue]Next Steps:[/bold blue]") - console.print( - "1. Query with Spark: spark.read.format('iceberg').load('cassandra_export.user_events')" - ) - console.print( - "2. Time travel: SELECT * FROM user_events FOR SYSTEM_TIME AS OF '2025-01-01'" - ) - console.print("3. Schema evolution: ALTER TABLE user_events ADD COLUMNS (score DOUBLE)") - console.print(f"4. Explore warehouse: {warehouse_path}/") - - finally: - await session.close() - await cluster.shutdown() - - -async def setup_demo_data(session, console): - """Create demo keyspace and data.""" - console.print("\n[bold blue]Setting up demo data...[/bold blue]") - - # Create keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS iceberg_demo - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - # Create table with various data types - await session.execute( - """ - CREATE TABLE IF NOT EXISTS iceberg_demo.user_events ( - user_id UUID, - event_id UUID, - event_type TEXT, - event_time TIMESTAMP, - properties MAP, - metrics MAP, - tags SET, - is_processed BOOLEAN, - score DECIMAL, - PRIMARY KEY (user_id, event_time, event_id) - ) WITH CLUSTERING ORDER BY (event_time DESC, event_id ASC) - """ - ) - - # Check if data exists - result = await session.execute("SELECT COUNT(*) FROM iceberg_demo.user_events") - count = result.one().count - - if count < 10000: - console.print(" Inserting sample events...") - insert_stmt = await session.prepare( - """ - INSERT INTO iceberg_demo.user_events - (user_id, event_id, event_type, event_time, properties, - metrics, tags, is_processed, score) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) 
- """ - ) - - # Insert events over the last 30 days - import uuid - from decimal import Decimal - - base_time = datetime.now() - timedelta(days=30) - event_types = ["login", "purchase", "view", "click", "share", "logout"] - - for i in range(10000): - user_id = uuid.UUID(f"00000000-0000-0000-0000-{i % 100:012d}") - event_time = base_time + timedelta(minutes=i * 5) - - await session.execute( - insert_stmt, - ( - user_id, - uuid.uuid4(), - event_types[i % len(event_types)], - event_time, - {"device": "mobile", "version": "2.0"} if i % 3 == 0 else {}, - {"duration": float(i % 300), "count": float(i % 10)}, - {f"tag{i % 5}", f"category{i % 3}"}, - i % 10 != 0, # 90% processed - Decimal(str(0.1 * (i % 100))), - ), - ) - - console.print(" โœ“ Created 10,000 events across 100 users") - - -if __name__ == "__main__": - asyncio.run(iceberg_export_demo()) diff --git a/libs/async-cassandra-bulk/examples/exports/.gitignore b/libs/async-cassandra-bulk/examples/exports/.gitignore deleted file mode 100644 index c4f1b4c..0000000 --- a/libs/async-cassandra-bulk/examples/exports/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# Ignore all exported files -* -# But keep this .gitignore file -!.gitignore diff --git a/libs/async-cassandra-bulk/examples/fix_export_consistency.py b/libs/async-cassandra-bulk/examples/fix_export_consistency.py deleted file mode 100644 index dbd3293..0000000 --- a/libs/async-cassandra-bulk/examples/fix_export_consistency.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 -"""Fix the export_by_token_ranges method to handle consistency level properly.""" - -# Here's the corrected version of the export_by_token_ranges method - -corrected_code = """ - # Stream results from each range - for split in splits: - # Check if this is a wraparound range - if split.end < split.start: - # Wraparound range needs to be split into two queries - # First part: from start to MAX_TOKEN - if consistency_level is not None: - async with await self.session.execute_stream( - prepared_stmts["select_wraparound_gt"], - (split.start,), - consistency_level=consistency_level - ) as result: - async for row in result: - stats.rows_processed += 1 - yield row - else: - async with await self.session.execute_stream( - prepared_stmts["select_wraparound_gt"], - (split.start,) - ) as result: - async for row in result: - stats.rows_processed += 1 - yield row - - # Second part: from MIN_TOKEN to end - if consistency_level is not None: - async with await self.session.execute_stream( - prepared_stmts["select_wraparound_lte"], - (split.end,), - consistency_level=consistency_level - ) as result: - async for row in result: - stats.rows_processed += 1 - yield row - else: - async with await self.session.execute_stream( - prepared_stmts["select_wraparound_lte"], - (split.end,) - ) as result: - async for row in result: - stats.rows_processed += 1 - yield row - else: - # Normal range - use prepared statement - if consistency_level is not None: - async with await self.session.execute_stream( - prepared_stmts["select_range"], - (split.start, split.end), - consistency_level=consistency_level - ) as result: - async for row in result: - stats.rows_processed += 1 - yield row - else: - async with await self.session.execute_stream( - prepared_stmts["select_range"], - (split.start, split.end) - ) as result: - async for row in result: - stats.rows_processed += 1 - yield row - - stats.ranges_completed += 1 - - if progress_callback: - progress_callback(stats) - - stats.end_time = time.time() -""" - -print(corrected_code) diff --git 
a/libs/async-cassandra-bulk/examples/pyproject.toml b/libs/async-cassandra-bulk/examples/pyproject.toml deleted file mode 100644 index 39dc0a8..0000000 --- a/libs/async-cassandra-bulk/examples/pyproject.toml +++ /dev/null @@ -1,102 +0,0 @@ -[build-system] -requires = ["setuptools>=61.0", "wheel"] -build-backend = "setuptools.build_meta" - -[project] -name = "async-cassandra-bulk-operations" -version = "0.1.0" -description = "Token-aware bulk operations example for async-cassandra" -readme = "README.md" -requires-python = ">=3.12" -license = {text = "Apache-2.0"} -authors = [ - {name = "AxonOps", email = "info@axonops.com"}, -] -dependencies = [ - # For development, install async-cassandra from parent directory: - # pip install -e ../.. - # For production, use: "async-cassandra>=0.2.0", - "pyiceberg[pyarrow]>=0.8.0", - "pyarrow>=18.0.0", - "pandas>=2.0.0", - "rich>=13.0.0", # For nice progress bars - "click>=8.0.0", # For CLI -] - -[project.optional-dependencies] -dev = [ - "pytest>=8.0.0", - "pytest-asyncio>=0.24.0", - "pytest-cov>=5.0.0", - "black>=24.0.0", - "ruff>=0.8.0", - "mypy>=1.13.0", -] - -[project.scripts] -bulk-ops = "bulk_operations.cli:main" - -[tool.pytest.ini_options] -minversion = "8.0" -addopts = [ - "-ra", - "--strict-markers", - "--asyncio-mode=auto", - "--cov=bulk_operations", - "--cov-report=html", - "--cov-report=term-missing", -] -testpaths = ["tests"] -python_files = ["test_*.py"] -python_classes = ["Test*"] -python_functions = ["test_*"] -markers = [ - "unit: Unit tests that don't require Cassandra", - "integration: Integration tests that require a running Cassandra cluster", - "slow: Tests that take a long time to run", -] - -[tool.black] -line-length = 100 -target-version = ["py312"] -include = '\.pyi?$' - -[tool.isort] -profile = "black" -line_length = 100 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -use_parentheses = true -ensure_newline_before_comments = true -known_first_party = ["async_cassandra"] - -[tool.ruff] -line-length = 100 -target-version = "py312" - -[tool.ruff.lint] -select = [ - "E", # pycodestyle errors - "W", # pycodestyle warnings - "F", # pyflakes - # "I", # isort - disabled since we use isort separately - "B", # flake8-bugbear - "C90", # mccabe complexity - "UP", # pyupgrade - "SIM", # flake8-simplify -] -ignore = ["E501"] # Line too long - handled by black - -[tool.mypy] -python_version = "3.12" -warn_return_any = true -warn_unused_configs = true -disallow_untyped_defs = true -disallow_incomplete_defs = true -check_untyped_defs = true -no_implicit_optional = true -warn_redundant_casts = true -warn_unused_ignores = true -warn_no_return = true -strict_equality = true diff --git a/libs/async-cassandra-bulk/examples/run_integration_tests.sh b/libs/async-cassandra-bulk/examples/run_integration_tests.sh deleted file mode 100755 index a25133f..0000000 --- a/libs/async-cassandra-bulk/examples/run_integration_tests.sh +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/bash -# Integration test runner for bulk operations - -echo "๐Ÿš€ Bulk Operations Integration Test Runner" -echo "=========================================" - -# Check if docker or podman is available -if command -v podman &> /dev/null; then - CONTAINER_TOOL="podman" -elif command -v docker &> /dev/null; then - CONTAINER_TOOL="docker" -else - echo "โŒ Error: Neither docker nor podman found. Please install one." 
- exit 1 -fi - -echo "Using container tool: $CONTAINER_TOOL" - -# Function to wait for cluster to be ready -wait_for_cluster() { - echo "โณ Waiting for Cassandra cluster to be ready..." - local max_attempts=60 - local attempt=0 - - while [ $attempt -lt $max_attempts ]; do - if $CONTAINER_TOOL exec bulk-cassandra-1 nodetool status 2>/dev/null | grep -q "UN"; then - echo "โœ… Cassandra cluster is ready!" - return 0 - fi - attempt=$((attempt + 1)) - echo -n "." - sleep 5 - done - - echo "โŒ Timeout waiting for cluster to be ready" - return 1 -} - -# Function to show cluster status -show_cluster_status() { - echo "" - echo "๐Ÿ“Š Cluster Status:" - echo "==================" - $CONTAINER_TOOL exec bulk-cassandra-1 nodetool status || true - echo "" -} - -# Main execution -echo "" -echo "1๏ธโƒฃ Starting Cassandra cluster..." -$CONTAINER_TOOL-compose up -d - -if wait_for_cluster; then - show_cluster_status - - echo "2๏ธโƒฃ Running integration tests..." - echo "" - - # Run pytest with integration markers - pytest tests/test_integration.py -v -s -m integration - TEST_RESULT=$? - - echo "" - echo "3๏ธโƒฃ Cluster token information:" - echo "==============================" - echo "Sample output from nodetool describering:" - $CONTAINER_TOOL exec bulk-cassandra-1 nodetool describering bulk_test 2>/dev/null | head -20 || true - - echo "" - echo "4๏ธโƒฃ Test Summary:" - echo "================" - if [ $TEST_RESULT -eq 0 ]; then - echo "โœ… All integration tests passed!" - else - echo "โŒ Some tests failed. Please check the output above." - fi - - echo "" - read -p "Press Enter to stop the cluster, or Ctrl+C to keep it running..." - - echo "Stopping cluster..." - $CONTAINER_TOOL-compose down -else - echo "โŒ Failed to start cluster. Check container logs:" - $CONTAINER_TOOL-compose logs - $CONTAINER_TOOL-compose down - exit 1 -fi - -echo "" -echo "โœจ Done!" 
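The `wait_for_cluster` helper above polls `nodetool status` inside the container. A rough Python equivalent, useful where shelling into the container is not an option, is sketched below. It is a minimal sketch that reuses the `async_cassandra` API already used by these examples; the function name, attempt count, delay, and the `localhost:9042` contact point are assumptions, not part of the original scripts.

```python
import asyncio

from async_cassandra import AsyncCluster


async def wait_for_cluster(attempts: int = 60, delay: float = 5.0) -> bool:
    """Return True once Cassandra answers a trivial query, False on timeout."""
    for _ in range(attempts):
        try:
            # Contact point is assumed; adjust to your cluster.
            async with AsyncCluster(contact_points=["localhost"], port=9042) as cluster:
                session = await cluster.connect()
                # A trivial query proves the node is accepting CQL connections.
                await session.execute("SELECT now() FROM system.local")
                return True
        except Exception:
            # Not ready yet (still bootstrapping or the port is not open); retry.
            await asyncio.sleep(delay)
    return False


if __name__ == "__main__":
    raise SystemExit(0 if asyncio.run(wait_for_cluster()) else 1)
```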
diff --git a/libs/async-cassandra-bulk/examples/scripts/init.cql b/libs/async-cassandra-bulk/examples/scripts/init.cql deleted file mode 100644 index 70902c6..0000000 --- a/libs/async-cassandra-bulk/examples/scripts/init.cql +++ /dev/null @@ -1,72 +0,0 @@ --- Initialize keyspace and tables for bulk operations example --- This script creates test data for demonstrating token-aware bulk operations - --- Create keyspace with NetworkTopologyStrategy for production-like setup -CREATE KEYSPACE IF NOT EXISTS bulk_ops -WITH replication = { - 'class': 'NetworkTopologyStrategy', - 'datacenter1': 3 -} -AND durable_writes = true; - --- Use the keyspace -USE bulk_ops; - --- Create a large table for bulk operations testing -CREATE TABLE IF NOT EXISTS large_dataset ( - id UUID, - partition_key INT, - clustering_key INT, - data TEXT, - value DOUBLE, - created_at TIMESTAMP, - metadata MAP, - PRIMARY KEY (partition_key, clustering_key, id) -) WITH CLUSTERING ORDER BY (clustering_key ASC, id ASC) - AND compression = {'class': 'LZ4Compressor'} - AND compaction = {'class': 'SizeTieredCompactionStrategy'}; - --- Create an index for testing -CREATE INDEX IF NOT EXISTS idx_created_at ON large_dataset (created_at); - --- Create a table for export/import testing -CREATE TABLE IF NOT EXISTS orders ( - order_id UUID, - customer_id UUID, - order_date DATE, - order_time TIMESTAMP, - total_amount DECIMAL, - status TEXT, - items LIST>>, - shipping_address MAP, - PRIMARY KEY ((customer_id), order_date, order_id) -) WITH CLUSTERING ORDER BY (order_date DESC, order_id ASC) - AND compression = {'class': 'LZ4Compressor'}; - --- Create a simple counter table -CREATE TABLE IF NOT EXISTS page_views ( - page_id UUID, - date DATE, - views COUNTER, - PRIMARY KEY ((page_id), date) -) WITH CLUSTERING ORDER BY (date DESC); - --- Create a time series table -CREATE TABLE IF NOT EXISTS sensor_data ( - sensor_id UUID, - bucket TIMESTAMP, - reading_time TIMESTAMP, - temperature DOUBLE, - humidity DOUBLE, - pressure DOUBLE, - location FROZEN>, - PRIMARY KEY ((sensor_id, bucket), reading_time) -) WITH CLUSTERING ORDER BY (reading_time DESC) - AND compression = {'class': 'LZ4Compressor'} - AND default_time_to_live = 2592000; -- 30 days TTL - --- Grant permissions (if authentication is enabled) --- GRANT ALL ON KEYSPACE bulk_ops TO cassandra; - --- Display confirmation -SELECT keyspace_name, table_name FROM system_schema.tables WHERE keyspace_name = 'bulk_ops'; diff --git a/libs/async-cassandra-bulk/examples/test_simple_count.py b/libs/async-cassandra-bulk/examples/test_simple_count.py deleted file mode 100644 index 549f1ea..0000000 --- a/libs/async-cassandra-bulk/examples/test_simple_count.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 -"""Simple test to debug count issue.""" - -import asyncio - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - - -async def test_count(): - """Test count with error details.""" - async with AsyncCluster(contact_points=["localhost"]) as cluster: - session = await cluster.connect() - - operator = TokenAwareBulkOperator(session) - - try: - count = await operator.count_by_token_ranges( - keyspace="bulk_test", table="test_data", split_count=4, parallelism=2 - ) - print(f"Count successful: {count}") - except Exception as e: - print(f"Error: {e}") - if hasattr(e, "errors"): - print(f"Detailed errors: {e.errors}") - for err in e.errors: - print(f" - {err}") - - -if __name__ == "__main__": - asyncio.run(test_count()) diff --git 
a/libs/async-cassandra-bulk/examples/test_single_node.py b/libs/async-cassandra-bulk/examples/test_single_node.py deleted file mode 100644 index aa762de..0000000 --- a/libs/async-cassandra-bulk/examples/test_single_node.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 -"""Quick test to verify token range discovery with single node.""" - -import asyncio - -from async_cassandra import AsyncCluster -from bulk_operations.token_utils import ( - MAX_TOKEN, - MIN_TOKEN, - TOTAL_TOKEN_RANGE, - discover_token_ranges, -) - - -async def test_single_node(): - """Test token range discovery with single node.""" - print("Connecting to single-node cluster...") - - async with AsyncCluster(contact_points=["localhost"]) as cluster: - session = await cluster.connect() - - # Create test keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS test_single - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - print("Discovering token ranges...") - ranges = await discover_token_ranges(session, "test_single") - - print(f"\nToken ranges discovered: {len(ranges)}") - print("Expected with 1 node ร— 256 vnodes: 256 ranges") - - # Verify we have the expected number of ranges - assert len(ranges) == 256, f"Expected 256 ranges, got {len(ranges)}" - - # Verify ranges cover the entire ring - sorted_ranges = sorted(ranges, key=lambda r: r.start) - - # Debug first and last ranges - print(f"First range: {sorted_ranges[0].start} to {sorted_ranges[0].end}") - print(f"Last range: {sorted_ranges[-1].start} to {sorted_ranges[-1].end}") - print(f"MIN_TOKEN: {MIN_TOKEN}, MAX_TOKEN: {MAX_TOKEN}") - - # The token ring is circular, so we need to handle wraparound - # The smallest token in the sorted list might not be MIN_TOKEN - # because of how Cassandra distributes vnodes - - # Check for gaps or overlaps - gaps = [] - overlaps = [] - for i in range(len(sorted_ranges) - 1): - current = sorted_ranges[i] - next_range = sorted_ranges[i + 1] - if current.end < next_range.start: - gaps.append((current.end, next_range.start)) - elif current.end > next_range.start: - overlaps.append((current.end, next_range.start)) - - print(f"\nGaps found: {len(gaps)}") - if gaps: - for gap in gaps[:3]: - print(f" Gap: {gap[0]} to {gap[1]}") - - print(f"Overlaps found: {len(overlaps)}") - - # Check if ranges form a complete ring - # In a proper token ring, each range's end should equal the next range's start - # The last range should wrap around to the first - total_size = sum(r.size for r in ranges) - print(f"\nTotal token space covered: {total_size:,}") - print(f"Expected total space: {TOTAL_TOKEN_RANGE:,}") - - # Show sample ranges - print("\nSample token ranges (first 5):") - for i, r in enumerate(sorted_ranges[:5]): - print(f" Range {i+1}: {r.start} to {r.end} (size: {r.size:,})") - - print("\nโœ… All tests passed!") - - # Session is closed automatically by the context manager - return True - - -if __name__ == "__main__": - try: - asyncio.run(test_single_node()) - except Exception as e: - print(f"โŒ Error: {e}") - import traceback - - traceback.print_exc() - exit(1) diff --git a/libs/async-cassandra-bulk/examples/tests/__init__.py b/libs/async-cassandra-bulk/examples/tests/__init__.py deleted file mode 100644 index ce61b96..0000000 --- a/libs/async-cassandra-bulk/examples/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Test package for bulk operations.""" diff --git a/libs/async-cassandra-bulk/examples/tests/conftest.py b/libs/async-cassandra-bulk/examples/tests/conftest.py deleted 
file mode 100644 index 4445379..0000000 --- a/libs/async-cassandra-bulk/examples/tests/conftest.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -Pytest configuration for bulk operations tests. - -Handles test markers and Docker/Podman support. -""" - -import os -import subprocess -from pathlib import Path - -import pytest - - -def get_container_runtime(): - """Detect whether to use docker or podman.""" - # Check environment variable first - runtime = os.environ.get("CONTAINER_RUNTIME", "").lower() - if runtime in ["docker", "podman"]: - return runtime - - # Auto-detect - for cmd in ["docker", "podman"]: - try: - subprocess.run([cmd, "--version"], capture_output=True, check=True) - return cmd - except (subprocess.CalledProcessError, FileNotFoundError): - continue - - raise RuntimeError("Neither docker nor podman found. Please install one.") - - -# Set container runtime globally -CONTAINER_RUNTIME = get_container_runtime() -os.environ["CONTAINER_RUNTIME"] = CONTAINER_RUNTIME - - -def pytest_configure(config): - """Configure pytest with custom markers.""" - config.addinivalue_line("markers", "unit: Unit tests that don't require external services") - config.addinivalue_line("markers", "integration: Integration tests requiring Cassandra cluster") - config.addinivalue_line("markers", "slow: Tests that take a long time to run") - - -def pytest_collection_modifyitems(config, items): - """Automatically skip integration tests if not explicitly requested.""" - if config.getoption("markexpr"): - # User specified markers, respect their choice - return - - # Check if Cassandra is available - cassandra_available = check_cassandra_available() - - skip_integration = pytest.mark.skip( - reason="Integration tests require running Cassandra cluster. Use -m integration to run." - ) - - for item in items: - if "integration" in item.keywords and not cassandra_available: - item.add_marker(skip_integration) - - -def check_cassandra_available(): - """Check if Cassandra cluster is available.""" - try: - # Try to connect to the first node - import socket - - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(1) - result = sock.connect_ex(("127.0.0.1", 9042)) - sock.close() - return result == 0 - except Exception: - return False - - -@pytest.fixture(scope="session") -def container_runtime(): - """Get the container runtime being used.""" - return CONTAINER_RUNTIME - - -@pytest.fixture(scope="session") -def docker_compose_file(): - """Path to docker-compose file.""" - return Path(__file__).parent.parent / "docker-compose.yml" - - -@pytest.fixture(scope="session") -def docker_compose_command(container_runtime): - """Get the appropriate docker-compose command.""" - if container_runtime == "podman": - return ["podman-compose"] - else: - return ["docker-compose"] diff --git a/libs/async-cassandra-bulk/examples/tests/integration/README.md b/libs/async-cassandra-bulk/examples/tests/integration/README.md deleted file mode 100644 index 25138a4..0000000 --- a/libs/async-cassandra-bulk/examples/tests/integration/README.md +++ /dev/null @@ -1,100 +0,0 @@ -# Integration Tests for Bulk Operations - -This directory contains integration tests that validate bulk operations against a real Cassandra cluster. 
- -## Test Organization - -The integration tests are organized into logical modules: - -- **test_token_discovery.py** - Tests for token range discovery with vnodes - - Validates token range discovery matches cluster configuration - - Compares with nodetool describering output - - Ensures complete ring coverage without gaps - -- **test_bulk_count.py** - Tests for bulk count operations - - Validates full data coverage (no missing/duplicate rows) - - Tests wraparound range handling - - Performance testing with different parallelism levels - -- **test_bulk_export.py** - Tests for bulk export operations - - Validates streaming export completeness - - Tests memory efficiency for large exports - - Handles different CQL data types - -- **test_token_splitting.py** - Tests for token range splitting strategies - - Tests proportional splitting based on range sizes - - Handles small vnode ranges appropriately - - Validates replica-aware clustering - -## Running Integration Tests - -Integration tests require a running Cassandra cluster. They are skipped by default. - -### Run all integration tests: -```bash -pytest tests/integration --integration -``` - -### Run specific test module: -```bash -pytest tests/integration/test_bulk_count.py --integration -v -``` - -### Run specific test: -```bash -pytest tests/integration/test_bulk_count.py::TestBulkCount::test_full_table_coverage_with_token_ranges --integration -v -``` - -## Test Infrastructure - -### Automatic Cassandra Startup - -The tests will automatically start a single-node Cassandra container if one is not already running, using either: -- `docker-compose-single.yml` (via docker-compose or podman-compose) - -### Manual Cassandra Setup - -You can also manually start Cassandra: - -```bash -# Single node (recommended for basic tests) -podman-compose -f docker-compose-single.yml up -d - -# Multi-node cluster (for advanced tests) -podman-compose -f docker-compose.yml up -d -``` - -### Test Fixtures - -Common fixtures are defined in `conftest.py`: -- `ensure_cassandra` - Session-scoped fixture that ensures Cassandra is running -- `cluster` - Creates AsyncCluster connection -- `session` - Creates test session with keyspace - -## Test Requirements - -- Cassandra 4.0+ (or ScyllaDB) -- Docker or Podman with compose -- Python packages: pytest, pytest-asyncio, async-cassandra - -## Debugging Tips - -1. **View Cassandra logs:** - ```bash - podman logs bulk-cassandra-1 - ``` - -2. **Check token ranges manually:** - ```bash - podman exec bulk-cassandra-1 nodetool describering bulk_test - ``` - -3. **Run with verbose output:** - ```bash - pytest tests/integration --integration -v -s - ``` - -4. **Run with coverage:** - ```bash - pytest tests/integration --integration --cov=bulk_operations - ``` diff --git a/libs/async-cassandra-bulk/examples/tests/integration/__init__.py b/libs/async-cassandra-bulk/examples/tests/integration/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/libs/async-cassandra-bulk/examples/tests/integration/conftest.py b/libs/async-cassandra-bulk/examples/tests/integration/conftest.py deleted file mode 100644 index c4f43aa..0000000 --- a/libs/async-cassandra-bulk/examples/tests/integration/conftest.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Shared configuration and fixtures for integration tests. 
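-Starts a single-node Cassandra via docker-compose/podman-compose when one is not -already running, and registers the --integration pytest flag used to opt in to -these tests.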
-""" - -import os -import subprocess -import time - -import pytest - - -def is_cassandra_running(): - """Check if Cassandra is accessible on localhost.""" - try: - from cassandra.cluster import Cluster - - cluster = Cluster(["localhost"]) - session = cluster.connect() - session.shutdown() - cluster.shutdown() - return True - except Exception: - return False - - -def start_cassandra_if_needed(): - """Start Cassandra using docker-compose if not already running.""" - if is_cassandra_running(): - return True - - # Try to start single-node Cassandra - compose_file = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "docker-compose-single.yml" - ) - - if not os.path.exists(compose_file): - return False - - print("\nStarting Cassandra container for integration tests...") - - # Try podman first, then docker - for cmd in ["podman-compose", "docker-compose"]: - try: - subprocess.run([cmd, "-f", compose_file, "up", "-d"], check=True, capture_output=True) - break - except (subprocess.CalledProcessError, FileNotFoundError): - continue - else: - print("Could not start Cassandra - neither podman-compose nor docker-compose found") - return False - - # Wait for Cassandra to be ready - print("Waiting for Cassandra to be ready...") - for _i in range(60): # Wait up to 60 seconds - if is_cassandra_running(): - print("Cassandra is ready!") - return True - time.sleep(1) - - print("Cassandra failed to start in time") - return False - - -@pytest.fixture(scope="session", autouse=True) -def ensure_cassandra(): - """Ensure Cassandra is running for integration tests.""" - if not start_cassandra_if_needed(): - pytest.skip("Cassandra is not available for integration tests") - - -# Skip integration tests if not explicitly requested -def pytest_collection_modifyitems(config, items): - """Skip integration tests unless --integration flag is passed.""" - if not config.getoption("--integration", default=False): - skip_integration = pytest.mark.skip( - reason="Integration tests not requested (use --integration flag)" - ) - for item in items: - if "integration" in item.keywords: - item.add_marker(skip_integration) - - -def pytest_addoption(parser): - """Add custom command line options.""" - parser.addoption( - "--integration", action="store_true", default=False, help="Run integration tests" - ) diff --git a/libs/async-cassandra-bulk/examples/tests/integration/test_bulk_count.py b/libs/async-cassandra-bulk/examples/tests/integration/test_bulk_count.py deleted file mode 100644 index 8c94b5d..0000000 --- a/libs/async-cassandra-bulk/examples/tests/integration/test_bulk_count.py +++ /dev/null @@ -1,354 +0,0 @@ -""" -Integration tests for bulk count operations. - -What this tests: ---------------- -1. Full data coverage with token ranges (no missing/duplicate rows) -2. Wraparound range handling -3. Count accuracy across different data distributions -4. 
Performance with parallelism - -Why this matters: ----------------- -- Count is the simplest bulk operation - if it fails, everything fails -- Proves our token range queries are correct -- Gaps mean data loss in production -- Duplicates mean incorrect counting -- Critical for data integrity -""" - -import asyncio - -import pytest - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - - -@pytest.mark.integration -class TestBulkCount: - """Test bulk count operations against real Cassandra cluster.""" - - @pytest.fixture - async def cluster(self): - """Create connection to test cluster.""" - cluster = AsyncCluster( - contact_points=["localhost"], - port=9042, - ) - yield cluster - await cluster.shutdown() - - @pytest.fixture - async def session(self, cluster): - """Create test session with keyspace and table.""" - session = await cluster.connect() - - # Create test keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS bulk_test - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - # Create test table - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_test.test_data ( - id INT PRIMARY KEY, - data TEXT, - value DOUBLE - ) - """ - ) - - # Clear any existing data - await session.execute("TRUNCATE bulk_test.test_data") - - yield session - - @pytest.mark.asyncio - async def test_full_table_coverage_with_token_ranges(self, session): - """ - Test that token ranges cover all data without gaps or duplicates. - - What this tests: - --------------- - 1. Insert known dataset across token range - 2. Count using token ranges - 3. Verify exact match with direct count - 4. No missing or duplicate rows - - Why this matters: - ---------------- - - Proves our token range queries are correct - - Gaps mean data loss in production - - Duplicates mean incorrect counting - - Critical for data integrity - """ - # Insert test data with known count - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.test_data (id, data, value) - VALUES (?, ?, ?) - """ - ) - - expected_count = 10000 - print(f"\nInserting {expected_count} test rows...") - - # Insert in batches for efficiency - batch_size = 100 - for i in range(0, expected_count, batch_size): - tasks = [] - for j in range(batch_size): - if i + j < expected_count: - tasks.append(session.execute(insert_stmt, (i + j, f"data-{i+j}", float(i + j)))) - await asyncio.gather(*tasks) - - # Count using direct query - result = await session.execute("SELECT COUNT(*) FROM bulk_test.test_data") - direct_count = result.one().count - assert ( - direct_count == expected_count - ), f"Direct count mismatch: {direct_count} vs {expected_count}" - - # Count using token ranges - operator = TokenAwareBulkOperator(session) - token_count = await operator.count_by_token_ranges( - keyspace="bulk_test", - table="test_data", - split_count=16, # Moderate splitting - parallelism=8, - ) - - print("\nCount comparison:") - print(f" Direct count: {direct_count}") - print(f" Token range count: {token_count}") - - assert ( - token_count == direct_count - ), f"Token range count mismatch: {token_count} vs {direct_count}" - - @pytest.mark.asyncio - async def test_count_with_wraparound_ranges(self, session): - """ - Test counting specifically with wraparound ranges. - - What this tests: - --------------- - 1. Insert data that falls in wraparound range - 2. Verify wraparound range is properly split - 3. Count includes all data - 4. 
No double counting - - Why this matters: - ---------------- - - Wraparound ranges are tricky edge cases - - CQL doesn't support OR in token queries - - Must split into two queries properly - - Common source of bugs - """ - # Insert test data - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.test_data (id, data, value) - VALUES (?, ?, ?) - """ - ) - - # Insert data with IDs that we know will hash to extreme token values - test_ids = [] - for i in range(50000, 60000): # Test range that includes wraparound tokens - test_ids.append(i) - - print(f"\nInserting {len(test_ids)} test rows...") - batch_size = 100 - for i in range(0, len(test_ids), batch_size): - tasks = [] - for j in range(batch_size): - if i + j < len(test_ids): - id_val = test_ids[i + j] - tasks.append( - session.execute(insert_stmt, (id_val, f"data-{id_val}", float(id_val))) - ) - await asyncio.gather(*tasks) - - # Get direct count - result = await session.execute("SELECT COUNT(*) FROM bulk_test.test_data") - direct_count = result.one().count - - # Count using token ranges with different split counts - operator = TokenAwareBulkOperator(session) - - for split_count in [4, 8, 16, 32]: - token_count = await operator.count_by_token_ranges( - keyspace="bulk_test", - table="test_data", - split_count=split_count, - parallelism=4, - ) - - print(f"\nSplit count {split_count}: {token_count} rows") - assert ( - token_count == direct_count - ), f"Count mismatch with {split_count} splits: {token_count} vs {direct_count}" - - @pytest.mark.asyncio - async def test_parallel_count_performance(self, session): - """ - Test parallel execution improves count performance. - - What this tests: - --------------- - 1. Count performance with different parallelism levels - 2. Results are consistent across parallelism levels - 3. No deadlocks or timeouts - 4. Higher parallelism provides benefit - - Why this matters: - ---------------- - - Parallel execution is the main benefit - - Must handle concurrent queries properly - - Performance validation - - Resource efficiency - """ - # Insert more data for meaningful parallelism test - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.test_data (id, data, value) - VALUES (?, ?, ?) 
- """ - ) - - # Clear and insert fresh data - await session.execute("TRUNCATE bulk_test.test_data") - - row_count = 50000 - print(f"\nInserting {row_count} rows for parallel test...") - - batch_size = 500 - for i in range(0, row_count, batch_size): - tasks = [] - for j in range(batch_size): - if i + j < row_count: - tasks.append(session.execute(insert_stmt, (i + j, f"data-{i+j}", float(i + j)))) - await asyncio.gather(*tasks) - - operator = TokenAwareBulkOperator(session) - - # Test with different parallelism levels - import time - - results = [] - for parallelism in [1, 2, 4, 8]: - start_time = time.time() - - count = await operator.count_by_token_ranges( - keyspace="bulk_test", table="test_data", split_count=32, parallelism=parallelism - ) - - duration = time.time() - start_time - results.append( - { - "parallelism": parallelism, - "count": count, - "duration": duration, - "rows_per_sec": count / duration, - } - ) - - print(f"\nParallelism {parallelism}:") - print(f" Count: {count}") - print(f" Duration: {duration:.2f}s") - print(f" Rows/sec: {count/duration:,.0f}") - - # All counts should be identical - counts = [r["count"] for r in results] - assert len(set(counts)) == 1, f"Inconsistent counts: {counts}" - - # Higher parallelism should generally be faster - # (though not always due to overhead) - assert ( - results[-1]["duration"] < results[0]["duration"] * 1.5 - ), "Parallel execution not providing benefit" - - @pytest.mark.asyncio - async def test_count_with_progress_callback(self, session): - """ - Test progress callback during count operations. - - What this tests: - --------------- - 1. Progress callbacks are invoked correctly - 2. Stats are accurate and updated - 3. Progress percentage is calculated correctly - 4. Final stats match actual results - - Why this matters: - ---------------- - - Users need progress feedback for long operations - - Stats help with monitoring and debugging - - Progress tracking enables better UX - - Critical for production observability - """ - # Insert test data - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.test_data (id, data, value) - VALUES (?, ?, ?) 
- """ - ) - - expected_count = 5000 - for i in range(expected_count): - await session.execute(insert_stmt, (i, f"data-{i}", float(i))) - - operator = TokenAwareBulkOperator(session) - - # Track progress callbacks - progress_updates = [] - - def progress_callback(stats): - progress_updates.append( - { - "rows": stats.rows_processed, - "ranges_completed": stats.ranges_completed, - "total_ranges": stats.total_ranges, - "percentage": stats.progress_percentage, - } - ) - - # Count with progress tracking - count, stats = await operator.count_by_token_ranges_with_stats( - keyspace="bulk_test", - table="test_data", - split_count=8, - parallelism=4, - progress_callback=progress_callback, - ) - - print(f"\nProgress updates received: {len(progress_updates)}") - print(f"Final count: {count}") - print( - f"Final stats: rows={stats.rows_processed}, ranges={stats.ranges_completed}/{stats.total_ranges}" - ) - - # Verify results - assert count == expected_count, f"Count mismatch: {count} vs {expected_count}" - assert stats.rows_processed == expected_count - assert stats.ranges_completed == stats.total_ranges - assert stats.success is True - assert len(stats.errors) == 0 - assert len(progress_updates) > 0, "No progress callbacks received" - - # Verify progress increased monotonically - for i in range(1, len(progress_updates)): - assert ( - progress_updates[i]["ranges_completed"] - >= progress_updates[i - 1]["ranges_completed"] - ) diff --git a/libs/async-cassandra-bulk/examples/tests/integration/test_bulk_export.py b/libs/async-cassandra-bulk/examples/tests/integration/test_bulk_export.py deleted file mode 100644 index 35e5eef..0000000 --- a/libs/async-cassandra-bulk/examples/tests/integration/test_bulk_export.py +++ /dev/null @@ -1,382 +0,0 @@ -""" -Integration tests for bulk export operations. - -What this tests: ---------------- -1. Export captures all rows exactly once -2. Streaming doesn't exhaust memory -3. Order within ranges is preserved -4. Async iteration works correctly -5. Export handles different data types - -Why this matters: ----------------- -- Export must be complete and accurate -- Memory efficiency critical for large tables -- Streaming enables TB-scale exports -- Foundation for Iceberg integration -""" - -import asyncio - -import pytest - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - - -@pytest.mark.integration -class TestBulkExport: - """Test bulk export operations against real Cassandra cluster.""" - - @pytest.fixture - async def cluster(self): - """Create connection to test cluster.""" - cluster = AsyncCluster( - contact_points=["localhost"], - port=9042, - ) - yield cluster - await cluster.shutdown() - - @pytest.fixture - async def session(self, cluster): - """Create test session with keyspace and table.""" - session = await cluster.connect() - - # Create test keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS bulk_test - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - # Create test table - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_test.test_data ( - id INT PRIMARY KEY, - data TEXT, - value DOUBLE - ) - """ - ) - - # Clear any existing data - await session.execute("TRUNCATE bulk_test.test_data") - - yield session - - @pytest.mark.asyncio - async def test_export_streaming_completeness(self, session): - """ - Test streaming export doesn't miss or duplicate data. - - What this tests: - --------------- - 1. 
Export captures all rows exactly once - 2. Streaming doesn't exhaust memory - 3. Order within ranges is preserved - 4. Async iteration works correctly - - Why this matters: - ---------------- - - Export must be complete and accurate - - Memory efficiency critical for large tables - - Streaming enables TB-scale exports - - Foundation for Iceberg integration - """ - # Use smaller dataset for export test - await session.execute("TRUNCATE bulk_test.test_data") - - # Insert test data - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.test_data (id, data, value) - VALUES (?, ?, ?) - """ - ) - - expected_ids = set(range(1000)) - for i in expected_ids: - await session.execute(insert_stmt, (i, f"data-{i}", float(i))) - - # Export using token ranges - operator = TokenAwareBulkOperator(session) - - exported_ids = set() - row_count = 0 - - async for row in operator.export_by_token_ranges( - keyspace="bulk_test", table="test_data", split_count=16 - ): - exported_ids.add(row.id) - row_count += 1 - - # Verify row data integrity - assert row.data == f"data-{row.id}" - assert row.value == float(row.id) - - print("\nExport results:") - print(f" Expected rows: {len(expected_ids)}") - print(f" Exported rows: {row_count}") - print(f" Unique IDs: {len(exported_ids)}") - - # Verify completeness - assert row_count == len( - expected_ids - ), f"Row count mismatch: {row_count} vs {len(expected_ids)}" - - assert exported_ids == expected_ids, ( - f"Missing IDs: {expected_ids - exported_ids}, " - f"Duplicate IDs: {exported_ids - expected_ids}" - ) - - @pytest.mark.asyncio - async def test_export_with_wraparound_ranges(self, session): - """ - Test export handles wraparound ranges correctly. - - What this tests: - --------------- - 1. Data in wraparound ranges is exported - 2. No duplicates from split queries - 3. All edge cases handled - 4. Consistent with count operation - - Why this matters: - ---------------- - - Wraparound ranges are common with vnodes - - Export must handle same edge cases as count - - Data integrity is critical - - Foundation for all bulk operations - """ - # Insert data that will span wraparound ranges - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.test_data (id, data, value) - VALUES (?, ?, ?) - """ - ) - - # Insert data with various IDs to ensure coverage - test_data = {} - for i in range(0, 10000, 100): # Sparse data to hit various ranges - test_data[i] = f"data-{i}" - await session.execute(insert_stmt, (i, test_data[i], float(i))) - - # Export and verify - operator = TokenAwareBulkOperator(session) - - exported_data = {} - async for row in operator.export_by_token_ranges( - keyspace="bulk_test", - table="test_data", - split_count=32, # More splits to ensure wraparound handling - ): - exported_data[row.id] = row.data - - print(f"\nExported {len(exported_data)} rows") - assert len(exported_data) == len( - test_data - ), f"Export count mismatch: {len(exported_data)} vs {len(test_data)}" - - # Verify all data was exported correctly - for id_val, expected_data in test_data.items(): - assert id_val in exported_data, f"Missing ID {id_val}" - assert ( - exported_data[id_val] == expected_data - ), f"Data mismatch for ID {id_val}: {exported_data[id_val]} vs {expected_data}" - - @pytest.mark.asyncio - async def test_export_memory_efficiency(self, session): - """ - Test export streaming is memory efficient. - - What this tests: - --------------- - 1. Large exports don't consume excessive memory - 2. Streaming works as expected - 3. 
Can handle tables larger than memory - 4. Progress tracking during export - - Why this matters: - ---------------- - - Production tables can be TB in size - - Must stream, not buffer all data - - Memory efficiency enables large exports - - Critical for operational feasibility - """ - # Insert larger dataset - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.test_data (id, data, value) - VALUES (?, ?, ?) - """ - ) - - row_count = 10000 - print(f"\nInserting {row_count} rows for memory test...") - - # Insert in batches - batch_size = 100 - for i in range(0, row_count, batch_size): - tasks = [] - for j in range(batch_size): - if i + j < row_count: - # Create larger data values to test memory - data = f"data-{i+j}" * 10 # Make data larger - tasks.append(session.execute(insert_stmt, (i + j, data, float(i + j)))) - await asyncio.gather(*tasks) - - operator = TokenAwareBulkOperator(session) - - # Track memory usage indirectly via row processing rate - rows_exported = 0 - batch_timings = [] - - import time - - start_time = time.time() - last_batch_time = start_time - - async for _row in operator.export_by_token_ranges( - keyspace="bulk_test", table="test_data", split_count=16 - ): - rows_exported += 1 - - # Track timing every 1000 rows - if rows_exported % 1000 == 0: - current_time = time.time() - batch_duration = current_time - last_batch_time - batch_timings.append(batch_duration) - last_batch_time = current_time - print(f" Exported {rows_exported} rows...") - - total_duration = time.time() - start_time - - print("\nExport completed:") - print(f" Total rows: {rows_exported}") - print(f" Total time: {total_duration:.2f}s") - print(f" Rows/sec: {rows_exported/total_duration:.0f}") - - # Verify all rows exported - assert rows_exported == row_count, f"Export count mismatch: {rows_exported} vs {row_count}" - - # Verify consistent performance (no major slowdowns from memory pressure) - if len(batch_timings) > 2: - avg_batch_time = sum(batch_timings) / len(batch_timings) - max_batch_time = max(batch_timings) - assert ( - max_batch_time < avg_batch_time * 3 - ), "Export performance degraded, possible memory issue" - - @pytest.mark.asyncio - async def test_export_with_different_data_types(self, session): - """ - Test export handles various CQL data types correctly. - - What this tests: - --------------- - 1. Different data types are exported correctly - 2. NULL values handled properly - 3. Collections exported accurately - 4. Special characters preserved - - Why this matters: - ---------------- - - Real tables have diverse data types - - Export must preserve data fidelity - - Type handling affects Iceberg mapping - - Data integrity across formats - """ - # Create table with various data types - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_test.complex_data ( - id INT PRIMARY KEY, - text_col TEXT, - int_col INT, - double_col DOUBLE, - bool_col BOOLEAN, - list_col LIST, - set_col SET, - map_col MAP - ) - """ - ) - - await session.execute("TRUNCATE bulk_test.complex_data") - - # Insert test data with various types - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.complex_data - (id, text_col, int_col, double_col, bool_col, list_col, set_col, map_col) - VALUES (?, ?, ?, ?, ?, ?, ?, ?) 
- """ - ) - - test_data = [ - (1, "normal text", 100, 1.5, True, ["a", "b", "c"], {1, 2, 3}, {"x": 1, "y": 2}), - (2, "special chars: 'quotes' \"double\" \n newline", -50, -2.5, False, [], set(), {}), - (3, None, None, None, None, None, None, None), # NULL values - (4, "", 0, 0.0, True, [""], {0}, {"": 0}), # Empty/zero values - (5, "unicode: ไฝ ๅฅฝ ๐ŸŒŸ", 999999, 3.14159, False, ["ฮฑ", "ฮฒ", "ฮณ"], {-1, -2}, {"ฯ€": 314}), - ] - - for row in test_data: - await session.execute(insert_stmt, row) - - # Export and verify - operator = TokenAwareBulkOperator(session) - - exported_rows = [] - async for row in operator.export_by_token_ranges( - keyspace="bulk_test", table="complex_data", split_count=4 - ): - exported_rows.append(row) - - print(f"\nExported {len(exported_rows)} rows with complex data types") - assert len(exported_rows) == len( - test_data - ), f"Export count mismatch: {len(exported_rows)} vs {len(test_data)}" - - # Sort both by ID for comparison - exported_rows.sort(key=lambda r: r.id) - test_data.sort(key=lambda r: r[0]) - - # Verify each row's data - for exported, expected in zip(exported_rows, test_data, strict=False): - assert exported.id == expected[0] - assert exported.text_col == expected[1] - assert exported.int_col == expected[2] - assert exported.double_col == expected[3] - assert exported.bool_col == expected[4] - - # Collections need special handling - # Note: Cassandra treats empty collections as NULL - if expected[5] is not None and expected[5] != []: - assert exported.list_col is not None, f"list_col is None for row {exported.id}" - assert list(exported.list_col) == expected[5] - else: - # Empty list or None in Cassandra returns as None - assert exported.list_col is None - - if expected[6] is not None and expected[6] != set(): - assert exported.set_col is not None, f"set_col is None for row {exported.id}" - assert set(exported.set_col) == expected[6] - else: - # Empty set or None in Cassandra returns as None - assert exported.set_col is None - - if expected[7] is not None and expected[7] != {}: - assert exported.map_col is not None, f"map_col is None for row {exported.id}" - assert dict(exported.map_col) == expected[7] - else: - # Empty map or None in Cassandra returns as None - assert exported.map_col is None diff --git a/libs/async-cassandra-bulk/examples/tests/integration/test_data_integrity.py b/libs/async-cassandra-bulk/examples/tests/integration/test_data_integrity.py deleted file mode 100644 index 1e82a58..0000000 --- a/libs/async-cassandra-bulk/examples/tests/integration/test_data_integrity.py +++ /dev/null @@ -1,466 +0,0 @@ -""" -Integration tests for data integrity - verifying inserted data is correctly returned. - -What this tests: ---------------- -1. Data inserted is exactly what gets exported -2. All data types are preserved correctly -3. No data corruption during token range queries -4. 
Prepared statements maintain data integrity - -Why this matters: ----------------- -- Proves end-to-end data correctness -- Validates our token range implementation -- Ensures no data loss or corruption -- Critical for production confidence -""" - -import asyncio -import uuid -from datetime import datetime -from decimal import Decimal - -import pytest - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - - -@pytest.mark.integration -class TestDataIntegrity: - """Test that data inserted equals data exported.""" - - @pytest.fixture - async def cluster(self): - """Create connection to test cluster.""" - cluster = AsyncCluster( - contact_points=["localhost"], - port=9042, - ) - yield cluster - await cluster.shutdown() - - @pytest.fixture - async def session(self, cluster): - """Create test session with keyspace and tables.""" - session = await cluster.connect() - - # Create test keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS bulk_test - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - yield session - - @pytest.mark.asyncio - async def test_simple_data_round_trip(self, session): - """ - Test that simple data inserted is exactly what we get back. - - What this tests: - --------------- - 1. Insert known dataset with various values - 2. Export using token ranges - 3. Verify every field matches exactly - 4. No missing or corrupted data - - Why this matters: - ---------------- - - Basic data integrity validation - - Ensures token range queries don't corrupt data - - Validates prepared statement parameter handling - - Foundation for trusting bulk operations - """ - # Create a simple test table - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_test.integrity_test ( - id INT PRIMARY KEY, - name TEXT, - value DOUBLE, - active BOOLEAN - ) - """ - ) - - await session.execute("TRUNCATE bulk_test.integrity_test") - - # Insert test data with prepared statement - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.integrity_test (id, name, value, active) - VALUES (?, ?, ?, ?) 
- """ - ) - - # Create test dataset with various values - test_data = [ - (1, "Alice", 100.5, True), - (2, "Bob", -50.25, False), - (3, "Charlie", 0.0, True), - (4, None, 999.999, None), # Test NULLs - (5, "", -0.001, False), # Empty string - (6, "Special chars: 'quotes' \"double\"", 3.14159, True), - (7, "Unicode: ไฝ ๅฅฝ ๐ŸŒŸ", 2.71828, False), - (8, "Very long name " * 100, 1.23456, True), # Long string - ] - - # Insert all test data - for row in test_data: - await session.execute(insert_stmt, row) - - # Export using bulk operator - operator = TokenAwareBulkOperator(session) - exported_data = [] - - async for row in operator.export_by_token_ranges( - keyspace="bulk_test", - table="integrity_test", - split_count=4, # Use multiple ranges to test splitting - ): - exported_data.append((row.id, row.name, row.value, row.active)) - - # Sort both datasets by ID for comparison - test_data_sorted = sorted(test_data, key=lambda x: x[0]) - exported_data_sorted = sorted(exported_data, key=lambda x: x[0]) - - # Verify we got all rows - assert len(exported_data_sorted) == len( - test_data_sorted - ), f"Row count mismatch: exported {len(exported_data_sorted)} vs inserted {len(test_data_sorted)}" - - # Verify each row matches exactly - for inserted, exported in zip(test_data_sorted, exported_data_sorted, strict=False): - assert ( - inserted == exported - ), f"Data mismatch for ID {inserted[0]}: inserted {inserted} vs exported {exported}" - - print(f"\nโœ“ All {len(test_data)} rows verified - data integrity maintained") - - @pytest.mark.asyncio - async def test_complex_data_types_round_trip(self, session): - """ - Test complex CQL data types maintain integrity. - - What this tests: - --------------- - 1. Collections (list, set, map) - 2. UUID types - 3. Timestamp/date types - 4. Decimal types - 5. Large text/blob data - - Why this matters: - ---------------- - - Real tables use complex types - - Collections need special handling - - Precision must be maintained - - Production data is complex - """ - # Create table with complex types - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_test.complex_integrity ( - id UUID PRIMARY KEY, - created TIMESTAMP, - amount DECIMAL, - tags SET, - metadata MAP, - events LIST, - data BLOB - ) - """ - ) - - await session.execute("TRUNCATE bulk_test.complex_integrity") - - # Insert test data - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.complex_integrity - (id, created, amount, tags, metadata, events, data) - VALUES (?, ?, ?, ?, ?, ?, ?) 
- """ - ) - - # Create test data - test_id = uuid.uuid4() - test_created = datetime.utcnow().replace(microsecond=0) # Cassandra timestamp precision - test_amount = Decimal("12345.6789") - test_tags = {"python", "cassandra", "async", "test"} - test_metadata = {"version": 1, "retries": 3, "timeout": 30} - test_events = [ - datetime(2024, 1, 1, 10, 0, 0), - datetime(2024, 1, 2, 11, 30, 0), - datetime(2024, 1, 3, 15, 45, 0), - ] - test_data = b"Binary data with \x00 null bytes and \xff high bytes" - - # Insert the data - await session.execute( - insert_stmt, - ( - test_id, - test_created, - test_amount, - test_tags, - test_metadata, - test_events, - test_data, - ), - ) - - # Export and verify - operator = TokenAwareBulkOperator(session) - exported_rows = [] - - async for row in operator.export_by_token_ranges( - keyspace="bulk_test", - table="complex_integrity", - split_count=2, - ): - exported_rows.append(row) - - # Should have exactly one row - assert len(exported_rows) == 1, f"Expected 1 row, got {len(exported_rows)}" - - row = exported_rows[0] - - # Verify each field - assert row.id == test_id, f"UUID mismatch: {row.id} vs {test_id}" - assert row.created == test_created, f"Timestamp mismatch: {row.created} vs {test_created}" - assert row.amount == test_amount, f"Decimal mismatch: {row.amount} vs {test_amount}" - assert set(row.tags) == test_tags, f"Set mismatch: {set(row.tags)} vs {test_tags}" - assert ( - dict(row.metadata) == test_metadata - ), f"Map mismatch: {dict(row.metadata)} vs {test_metadata}" - assert ( - list(row.events) == test_events - ), f"List mismatch: {list(row.events)} vs {test_events}" - assert bytes(row.data) == test_data, f"Blob mismatch: {bytes(row.data)} vs {test_data}" - - print("\nโœ“ Complex data types verified - all types preserved correctly") - - @pytest.mark.asyncio - async def test_large_dataset_integrity(self, session): # noqa: C901 - """ - Test integrity with larger dataset across many token ranges. - - What this tests: - --------------- - 1. 50K rows with computed values - 2. Verify no rows lost in token ranges - 3. Verify no duplicate rows - 4. Check computed values match - - Why this matters: - ---------------- - - Production tables are large - - Token range bugs appear at scale - - Wraparound ranges must work correctly - - Performance under load - """ - # Create table - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_test.large_integrity ( - id INT PRIMARY KEY, - computed_value DOUBLE, - hash_value TEXT - ) - """ - ) - - await session.execute("TRUNCATE bulk_test.large_integrity") - - # Insert data with computed values - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.large_integrity (id, computed_value, hash_value) - VALUES (?, ?, ?) 
- """ - ) - - # Function to compute expected values - def compute_value(id_val): - return float(id_val * 3.14159 + id_val**0.5) - - def compute_hash(id_val): - return f"hash_{id_val % 1000:03d}_{id_val}" - - # Insert 50K rows in batches - total_rows = 50000 - batch_size = 1000 - - print(f"\nInserting {total_rows} rows for large dataset test...") - - for batch_start in range(0, total_rows, batch_size): - tasks = [] - for i in range(batch_start, min(batch_start + batch_size, total_rows)): - tasks.append( - session.execute( - insert_stmt, - ( - i, - compute_value(i), - compute_hash(i), - ), - ) - ) - await asyncio.gather(*tasks) - - if (batch_start + batch_size) % 10000 == 0: - print(f" Inserted {batch_start + batch_size} rows...") - - # Export all data - operator = TokenAwareBulkOperator(session) - exported_ids = set() - value_mismatches = [] - hash_mismatches = [] - - print("\nExporting and verifying data...") - - async for row in operator.export_by_token_ranges( - keyspace="bulk_test", - table="large_integrity", - split_count=32, # Many splits to test range handling - ): - # Check for duplicates - if row.id in exported_ids: - pytest.fail(f"Duplicate ID exported: {row.id}") - exported_ids.add(row.id) - - # Verify computed values - expected_value = compute_value(row.id) - if abs(row.computed_value - expected_value) > 0.0001: # Float precision - value_mismatches.append((row.id, row.computed_value, expected_value)) - - expected_hash = compute_hash(row.id) - if row.hash_value != expected_hash: - hash_mismatches.append((row.id, row.hash_value, expected_hash)) - - # Verify completeness - assert ( - len(exported_ids) == total_rows - ), f"Missing rows: exported {len(exported_ids)} vs inserted {total_rows}" - - # Check for missing IDs - expected_ids = set(range(total_rows)) - missing_ids = expected_ids - exported_ids - if missing_ids: - pytest.fail(f"Missing IDs: {sorted(list(missing_ids))[:10]}...") # Show first 10 - - # Check for value mismatches - if value_mismatches: - pytest.fail(f"Value mismatches found: {value_mismatches[:5]}...") # Show first 5 - - if hash_mismatches: - pytest.fail(f"Hash mismatches found: {hash_mismatches[:5]}...") # Show first 5 - - print(f"\nโœ“ All {total_rows} rows verified - large dataset integrity maintained") - print(" - No missing rows") - print(" - No duplicate rows") - print(" - All computed values correct") - print(" - All hash values correct") - - @pytest.mark.asyncio - async def test_wraparound_range_data_integrity(self, session): - """ - Test data integrity specifically for wraparound token ranges. - - What this tests: - --------------- - 1. Insert data with known tokens that span wraparound - 2. Verify wraparound range handling preserves data - 3. No data lost at ring boundaries - 4. Prepared statements work correctly with wraparound - - Why this matters: - ---------------- - - Wraparound ranges are error-prone - - Must split into two queries correctly - - Data at ring boundaries is critical - - Common source of data loss bugs - """ - # Create table - await session.execute( - """ - CREATE TABLE IF NOT EXISTS bulk_test.wraparound_test ( - id INT PRIMARY KEY, - token_value BIGINT, - data TEXT - ) - """ - ) - - await session.execute("TRUNCATE bulk_test.wraparound_test") - - # First, let's find some IDs that hash to extreme token values - print("\nFinding IDs with extreme token values...") - - # Insert some data and check their tokens - insert_stmt = await session.prepare( - """ - INSERT INTO bulk_test.wraparound_test (id, token_value, data) - VALUES (?, ?, ?) 
- """ - ) - - # Try different IDs to find ones with extreme tokens - test_ids = [] - for i in range(100000, 200000): - # First insert a dummy row to query the token - await session.execute(insert_stmt, (i, 0, f"dummy_{i}")) - result = await session.execute( - f"SELECT token(id) as t FROM bulk_test.wraparound_test WHERE id = {i}" - ) - row = result.one() - if row: - token = row.t - # Remove the dummy row - await session.execute(f"DELETE FROM bulk_test.wraparound_test WHERE id = {i}") - - # Look for very high positive or very low negative tokens - if token > 9000000000000000000 or token < -9000000000000000000: - test_ids.append((i, token)) - await session.execute(insert_stmt, (i, token, f"data_{i}")) - - if len(test_ids) >= 20: - break - - print(f" Found {len(test_ids)} IDs with extreme tokens") - - # Export and verify - operator = TokenAwareBulkOperator(session) - exported_data = {} - - async for row in operator.export_by_token_ranges( - keyspace="bulk_test", - table="wraparound_test", - split_count=8, - ): - exported_data[row.id] = (row.token_value, row.data) - - # Verify all data was exported - for id_val, token_val in test_ids: - assert id_val in exported_data, f"Missing ID {id_val} with token {token_val}" - - exported_token, exported_data_val = exported_data[id_val] - assert ( - exported_token == token_val - ), f"Token mismatch for ID {id_val}: {exported_token} vs {token_val}" - assert ( - exported_data_val == f"data_{id_val}" - ), f"Data mismatch for ID {id_val}: {exported_data_val} vs data_{id_val}" - - print("\nโœ“ Wraparound range data integrity verified") - print(f" - All {len(test_ids)} extreme token rows exported correctly") - print(" - Token values preserved") - print(" - Data values preserved") diff --git a/libs/async-cassandra-bulk/examples/tests/integration/test_export_formats.py b/libs/async-cassandra-bulk/examples/tests/integration/test_export_formats.py deleted file mode 100644 index eedf0ee..0000000 --- a/libs/async-cassandra-bulk/examples/tests/integration/test_export_formats.py +++ /dev/null @@ -1,449 +0,0 @@ -""" -Integration tests for export formats. - -What this tests: ---------------- -1. CSV export with real data -2. JSON export formats (JSONL and array) -3. Parquet export with schema mapping -4. Compression options -5. 
Data integrity across formats - -Why this matters: ----------------- -- Export formats are critical for data pipelines -- Each format has different use cases -- Parquet is foundation for Iceberg -- Must preserve data types correctly -""" - -import csv -import gzip -import json - -import pytest - -try: - import pyarrow.parquet as pq - - PYARROW_AVAILABLE = True -except ImportError: - PYARROW_AVAILABLE = False - -from async_cassandra import AsyncCluster -from bulk_operations.bulk_operator import TokenAwareBulkOperator - - -@pytest.mark.integration -class TestExportFormats: - """Test export to different formats.""" - - @pytest.fixture - async def cluster(self): - """Create connection to test cluster.""" - cluster = AsyncCluster( - contact_points=["localhost"], - port=9042, - ) - yield cluster - await cluster.shutdown() - - @pytest.fixture - async def session(self, cluster): - """Create test session with test data.""" - session = await cluster.connect() - - # Create test keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS export_test - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - # Create test table with various types - await session.execute( - """ - CREATE TABLE IF NOT EXISTS export_test.data_types ( - id INT PRIMARY KEY, - text_val TEXT, - int_val INT, - float_val FLOAT, - bool_val BOOLEAN, - list_val LIST, - set_val SET, - map_val MAP, - null_val TEXT - ) - """ - ) - - # Clear and insert test data - await session.execute("TRUNCATE export_test.data_types") - - insert_stmt = await session.prepare( - """ - INSERT INTO export_test.data_types - (id, text_val, int_val, float_val, bool_val, - list_val, set_val, map_val, null_val) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) - """ - ) - - # Insert diverse test data - test_data = [ - (1, "test1", 100, 1.5, True, ["a", "b"], {1, 2}, {"k1": "v1"}, None), - (2, "test2", -50, -2.5, False, [], None, {}, None), - (3, "special'chars\"test", 0, 0.0, True, None, {0}, None, None), - (4, "unicode_test_ไฝ ๅฅฝ", 999, 3.14, False, ["x"], {-1}, {"k": "v"}, None), - ] - - for row in test_data: - await session.execute(insert_stmt, row) - - yield session - - @pytest.mark.asyncio - async def test_csv_export_basic(self, session, tmp_path): - """ - Test basic CSV export functionality. - - What this tests: - --------------- - 1. CSV export creates valid file - 2. All rows are exported - 3. Data types are properly serialized - 4. NULL values handled correctly - - Why this matters: - ---------------- - - CSV is most common export format - - Must work with Excel and other tools - - Data integrity is critical - """ - operator = TokenAwareBulkOperator(session) - output_path = tmp_path / "test.csv" - - # Export to CSV - result = await operator.export_to_csv( - keyspace="export_test", - table="data_types", - output_path=output_path, - ) - - # Verify file exists - assert output_path.exists() - assert result.rows_exported == 4 - - # Read and verify content - with open(output_path) as f: - reader = csv.DictReader(f) - rows = list(reader) - - assert len(rows) == 4 - - # Verify first row - row1 = rows[0] - assert row1["id"] == "1" - assert row1["text_val"] == "test1" - assert row1["int_val"] == "100" - assert row1["float_val"] == "1.5" - assert row1["bool_val"] == "true" - assert "[a, b]" in row1["list_val"] - assert row1["null_val"] == "" # Default NULL representation - - @pytest.mark.asyncio - async def test_csv_export_compressed(self, session, tmp_path): - """ - Test CSV export with compression. 
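The read-back side of this compressed-export check needs nothing beyond the standard library. A minimal sketch, assuming only the `.csv.gzip` suffix convention these tests use for the compressed output path; the exporter itself is exercised in the test body that follows.

```python
import csv
import gzip
from pathlib import Path


def read_compressed_csv(output_path: Path) -> list[dict]:
    """Read a gzip-compressed CSV export back into a list of row dicts."""
    # The tests expect the exporter to append the compression suffix,
    # e.g. test.csv -> test.csv.gzip (assumed convention from these tests).
    compressed_path = output_path.with_suffix(".csv.gzip")
    with gzip.open(compressed_path, "rt", newline="") as f:
        return list(csv.DictReader(f))


# Usage in a test:
#   rows = read_compressed_csv(tmp_path / "test.csv")
#   assert len(rows) == 4
```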
- - What this tests: - --------------- - 1. Gzip compression works - 2. File has correct extension - 3. Compressed data is valid - 4. Size reduction achieved - - Why this matters: - ---------------- - - Large exports need compression - - Network transfer efficiency - - Storage cost reduction - """ - operator = TokenAwareBulkOperator(session) - output_path = tmp_path / "test.csv" - - # Export with compression - await operator.export_to_csv( - keyspace="export_test", - table="data_types", - output_path=output_path, - compression="gzip", - ) - - # Verify compressed file - compressed_path = output_path.with_suffix(".csv.gzip") - assert compressed_path.exists() - - # Read compressed content - with gzip.open(compressed_path, "rt") as f: - reader = csv.DictReader(f) - rows = list(reader) - - assert len(rows) == 4 - - @pytest.mark.asyncio - async def test_json_export_line_delimited(self, session, tmp_path): - """ - Test JSON line-delimited export. - - What this tests: - --------------- - 1. JSONL format (one JSON per line) - 2. Each line is valid JSON - 3. Data types preserved - 4. Collections handled correctly - - Why this matters: - ---------------- - - JSONL works with streaming tools - - Each line can be processed independently - - Better for large datasets - """ - operator = TokenAwareBulkOperator(session) - output_path = tmp_path / "test.jsonl" - - # Export as JSONL - result = await operator.export_to_json( - keyspace="export_test", - table="data_types", - output_path=output_path, - format_mode="jsonl", - ) - - assert output_path.exists() - assert result.rows_exported == 4 - - # Read and verify JSONL - with open(output_path) as f: - lines = f.readlines() - - assert len(lines) == 4 - - # Parse each line - rows = [json.loads(line) for line in lines] - - # Verify data types - row1 = rows[0] - assert row1["id"] == 1 - assert row1["text_val"] == "test1" - assert row1["bool_val"] is True - assert row1["list_val"] == ["a", "b"] - assert row1["set_val"] == [1, 2] # Sets become lists in JSON - assert row1["map_val"] == {"k1": "v1"} - assert row1["null_val"] is None - - @pytest.mark.asyncio - async def test_json_export_array(self, session, tmp_path): - """ - Test JSON array export. - - What this tests: - --------------- - 1. Valid JSON array format - 2. Proper array structure - 3. Pretty printing option - 4. Complete document - - Why this matters: - ---------------- - - Some APIs expect JSON arrays - - Easier for small datasets - - Human readable with indent - """ - operator = TokenAwareBulkOperator(session) - output_path = tmp_path / "test.json" - - # Export as JSON array - await operator.export_to_json( - keyspace="export_test", - table="data_types", - output_path=output_path, - format_mode="array", - indent=2, - ) - - assert output_path.exists() - - # Read and parse JSON - with open(output_path) as f: - data = json.load(f) - - assert isinstance(data, list) - assert len(data) == 4 - - # Verify structure - assert all(isinstance(row, dict) for row in data) - - @pytest.mark.asyncio - @pytest.mark.skipif(not PYARROW_AVAILABLE, reason="PyArrow not installed") - async def test_parquet_export(self, session, tmp_path): - """ - Test Parquet export - foundation for Iceberg. - - What this tests: - --------------- - 1. Valid Parquet file created - 2. Schema correctly mapped - 3. Data types preserved - 4. 
Row groups created - - Why this matters: - ---------------- - - Parquet is THE format for Iceberg - - Columnar storage for analytics - - Schema evolution support - - Excellent compression - """ - operator = TokenAwareBulkOperator(session) - output_path = tmp_path / "test.parquet" - - # Export to Parquet - result = await operator.export_to_parquet( - keyspace="export_test", - table="data_types", - output_path=output_path, - row_group_size=2, # Small for testing - ) - - assert output_path.exists() - assert result.rows_exported == 4 - - # Read Parquet file - table = pq.read_table(output_path) - - # Verify schema - schema = table.schema - assert "id" in schema.names - assert "text_val" in schema.names - assert "bool_val" in schema.names - - # Verify data - df = table.to_pandas() - assert len(df) == 4 - - # Check data types preserved - assert df.loc[0, "id"] == 1 - assert df.loc[0, "text_val"] == "test1" - assert df.loc[0, "bool_val"] is True or df.loc[0, "bool_val"] == 1 # numpy bool comparison - - # Verify row groups - parquet_file = pq.ParquetFile(output_path) - assert parquet_file.num_row_groups == 2 # 4 rows / 2 per group - - @pytest.mark.asyncio - async def test_export_with_column_selection(self, session, tmp_path): - """ - Test exporting specific columns only. - - What this tests: - --------------- - 1. Column selection works - 2. Only selected columns exported - 3. Order preserved - 4. Works across all formats - - Why this matters: - ---------------- - - Reduce export size - - Privacy/security (exclude sensitive columns) - - Performance optimization - """ - operator = TokenAwareBulkOperator(session) - columns = ["id", "text_val", "bool_val"] - - # Test CSV - csv_path = tmp_path / "selected.csv" - await operator.export_to_csv( - keyspace="export_test", - table="data_types", - output_path=csv_path, - columns=columns, - ) - - with open(csv_path) as f: - reader = csv.DictReader(f) - row = next(reader) - assert set(row.keys()) == set(columns) - - # Test JSON - json_path = tmp_path / "selected.jsonl" - await operator.export_to_json( - keyspace="export_test", - table="data_types", - output_path=json_path, - columns=columns, - ) - - with open(json_path) as f: - row = json.loads(f.readline()) - assert set(row.keys()) == set(columns) - - @pytest.mark.asyncio - async def test_export_progress_tracking(self, session, tmp_path): - """ - Test progress tracking and resume capability. - - What this tests: - --------------- - 1. Progress callbacks invoked - 2. Progress saved to file - 3. Resume information correct - 4. 
Stats accurately tracked - - Why this matters: - ---------------- - - Long exports need monitoring - - Resume saves time on failures - - Users need feedback - """ - operator = TokenAwareBulkOperator(session) - output_path = tmp_path / "progress_test.csv" - - progress_updates = [] - - async def track_progress(progress): - progress_updates.append( - { - "rows": progress.rows_exported, - "bytes": progress.bytes_written, - "percentage": progress.progress_percentage, - } - ) - - # Export with progress tracking - result = await operator.export_to_csv( - keyspace="export_test", - table="data_types", - output_path=output_path, - progress_callback=track_progress, - ) - - # Verify progress was tracked - assert len(progress_updates) > 0 - assert result.rows_exported == 4 - assert result.bytes_written > 0 - - # Verify progress file - progress_file = output_path.with_suffix(".csv.progress") - assert progress_file.exists() - - # Load and verify progress - from bulk_operations.exporters import ExportProgress - - loaded = ExportProgress.load(progress_file) - assert loaded.rows_exported == 4 - assert loaded.is_complete diff --git a/libs/async-cassandra-bulk/examples/tests/integration/test_token_discovery.py b/libs/async-cassandra-bulk/examples/tests/integration/test_token_discovery.py deleted file mode 100644 index b99115f..0000000 --- a/libs/async-cassandra-bulk/examples/tests/integration/test_token_discovery.py +++ /dev/null @@ -1,198 +0,0 @@ -""" -Integration tests for token range discovery with vnodes. - -What this tests: ---------------- -1. Token range discovery matches cluster vnodes configuration -2. Validation against nodetool describering output -3. Token distribution across nodes -4. Non-overlapping and complete token coverage - -Why this matters: ----------------- -- Vnodes create hundreds of non-contiguous ranges -- Token metadata must match cluster reality -- Incorrect discovery means data loss -- Production clusters always use vnodes -""" - -import subprocess -from collections import defaultdict - -import pytest - -from async_cassandra import AsyncCluster -from bulk_operations.token_utils import TOTAL_TOKEN_RANGE, discover_token_ranges - - -@pytest.mark.integration -class TestTokenDiscovery: - """Test token range discovery against real Cassandra cluster.""" - - @pytest.fixture - async def cluster(self): - """Create connection to test cluster.""" - # Connect to all three nodes - cluster = AsyncCluster( - contact_points=["localhost", "127.0.0.1", "127.0.0.2"], - port=9042, - ) - yield cluster - await cluster.shutdown() - - @pytest.fixture - async def session(self, cluster): - """Create test session with keyspace.""" - session = await cluster.connect() - - # Create test keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS bulk_test - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 3 - } - """ - ) - - yield session - - @pytest.mark.asyncio - async def test_token_range_discovery_with_vnodes(self, session): - """ - Test token range discovery matches cluster vnodes configuration. - - What this tests: - --------------- - 1. Number of ranges matches vnode configuration - 2. Each node owns approximately equal ranges - 3. All ranges have correct replica information - 4. 
Token ranges are non-overlapping and complete - - Why this matters: - ---------------- - - With 256 vnodes ร— 3 nodes = ~768 ranges expected - - Vnodes distribute ownership across the ring - - Incorrect discovery means data loss - - Must handle non-contiguous ownership correctly - """ - ranges = await discover_token_ranges(session, "bulk_test") - - # With 3 nodes and 256 vnodes each, expect many ranges - # Due to replication factor 3, each range has 3 replicas - assert len(ranges) > 100, f"Expected many ranges with vnodes, got {len(ranges)}" - - # Count ranges per node - ranges_per_node = defaultdict(int) - for r in ranges: - for replica in r.replicas: - ranges_per_node[replica] += 1 - - print(f"\nToken ranges discovered: {len(ranges)}") - print("Ranges per node:") - for node, count in sorted(ranges_per_node.items()): - print(f" {node}: {count} ranges") - - # Each node should own approximately the same number of ranges - counts = list(ranges_per_node.values()) - if len(counts) >= 3: - avg_count = sum(counts) / len(counts) - for count in counts: - # Allow 20% variance - assert ( - 0.8 * avg_count <= count <= 1.2 * avg_count - ), f"Uneven distribution: {ranges_per_node}" - - # Verify ranges cover the entire ring - sorted_ranges = sorted(ranges, key=lambda r: r.start) - - # With vnodes, tokens are randomly distributed, so the first range - # won't necessarily start at MIN_TOKEN. What matters is: - # 1. No gaps between consecutive ranges - # 2. The last range wraps around to the first range - # 3. Total coverage equals the token space - - # Check for gaps or overlaps between consecutive ranges - gaps = 0 - for i in range(len(sorted_ranges) - 1): - current = sorted_ranges[i] - next_range = sorted_ranges[i + 1] - - # Ranges should be contiguous - if current.end != next_range.start: - gaps += 1 - print(f"Gap found: {current.end} to {next_range.start}") - - assert gaps == 0, f"Found {gaps} gaps in token ranges" - - # Verify the last range wraps around to the first - assert sorted_ranges[-1].end == sorted_ranges[0].start, ( - f"Ring not closed: last range ends at {sorted_ranges[-1].end}, " - f"first range starts at {sorted_ranges[0].start}" - ) - - # Verify total coverage - total_size = sum(r.size for r in ranges) - # Allow for small rounding differences - assert abs(total_size - TOTAL_TOKEN_RANGE) <= len( - ranges - ), f"Total coverage {total_size} differs from expected {TOTAL_TOKEN_RANGE}" - - @pytest.mark.asyncio - async def test_compare_with_nodetool_describering(self, session): - """ - Compare discovered ranges with nodetool describering output. - - What this tests: - --------------- - 1. Our discovery matches nodetool output - 2. Token boundaries are correct - 3. Replica assignments match - 4. 
No missing or extra ranges - - Why this matters: - ---------------- - - nodetool is the source of truth - - Mismatches indicate bugs in discovery - - Critical for production reliability - - Validates driver metadata accuracy - """ - ranges = await discover_token_ranges(session, "bulk_test") - - # Get nodetool output from first node - try: - result = subprocess.run( - ["podman", "exec", "bulk-cassandra-1", "nodetool", "describering", "bulk_test"], - capture_output=True, - text=True, - check=True, - ) - nodetool_output = result.stdout - except subprocess.CalledProcessError: - # Try docker if podman fails - try: - result = subprocess.run( - ["docker", "exec", "bulk-cassandra-1", "nodetool", "describering", "bulk_test"], - capture_output=True, - text=True, - check=True, - ) - nodetool_output = result.stdout - except subprocess.CalledProcessError as e: - pytest.skip(f"Cannot run nodetool: {e}") - - print("\nNodetool describering output (first 20 lines):") - print("\n".join(nodetool_output.split("\n")[:20])) - - # Parse token count from nodetool output - token_ranges_in_output = nodetool_output.count("TokenRange") - - print("\nComparison:") - print(f" Discovered ranges: {len(ranges)}") - print(f" Nodetool ranges: {token_ranges_in_output}") - - # Should have same number of ranges (allowing small variance) - assert ( - abs(len(ranges) - token_ranges_in_output) <= 5 - ), f"Mismatch in range count: discovered {len(ranges)} vs nodetool {token_ranges_in_output}" diff --git a/libs/async-cassandra-bulk/examples/tests/integration/test_token_splitting.py b/libs/async-cassandra-bulk/examples/tests/integration/test_token_splitting.py deleted file mode 100644 index 72bc290..0000000 --- a/libs/async-cassandra-bulk/examples/tests/integration/test_token_splitting.py +++ /dev/null @@ -1,283 +0,0 @@ -""" -Integration tests for token range splitting functionality. - -What this tests: ---------------- -1. Token range splitting with different strategies -2. Proportional splitting based on range sizes -3. Handling of very small ranges (vnodes) -4. Replica-aware clustering - -Why this matters: ----------------- -- Efficient parallelism requires good splitting -- Vnodes create many small ranges that shouldn't be over-split -- Replica clustering improves coordinator efficiency -- Performance optimization foundation -""" - -import pytest - -from async_cassandra import AsyncCluster -from bulk_operations.token_utils import TokenRangeSplitter, discover_token_ranges - - -@pytest.mark.integration -class TestTokenSplitting: - """Test token range splitting strategies.""" - - @pytest.fixture - async def cluster(self): - """Create connection to test cluster.""" - cluster = AsyncCluster( - contact_points=["localhost"], - port=9042, - ) - yield cluster - await cluster.shutdown() - - @pytest.fixture - async def session(self, cluster): - """Create test session with keyspace.""" - session = await cluster.connect() - - # Create test keyspace - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS bulk_test - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 1 - } - """ - ) - - yield session - - @pytest.mark.asyncio - async def test_token_range_splitting_with_vnodes(self, session): - """ - Test that splitting handles vnode token ranges correctly. - - What this tests: - --------------- - 1. Natural ranges from vnodes are small - 2. Splitting respects range boundaries - 3. Very small ranges aren't over-split - 4. 
Large splits still cover all ranges - - Why this matters: - ---------------- - - Vnodes create many small ranges - - Over-splitting causes overhead - - Under-splitting reduces parallelism - - Must balance performance - """ - ranges = await discover_token_ranges(session, "bulk_test") - splitter = TokenRangeSplitter() - - # Test different split counts - for split_count in [10, 50, 100, 500]: - splits = splitter.split_proportionally(ranges, split_count) - - print(f"\nSplitting {len(ranges)} ranges into {split_count} splits:") - print(f" Actual splits: {len(splits)}") - - # Verify coverage - total_size = sum(r.size for r in ranges) - split_size = sum(s.size for s in splits) - - assert split_size == total_size, f"Split size mismatch: {split_size} vs {total_size}" - - # With vnodes, we might not achieve the exact split count - # because many ranges are too small to split - if split_count < len(ranges): - assert ( - len(splits) >= split_count * 0.5 - ), f"Too few splits: {len(splits)} (wanted ~{split_count})" - - @pytest.mark.asyncio - async def test_single_range_splitting(self, session): - """ - Test splitting of individual token ranges. - - What this tests: - --------------- - 1. Single range can be split evenly - 2. Last split gets remainder - 3. Small ranges aren't over-split - 4. Split boundaries are correct - - Why this matters: - ---------------- - - Foundation of proportional splitting - - Must handle edge cases correctly - - Affects query generation - - Performance depends on even distribution - """ - ranges = await discover_token_ranges(session, "bulk_test") - splitter = TokenRangeSplitter() - - # Find a reasonably large range to test - sorted_ranges = sorted(ranges, key=lambda r: r.size, reverse=True) - large_range = sorted_ranges[0] - - print("\nTesting single range splitting:") - print(f" Range size: {large_range.size}") - print(f" Range: {large_range.start} to {large_range.end}") - - # Test different split counts - for split_count in [1, 2, 5, 10]: - splits = splitter.split_single_range(large_range, split_count) - - print(f"\n Splitting into {split_count}:") - print(f" Actual splits: {len(splits)}") - - # Verify coverage - assert sum(s.size for s in splits) == large_range.size - - # Verify contiguous - for i in range(len(splits) - 1): - assert splits[i].end == splits[i + 1].start - - # Verify boundaries - assert splits[0].start == large_range.start - assert splits[-1].end == large_range.end - - # Verify replicas preserved - for s in splits: - assert s.replicas == large_range.replicas - - @pytest.mark.asyncio - async def test_replica_clustering(self, session): - """ - Test clustering ranges by replica sets. - - What this tests: - --------------- - 1. Ranges are correctly grouped by replicas - 2. All ranges are included in clusters - 3. No ranges are duplicated - 4. 
Replica sets are handled consistently - - Why this matters: - ---------------- - - Coordinator efficiency depends on replica locality - - Reduces network hops in multi-DC setups - - Improves cache utilization - - Foundation for topology-aware operations - """ - # For this test, use multi-node replication - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS bulk_test_replicated - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 3 - } - """ - ) - - ranges = await discover_token_ranges(session, "bulk_test_replicated") - splitter = TokenRangeSplitter() - - clusters = splitter.cluster_by_replicas(ranges) - - print("\nReplica clustering results:") - print(f" Total ranges: {len(ranges)}") - print(f" Replica clusters: {len(clusters)}") - - total_clustered = sum(len(ranges_list) for ranges_list in clusters.values()) - print(f" Total ranges in clusters: {total_clustered}") - - # Verify all ranges are clustered - assert total_clustered == len( - ranges - ), f"Not all ranges clustered: {total_clustered} vs {len(ranges)}" - - # Verify no duplicates - seen_ranges = set() - for _replica_set, range_list in clusters.items(): - for r in range_list: - range_key = (r.start, r.end) - assert range_key not in seen_ranges, f"Duplicate range: {range_key}" - seen_ranges.add(range_key) - - # Print cluster distribution - for replica_set, range_list in sorted(clusters.items()): - print(f" Replicas {replica_set}: {len(range_list)} ranges") - - @pytest.mark.asyncio - async def test_proportional_splitting_accuracy(self, session): - """ - Test that proportional splitting maintains relative sizes. - - What this tests: - --------------- - 1. Large ranges get more splits than small ones - 2. Total coverage is preserved - 3. Split distribution matches range distribution - 4. 
No ranges are lost or duplicated - - Why this matters: - ---------------- - - Even work distribution across ranges - - Prevents hotspots from uneven splitting - - Optimizes parallel execution - - Critical for performance - """ - ranges = await discover_token_ranges(session, "bulk_test") - splitter = TokenRangeSplitter() - - # Calculate range size distribution - total_size = sum(r.size for r in ranges) - range_fractions = [(r, r.size / total_size) for r in ranges] - - # Sort by size for analysis - range_fractions.sort(key=lambda x: x[1], reverse=True) - - print("\nRange size distribution:") - print(f" Largest range: {range_fractions[0][1]:.2%} of total") - print(f" Smallest range: {range_fractions[-1][1]:.2%} of total") - print(f" Median range: {range_fractions[len(range_fractions)//2][1]:.2%} of total") - - # Test proportional splitting - target_splits = 100 - splits = splitter.split_proportionally(ranges, target_splits) - - # Analyze split distribution - splits_per_range = {} - for split in splits: - # Find which original range this split came from - for orig_range in ranges: - if (split.start >= orig_range.start and split.end <= orig_range.end) or ( - orig_range.start == split.start and orig_range.end == split.end - ): - key = (orig_range.start, orig_range.end) - splits_per_range[key] = splits_per_range.get(key, 0) + 1 - break - - # Verify proportionality - print("\nProportional splitting results:") - print(f" Target splits: {target_splits}") - print(f" Actual splits: {len(splits)}") - print(f" Ranges that got splits: {len(splits_per_range)}") - - # Large ranges should get more splits - large_range = range_fractions[0][0] - large_range_key = (large_range.start, large_range.end) - large_range_splits = splits_per_range.get(large_range_key, 0) - - small_range = range_fractions[-1][0] - small_range_key = (small_range.start, small_range.end) - small_range_splits = splits_per_range.get(small_range_key, 0) - - print(f" Largest range got {large_range_splits} splits") - print(f" Smallest range got {small_range_splits} splits") - - # Large ranges should generally get more splits - # (unless they're still too small to split effectively) - if large_range.size > small_range.size * 10: - assert ( - large_range_splits >= small_range_splits - ), "Large range should get at least as many splits as small range" diff --git a/libs/async-cassandra-bulk/examples/tests/unit/__init__.py b/libs/async-cassandra-bulk/examples/tests/unit/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/libs/async-cassandra-bulk/examples/tests/unit/test_bulk_operator.py b/libs/async-cassandra-bulk/examples/tests/unit/test_bulk_operator.py deleted file mode 100644 index af03562..0000000 --- a/libs/async-cassandra-bulk/examples/tests/unit/test_bulk_operator.py +++ /dev/null @@ -1,381 +0,0 @@ -""" -Unit tests for TokenAwareBulkOperator. - -What this tests: ---------------- -1. Parallel execution of token range queries -2. Result aggregation and streaming -3. Progress tracking -4. Error handling and recovery - -Why this matters: ----------------- -- Ensures correct parallel processing -- Validates data completeness -- Confirms non-blocking async behavior -- Handles failures gracefully - -Additional context: ---------------------------------- -These tests mock the async-cassandra library to test -our bulk operation logic in isolation. 
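The isolation strategy described above comes down to two stubs: patch `discover_token_ranges` to return hand-built `TokenRange` objects, and give `session.execute` canned count results so only the operator's aggregation logic runs. A condensed sketch of that wiring, assuming the `mock_session` fixture defined in the test class below and a pytest-asyncio setup to drive the coroutine:

```python
from unittest.mock import AsyncMock, Mock, patch

import pytest

from bulk_operations.bulk_operator import TokenAwareBulkOperator
from bulk_operations.token_utils import TokenRange


@pytest.mark.asyncio
async def test_count_aggregates_mocked_ranges(mock_session):
    operator = TokenAwareBulkOperator(mock_session)

    with patch(
        "bulk_operations.bulk_operator.discover_token_ranges", new_callable=AsyncMock
    ) as mock_discover:
        # Two fake ranges owned by a single node.
        mock_discover.return_value = [
            TokenRange(start=-1000, end=0, replicas=["127.0.0.1"]),
            TokenRange(start=0, end=1000, replicas=["127.0.0.1"]),
        ]
        # One canned COUNT result per range.
        mock_session.execute.side_effect = [
            Mock(one=Mock(return_value=Mock(count=500))),
            Mock(one=Mock(return_value=Mock(count=300))),
        ]

        total = await operator.count_by_token_ranges(
            keyspace="test_ks", table="test_table", split_count=2
        )

    # The operator should simply sum the per-range counts.
    assert total == 800
```

The tests below apply this same wiring to parallel execution, partial failures, streaming export, and progress statistics.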
-""" - -import asyncio -from unittest.mock import AsyncMock, Mock, patch - -import pytest - -from bulk_operations.bulk_operator import ( - BulkOperationError, - BulkOperationStats, - TokenAwareBulkOperator, -) - - -class TestTokenAwareBulkOperator: - """Test the main bulk operator class.""" - - @pytest.fixture - def mock_cluster(self): - """Create a mock AsyncCluster.""" - cluster = Mock() - cluster.contact_points = ["127.0.0.1", "127.0.0.2", "127.0.0.3"] - return cluster - - @pytest.fixture - def mock_session(self, mock_cluster): - """Create a mock AsyncSession.""" - session = Mock() - # Mock the underlying sync session that has cluster attribute - session._session = Mock() - session._session.cluster = mock_cluster - session.execute = AsyncMock() - session.execute_stream = AsyncMock() - session.prepare = AsyncMock(return_value=Mock()) # Mock prepare method - - # Mock metadata structure - metadata = Mock() - - # Create proper column mock - partition_key_col = Mock() - partition_key_col.name = "id" # Set the name attribute properly - - keyspaces = { - "test_ks": Mock(tables={"test_table": Mock(partition_key=[partition_key_col])}) - } - metadata.keyspaces = keyspaces - mock_cluster.metadata = metadata - - return session - - @pytest.mark.unit - async def test_count_by_token_ranges_single_node(self, mock_session): - """ - Test counting rows with token ranges on single node. - - What this tests: - --------------- - 1. Token range discovery is called correctly - 2. Queries are generated for each token range - 3. Results are aggregated properly - 4. Single node operation works correctly - - Why this matters: - ---------------- - - Ensures basic counting functionality works - - Validates token range splitting logic - - Confirms proper result aggregation - - Foundation for more complex multi-node operations - """ - operator = TokenAwareBulkOperator(mock_session) - - # Mock token range discovery - with patch( - "bulk_operations.bulk_operator.discover_token_ranges", new_callable=AsyncMock - ) as mock_discover: - # Create proper TokenRange mocks - from bulk_operations.token_utils import TokenRange - - mock_ranges = [ - TokenRange(start=-1000, end=0, replicas=["127.0.0.1"]), - TokenRange(start=0, end=1000, replicas=["127.0.0.1"]), - ] - mock_discover.return_value = mock_ranges - - # Mock query results - mock_session.execute.side_effect = [ - Mock(one=Mock(return_value=Mock(count=500))), # First range - Mock(one=Mock(return_value=Mock(count=300))), # Second range - ] - - # Execute count - result = await operator.count_by_token_ranges( - keyspace="test_ks", table="test_table", split_count=2 - ) - - assert result == 800 - assert mock_session.execute.call_count == 2 - - @pytest.mark.unit - async def test_count_with_parallel_execution(self, mock_session): - """ - Test that counts are executed in parallel. - - What this tests: - --------------- - 1. Multiple token ranges are processed concurrently - 2. Parallelism limits are respected - 3. Total execution time reflects parallel processing - 4. 
Results are correctly aggregated from parallel tasks - - Why this matters: - ---------------- - - Parallel execution is critical for performance - - Must not block the event loop - - Resource limits must be respected - - Common pattern in production bulk operations - """ - operator = TokenAwareBulkOperator(mock_session) - - # Track execution times - execution_times = [] - - async def mock_execute_with_delay(stmt, params=None): - start = asyncio.get_event_loop().time() - await asyncio.sleep(0.1) # Simulate query time - execution_times.append(asyncio.get_event_loop().time() - start) - return Mock(one=Mock(return_value=Mock(count=100))) - - mock_session.execute = mock_execute_with_delay - - with patch( - "bulk_operations.bulk_operator.discover_token_ranges", new_callable=AsyncMock - ) as mock_discover: - # Create 4 ranges - from bulk_operations.token_utils import TokenRange - - mock_ranges = [ - TokenRange(start=i * 1000, end=(i + 1) * 1000, replicas=["node1"]) for i in range(4) - ] - mock_discover.return_value = mock_ranges - - # Execute count - start_time = asyncio.get_event_loop().time() - result = await operator.count_by_token_ranges( - keyspace="test_ks", table="test_table", split_count=4, parallelism=4 - ) - total_time = asyncio.get_event_loop().time() - start_time - - assert result == 400 # 4 ranges * 100 each - # If executed in parallel, total time should be ~0.1s, not 0.4s - assert total_time < 0.2 - - @pytest.mark.unit - async def test_count_with_error_handling(self, mock_session): - """ - Test error handling during count operations. - - What this tests: - --------------- - 1. Partial failures are handled gracefully - 2. BulkOperationError is raised with partial results - 3. Individual errors are collected and reported - 4. Operation continues despite individual failures - - Why this matters: - ---------------- - - Network issues can cause partial failures - - Users need visibility into what succeeded - - Partial results are often useful - - Critical for production reliability - """ - operator = TokenAwareBulkOperator(mock_session) - - with patch( - "bulk_operations.bulk_operator.discover_token_ranges", new_callable=AsyncMock - ) as mock_discover: - from bulk_operations.token_utils import TokenRange - - mock_ranges = [ - TokenRange(start=0, end=1000, replicas=["node1"]), - TokenRange(start=1000, end=2000, replicas=["node2"]), - ] - mock_discover.return_value = mock_ranges - - # First succeeds, second fails - mock_session.execute.side_effect = [ - Mock(one=Mock(return_value=Mock(count=500))), - Exception("Connection timeout"), - ] - - # Should raise BulkOperationError - with pytest.raises(BulkOperationError) as exc_info: - await operator.count_by_token_ranges( - keyspace="test_ks", table="test_table", split_count=2 - ) - - assert "Failed to count" in str(exc_info.value) - assert exc_info.value.partial_result == 500 - - @pytest.mark.unit - async def test_export_streaming(self, mock_session): - """ - Test streaming export functionality. - - What this tests: - --------------- - 1. Token ranges are discovered for export - 2. Results are streamed asynchronously - 3. Memory usage remains constant (streaming) - 4. 
All rows are yielded in order - - Why this matters: - ---------------- - - Streaming prevents memory exhaustion - - Essential for large dataset exports - - Async iteration must work correctly - - Foundation for Iceberg export functionality - """ - operator = TokenAwareBulkOperator(mock_session) - - # Mock token range discovery - with patch( - "bulk_operations.bulk_operator.discover_token_ranges", new_callable=AsyncMock - ) as mock_discover: - from bulk_operations.token_utils import TokenRange - - mock_ranges = [TokenRange(start=0, end=1000, replicas=["node1"])] - mock_discover.return_value = mock_ranges - - # Mock streaming results - async def mock_stream_results(): - for i in range(10): - row = Mock() - row.id = i - row.name = f"row_{i}" - yield row - - mock_stream_context = AsyncMock() - mock_stream_context.__aenter__.return_value = mock_stream_results() - mock_stream_context.__aexit__.return_value = None - - mock_session.execute_stream.return_value = mock_stream_context - - # Collect exported rows - exported_rows = [] - async for row in operator.export_by_token_ranges( - keyspace="test_ks", table="test_table", split_count=1 - ): - exported_rows.append(row) - - assert len(exported_rows) == 10 - assert exported_rows[0].id == 0 - assert exported_rows[9].name == "row_9" - - @pytest.mark.unit - async def test_progress_callback(self, mock_session): - """ - Test progress callback functionality. - - What this tests: - --------------- - 1. Progress callbacks are invoked during operation - 2. Statistics are updated correctly - 3. Progress percentage is calculated accurately - 4. Final statistics reflect complete operation - - Why this matters: - ---------------- - - Users need visibility into long-running operations - - Progress tracking enables better UX - - Statistics help with performance tuning - - Critical for production monitoring - """ - operator = TokenAwareBulkOperator(mock_session) - progress_updates = [] - - def progress_callback(stats: BulkOperationStats): - progress_updates.append( - { - "rows": stats.rows_processed, - "ranges": stats.ranges_completed, - "progress": stats.progress_percentage, - } - ) - - # Mock setup - with patch( - "bulk_operations.bulk_operator.discover_token_ranges", new_callable=AsyncMock - ) as mock_discover: - from bulk_operations.token_utils import TokenRange - - mock_ranges = [ - TokenRange(start=0, end=1000, replicas=["node1"]), - TokenRange(start=1000, end=2000, replicas=["node2"]), - ] - mock_discover.return_value = mock_ranges - - mock_session.execute.side_effect = [ - Mock(one=Mock(return_value=Mock(count=500))), - Mock(one=Mock(return_value=Mock(count=300))), - ] - - # Execute with progress callback - await operator.count_by_token_ranges( - keyspace="test_ks", - table="test_table", - split_count=2, - progress_callback=progress_callback, - ) - - assert len(progress_updates) >= 2 - # Check final progress - final_update = progress_updates[-1] - assert final_update["ranges"] == 2 - assert final_update["progress"] == 100.0 - - @pytest.mark.unit - async def test_operation_stats(self, mock_session): - """ - Test operation statistics collection. - - What this tests: - --------------- - 1. Statistics are collected during operations - 2. Duration is calculated correctly - 3. Rows per second metric is accurate - 4. 
All statistics fields are populated - - Why this matters: - ---------------- - - Performance metrics guide optimization - - Statistics enable capacity planning - - Benchmarking requires accurate metrics - - Production monitoring depends on these stats - """ - operator = TokenAwareBulkOperator(mock_session) - - with patch( - "bulk_operations.bulk_operator.discover_token_ranges", new_callable=AsyncMock - ) as mock_discover: - from bulk_operations.token_utils import TokenRange - - mock_ranges = [TokenRange(start=0, end=1000, replicas=["node1"])] - mock_discover.return_value = mock_ranges - - # Mock returns the same value for all calls (it's a single range) - mock_count_result = Mock() - mock_count_result.one.return_value = Mock(count=1000) - mock_session.execute.return_value = mock_count_result - - # Get stats after operation - count, stats = await operator.count_by_token_ranges_with_stats( - keyspace="test_ks", table="test_table", split_count=1 - ) - - assert count == 1000 - assert stats.rows_processed == 1000 - assert stats.ranges_completed == 1 - assert stats.duration_seconds > 0 - assert stats.rows_per_second > 0 diff --git a/libs/async-cassandra-bulk/examples/tests/unit/test_csv_exporter.py b/libs/async-cassandra-bulk/examples/tests/unit/test_csv_exporter.py deleted file mode 100644 index 9f17fff..0000000 --- a/libs/async-cassandra-bulk/examples/tests/unit/test_csv_exporter.py +++ /dev/null @@ -1,365 +0,0 @@ -"""Unit tests for CSV exporter. - -What this tests: ---------------- -1. CSV header generation -2. Row serialization with different data types -3. NULL value handling -4. Collection serialization -5. Compression support -6. Progress tracking - -Why this matters: ----------------- -- CSV is a common export format -- Data type handling must be consistent -- Resume capability is critical for large exports -- Compression saves disk space -""" - -import csv -import gzip -import io -import uuid -from datetime import datetime -from unittest.mock import Mock - -import pytest - -from bulk_operations.bulk_operator import TokenAwareBulkOperator -from bulk_operations.exporters import CSVExporter, ExportFormat, ExportProgress - - -class MockRow: - """Mock Cassandra row object.""" - - def __init__(self, **kwargs): - self._fields = list(kwargs.keys()) - for key, value in kwargs.items(): - setattr(self, key, value) - - -class TestCSVExporter: - """Test CSV export functionality.""" - - @pytest.fixture - def mock_operator(self): - """Create mock bulk operator.""" - operator = Mock(spec=TokenAwareBulkOperator) - operator.session = Mock() - operator.session._session = Mock() - operator.session._session.cluster = Mock() - operator.session._session.cluster.metadata = Mock() - return operator - - @pytest.fixture - def exporter(self, mock_operator): - """Create CSV exporter instance.""" - return CSVExporter(mock_operator) - - def test_csv_value_serialization(self, exporter): - """ - Test serialization of different value types to CSV. - - What this tests: - --------------- - 1. NULL values become empty strings - 2. Booleans become true/false - 3. Collections get formatted properly - 4. Bytes are hex encoded - 5. 
Timestamps use ISO format - - Why this matters: - ---------------- - - CSV needs consistent string representation - - Must be reversible for imports - - Standard tools should understand the format - """ - # NULL handling - assert exporter._serialize_csv_value(None) == "" - - # Primitives - assert exporter._serialize_csv_value(True) == "true" - assert exporter._serialize_csv_value(False) == "false" - assert exporter._serialize_csv_value(42) == "42" - assert exporter._serialize_csv_value(3.14) == "3.14" - assert exporter._serialize_csv_value("test") == "test" - - # UUID - test_uuid = uuid.uuid4() - assert exporter._serialize_csv_value(test_uuid) == str(test_uuid) - - # Datetime - test_dt = datetime(2024, 1, 1, 12, 0, 0) - assert exporter._serialize_csv_value(test_dt) == "2024-01-01T12:00:00" - - # Collections - assert exporter._serialize_csv_value([1, 2, 3]) == "[1, 2, 3]" - assert exporter._serialize_csv_value({"a", "b"}) == "[a, b]" or "[b, a]" - assert exporter._serialize_csv_value({"k1": "v1", "k2": "v2"}) in [ - "{k1: v1, k2: v2}", - "{k2: v2, k1: v1}", - ] - - # Bytes - assert exporter._serialize_csv_value(b"\x00\x01\x02") == "000102" - - def test_null_string_customization(self, mock_operator): - """ - Test custom NULL string representation. - - What this tests: - --------------- - 1. Default empty string for NULL - 2. Custom NULL strings like "NULL" or "\\N" - 3. Consistent handling across all types - - Why this matters: - ---------------- - - Different tools expect different NULL representations - - PostgreSQL uses \\N, MySQL uses NULL - - Must be configurable for compatibility - """ - # Default exporter uses empty string - default_exporter = CSVExporter(mock_operator) - assert default_exporter._serialize_csv_value(None) == "" - - # Custom NULL string - custom_exporter = CSVExporter(mock_operator, null_string="NULL") - assert custom_exporter._serialize_csv_value(None) == "NULL" - - # PostgreSQL style - pg_exporter = CSVExporter(mock_operator, null_string="\\N") - assert pg_exporter._serialize_csv_value(None) == "\\N" - - @pytest.mark.asyncio - async def test_write_header(self, exporter): - """ - Test CSV header writing. - - What this tests: - --------------- - 1. Header contains column names - 2. Proper delimiter usage - 3. Quoting when needed - - Why this matters: - ---------------- - - Headers enable column mapping - - Must match data row format - - Standard CSV compliance - """ - output = io.StringIO() - columns = ["id", "name", "created_at", "tags"] - - await exporter.write_header(output, columns) - output.seek(0) - - reader = csv.reader(output) - header = next(reader) - assert header == columns - - @pytest.mark.asyncio - async def test_write_row(self, exporter): - """ - Test writing data rows to CSV. - - What this tests: - --------------- - 1. Row data properly formatted - 2. Complex types serialized - 3. Byte count tracking - 4. 
Thread safety with lock - - Why this matters: - ---------------- - - Data integrity is critical - - Concurrent writes must be safe - - Progress tracking needs accurate bytes - """ - output = io.StringIO() - - # Create test row - row = MockRow( - id=1, - name="Test User", - active=True, - score=99.5, - tags=["tag1", "tag2"], - metadata={"key": "value"}, - created_at=datetime(2024, 1, 1, 12, 0, 0), - ) - - bytes_written = await exporter.write_row(output, row) - output.seek(0) - - # Verify output - reader = csv.reader(output) - values = next(reader) - - assert values[0] == "1" - assert values[1] == "Test User" - assert values[2] == "true" - assert values[3] == "99.5" - assert values[4] == "[tag1, tag2]" - assert values[5] == "{key: value}" - assert values[6] == "2024-01-01T12:00:00" - - # Verify byte count - assert bytes_written > 0 - - @pytest.mark.asyncio - async def test_export_with_compression(self, mock_operator, tmp_path): - """ - Test CSV export with compression. - - What this tests: - --------------- - 1. Gzip compression works - 2. File has correct extension - 3. Compressed data is valid - - Why this matters: - ---------------- - - Large exports need compression - - Must work with standard tools - - File naming conventions matter - """ - exporter = CSVExporter(mock_operator, compression="gzip") - output_path = tmp_path / "test.csv" - - # Mock the export stream - test_rows = [ - MockRow(id=1, name="Alice", score=95.5), - MockRow(id=2, name="Bob", score=87.3), - ] - - async def mock_export(*args, **kwargs): - for row in test_rows: - yield row - - mock_operator.export_by_token_ranges = mock_export - - # Mock metadata - mock_keyspace = Mock() - mock_table = Mock() - mock_table.columns = {"id": None, "name": None, "score": None} - mock_keyspace.tables = {"test_table": mock_table} - mock_operator.session._session.cluster.metadata.keyspaces = {"test_ks": mock_keyspace} - - # Export - await exporter.export( - keyspace="test_ks", - table="test_table", - output_path=output_path, - ) - - # Verify compressed file exists - compressed_path = output_path.with_suffix(".csv.gzip") - assert compressed_path.exists() - - # Verify content - with gzip.open(compressed_path, "rt") as f: - reader = csv.reader(f) - header = next(reader) - assert header == ["id", "name", "score"] - - row1 = next(reader) - assert row1 == ["1", "Alice", "95.5"] - - row2 = next(reader) - assert row2 == ["2", "Bob", "87.3"] - - @pytest.mark.asyncio - async def test_export_progress_tracking(self, mock_operator, tmp_path): - """ - Test progress tracking during export. - - What this tests: - --------------- - 1. Progress initialized correctly - 2. Row count tracked - 3. Progress saved to file - 4. 
Completion marked - - Why this matters: - ---------------- - - Long exports need monitoring - - Resume capability requires state - - Users need feedback - """ - exporter = CSVExporter(mock_operator) - output_path = tmp_path / "test.csv" - - # Mock export - test_rows = [MockRow(id=i, value=f"test{i}") for i in range(100)] - - async def mock_export(*args, **kwargs): - for row in test_rows: - yield row - - mock_operator.export_by_token_ranges = mock_export - - # Mock metadata - mock_keyspace = Mock() - mock_table = Mock() - mock_table.columns = {"id": None, "value": None} - mock_keyspace.tables = {"test_table": mock_table} - mock_operator.session._session.cluster.metadata.keyspaces = {"test_ks": mock_keyspace} - - # Track progress callbacks - progress_updates = [] - - async def progress_callback(progress): - progress_updates.append(progress.rows_exported) - - # Export - progress = await exporter.export( - keyspace="test_ks", - table="test_table", - output_path=output_path, - progress_callback=progress_callback, - ) - - # Verify progress - assert progress.keyspace == "test_ks" - assert progress.table == "test_table" - assert progress.format == ExportFormat.CSV - assert progress.rows_exported == 100 - assert progress.completed_at is not None - - # Verify progress file - progress_file = output_path.with_suffix(".csv.progress") - assert progress_file.exists() - - # Load and verify - loaded_progress = ExportProgress.load(progress_file) - assert loaded_progress.rows_exported == 100 - - def test_custom_delimiter_and_quoting(self, mock_operator): - """ - Test custom CSV formatting options. - - What this tests: - --------------- - 1. Tab delimiter - 2. Pipe delimiter - 3. Different quoting styles - - Why this matters: - ---------------- - - Different systems expect different formats - - Must handle data with delimiters - - Flexibility for integration - """ - # Tab-delimited - tab_exporter = CSVExporter(mock_operator, delimiter="\t") - assert tab_exporter.delimiter == "\t" - - # Pipe-delimited - pipe_exporter = CSVExporter(mock_operator, delimiter="|") - assert pipe_exporter.delimiter == "|" - - # Quote all - quote_all_exporter = CSVExporter(mock_operator, quoting=csv.QUOTE_ALL) - assert quote_all_exporter.quoting == csv.QUOTE_ALL diff --git a/libs/async-cassandra-bulk/examples/tests/unit/test_helpers.py b/libs/async-cassandra-bulk/examples/tests/unit/test_helpers.py deleted file mode 100644 index 8f06738..0000000 --- a/libs/async-cassandra-bulk/examples/tests/unit/test_helpers.py +++ /dev/null @@ -1,19 +0,0 @@ -""" -Helper utilities for unit tests. -""" - - -class MockToken: - """Mock token that supports comparison for sorting.""" - - def __init__(self, value): - self.value = value - - def __lt__(self, other): - return self.value < other.value - - def __eq__(self, other): - return self.value == other.value - - def __repr__(self): - return f"MockToken({self.value})" diff --git a/libs/async-cassandra-bulk/examples/tests/unit/test_iceberg_catalog.py b/libs/async-cassandra-bulk/examples/tests/unit/test_iceberg_catalog.py deleted file mode 100644 index c19a2cf..0000000 --- a/libs/async-cassandra-bulk/examples/tests/unit/test_iceberg_catalog.py +++ /dev/null @@ -1,241 +0,0 @@ -"""Unit tests for Iceberg catalog configuration. - -What this tests: ---------------- -1. Filesystem catalog creation -2. Warehouse directory setup -3. Custom catalog configuration -4. 
Catalog loading - -Why this matters: ----------------- -- Catalog is the entry point to Iceberg -- Proper configuration is critical -- Warehouse location affects data storage -- Supports multiple catalog types -""" - -import tempfile -import unittest -from pathlib import Path -from unittest.mock import Mock, patch - -from pyiceberg.catalog import Catalog - -from bulk_operations.iceberg.catalog import create_filesystem_catalog, get_or_create_catalog - - -class TestIcebergCatalog(unittest.TestCase): - """Test Iceberg catalog configuration.""" - - def setUp(self): - """Set up test fixtures.""" - self.temp_dir = tempfile.mkdtemp() - self.warehouse_path = Path(self.temp_dir) / "test_warehouse" - - def tearDown(self): - """Clean up test fixtures.""" - import shutil - - shutil.rmtree(self.temp_dir, ignore_errors=True) - - def test_create_filesystem_catalog_default_path(self): - """ - Test creating filesystem catalog with default path. - - What this tests: - --------------- - 1. Default warehouse path is created - 2. Catalog is properly configured - 3. SQLite URI is correct - - Why this matters: - ---------------- - - Easy setup for development - - Consistent default behavior - - No external dependencies - """ - with patch("bulk_operations.iceberg.catalog.Path.cwd") as mock_cwd: - mock_cwd.return_value = Path(self.temp_dir) - - catalog = create_filesystem_catalog("test_catalog") - - # Check catalog properties - self.assertEqual(catalog.name, "test_catalog") - - # Check warehouse directory was created - expected_warehouse = Path(self.temp_dir) / "iceberg_warehouse" - self.assertTrue(expected_warehouse.exists()) - - def test_create_filesystem_catalog_custom_path(self): - """ - Test creating filesystem catalog with custom path. - - What this tests: - --------------- - 1. Custom warehouse path is used - 2. Directory is created if missing - 3. Path objects are handled - - Why this matters: - ---------------- - - Flexibility in storage location - - Integration with existing infrastructure - - Path handling consistency - """ - catalog = create_filesystem_catalog( - name="custom_catalog", warehouse_path=self.warehouse_path - ) - - # Check catalog name - self.assertEqual(catalog.name, "custom_catalog") - - # Check warehouse directory exists - self.assertTrue(self.warehouse_path.exists()) - self.assertTrue(self.warehouse_path.is_dir()) - - def test_create_filesystem_catalog_string_path(self): - """ - Test creating catalog with string path. - - What this tests: - --------------- - 1. String paths are converted to Path objects - 2. Catalog works with string paths - - Why this matters: - ---------------- - - API flexibility - - Backward compatibility - - User convenience - """ - str_path = str(self.warehouse_path) - catalog = create_filesystem_catalog(name="string_path_catalog", warehouse_path=str_path) - - self.assertEqual(catalog.name, "string_path_catalog") - self.assertTrue(Path(str_path).exists()) - - def test_get_or_create_catalog_default(self): - """ - Test get_or_create_catalog with defaults. - - What this tests: - --------------- - 1. Default filesystem catalog is created - 2. 
Same parameters as create_filesystem_catalog - - Why this matters: - ---------------- - - Simplified API for common case - - Consistent behavior - """ - with patch("bulk_operations.iceberg.catalog.create_filesystem_catalog") as mock_create: - mock_catalog = Mock(spec=Catalog) - mock_create.return_value = mock_catalog - - result = get_or_create_catalog( - catalog_name="default_test", warehouse_path=self.warehouse_path - ) - - # Verify create_filesystem_catalog was called - mock_create.assert_called_once_with("default_test", self.warehouse_path) - self.assertEqual(result, mock_catalog) - - def test_get_or_create_catalog_custom_config(self): - """ - Test get_or_create_catalog with custom configuration. - - What this tests: - --------------- - 1. Custom config overrides defaults - 2. load_catalog is used for custom configs - - Why this matters: - ---------------- - - Support for different catalog types - - Flexibility for production deployments - - Integration with existing catalogs - """ - custom_config = { - "type": "rest", - "uri": "https://iceberg-catalog.example.com", - "credential": "token123", - } - - with patch("bulk_operations.iceberg.catalog.load_catalog") as mock_load: - mock_catalog = Mock(spec=Catalog) - mock_load.return_value = mock_catalog - - result = get_or_create_catalog(catalog_name="rest_catalog", config=custom_config) - - # Verify load_catalog was called with custom config - mock_load.assert_called_once_with("rest_catalog", **custom_config) - self.assertEqual(result, mock_catalog) - - def test_warehouse_directory_creation(self): - """ - Test that warehouse directory is created with proper permissions. - - What this tests: - --------------- - 1. Directory is created if missing - 2. Parent directories are created - 3. Existing directories are not affected - - Why this matters: - ---------------- - - Data needs a place to live - - Permissions affect data security - - Idempotent operation - """ - nested_path = self.warehouse_path / "nested" / "warehouse" - - # Ensure it doesn't exist - self.assertFalse(nested_path.exists()) - - # Create catalog - create_filesystem_catalog(name="nested_test", warehouse_path=nested_path) - - # Check all directories were created - self.assertTrue(nested_path.exists()) - self.assertTrue(nested_path.is_dir()) - self.assertTrue(nested_path.parent.exists()) - - # Create again - should not fail - create_filesystem_catalog(name="nested_test2", warehouse_path=nested_path) - self.assertTrue(nested_path.exists()) - - def test_catalog_properties(self): - """ - Test that catalog has expected properties. - - What this tests: - --------------- - 1. Catalog type is set correctly - 2. Warehouse location is set - 3. 
URI format is correct - - Why this matters: - ---------------- - - Properties affect catalog behavior - - Debugging and monitoring - - Integration requirements - """ - catalog = create_filesystem_catalog( - name="properties_test", warehouse_path=self.warehouse_path - ) - - # Check basic properties - self.assertEqual(catalog.name, "properties_test") - - # For SQL catalog, we'd check additional properties - # but they're not exposed in the base Catalog interface - - # Verify catalog can be used (basic smoke test) - # This would fail if catalog is misconfigured - namespaces = list(catalog.list_namespaces()) - self.assertIsInstance(namespaces, list) - - -if __name__ == "__main__": - unittest.main() diff --git a/libs/async-cassandra-bulk/examples/tests/unit/test_iceberg_schema_mapper.py b/libs/async-cassandra-bulk/examples/tests/unit/test_iceberg_schema_mapper.py deleted file mode 100644 index 9acc402..0000000 --- a/libs/async-cassandra-bulk/examples/tests/unit/test_iceberg_schema_mapper.py +++ /dev/null @@ -1,362 +0,0 @@ -"""Unit tests for Cassandra to Iceberg schema mapping. - -What this tests: ---------------- -1. CQL type to Iceberg type conversions -2. Collection type handling (list, set, map) -3. Field ID assignment -4. Primary key handling (required vs nullable) - -Why this matters: ----------------- -- Schema mapping is critical for data integrity -- Type mismatches can cause data loss -- Field IDs enable schema evolution -- Nullability affects query semantics -""" - -import unittest -from unittest.mock import Mock - -from pyiceberg.types import ( - BinaryType, - BooleanType, - DateType, - DecimalType, - DoubleType, - FloatType, - IntegerType, - ListType, - LongType, - MapType, - StringType, - TimestamptzType, -) - -from bulk_operations.iceberg.schema_mapper import CassandraToIcebergSchemaMapper - - -class TestCassandraToIcebergSchemaMapper(unittest.TestCase): - """Test schema mapping from Cassandra to Iceberg.""" - - def setUp(self): - """Set up test fixtures.""" - self.mapper = CassandraToIcebergSchemaMapper() - - def test_simple_type_mappings(self): - """ - Test mapping of simple CQL types to Iceberg types. - - What this tests: - --------------- - 1. String types (text, ascii, varchar) - 2. Numeric types (int, bigint, float, double) - 3. Boolean type - 4. Binary type (blob) - - Why this matters: - ---------------- - - Ensures basic data types are preserved - - Critical for data integrity - - Foundation for complex types - """ - test_cases = [ - # String types - ("text", StringType), - ("ascii", StringType), - ("varchar", StringType), - # Integer types - ("tinyint", IntegerType), - ("smallint", IntegerType), - ("int", IntegerType), - ("bigint", LongType), - ("counter", LongType), - # Floating point - ("float", FloatType), - ("double", DoubleType), - # Other types - ("boolean", BooleanType), - ("blob", BinaryType), - ("date", DateType), - ("timestamp", TimestamptzType), - ("uuid", StringType), - ("timeuuid", StringType), - ("inet", StringType), - ] - - for cql_type, expected_type in test_cases: - with self.subTest(cql_type=cql_type): - result = self.mapper._map_cql_type(cql_type) - self.assertIsInstance(result, expected_type) - - def test_decimal_type_mapping(self): - """ - Test decimal and varint type mappings. - - What this tests: - --------------- - 1. Decimal type with default precision - 2. 
Varint as decimal with 0 scale - - Why this matters: - ---------------- - - Financial data requires exact decimal representation - - Varint needs appropriate precision - """ - # Decimal - decimal_type = self.mapper._map_cql_type("decimal") - self.assertIsInstance(decimal_type, DecimalType) - self.assertEqual(decimal_type.precision, 38) - self.assertEqual(decimal_type.scale, 10) - - # Varint (arbitrary precision integer) - varint_type = self.mapper._map_cql_type("varint") - self.assertIsInstance(varint_type, DecimalType) - self.assertEqual(varint_type.precision, 38) - self.assertEqual(varint_type.scale, 0) - - def test_collection_type_mappings(self): - """ - Test mapping of collection types. - - What this tests: - --------------- - 1. List type with element type - 2. Set type (becomes list in Iceberg) - 3. Map type with key and value types - - Why this matters: - ---------------- - - Collections are common in Cassandra - - Iceberg has no native set type - - Nested types need proper handling - """ - # List - list_type = self.mapper._map_cql_type("list") - self.assertIsInstance(list_type, ListType) - self.assertIsInstance(list_type.element_type, StringType) - self.assertFalse(list_type.element_required) - - # Set (becomes List in Iceberg) - set_type = self.mapper._map_cql_type("set") - self.assertIsInstance(set_type, ListType) - self.assertIsInstance(set_type.element_type, IntegerType) - - # Map - map_type = self.mapper._map_cql_type("map") - self.assertIsInstance(map_type, MapType) - self.assertIsInstance(map_type.key_type, StringType) - self.assertIsInstance(map_type.value_type, DoubleType) - self.assertFalse(map_type.value_required) - - def test_nested_collection_types(self): - """ - Test mapping of nested collection types. - - What this tests: - --------------- - 1. List> - 2. Map> - - Why this matters: - ---------------- - - Cassandra supports nested collections - - Complex data structures need proper mapping - """ - # List> - nested_list = self.mapper._map_cql_type("list>") - self.assertIsInstance(nested_list, ListType) - self.assertIsInstance(nested_list.element_type, ListType) - self.assertIsInstance(nested_list.element_type.element_type, IntegerType) - - # Map> - nested_map = self.mapper._map_cql_type("map>") - self.assertIsInstance(nested_map, MapType) - self.assertIsInstance(nested_map.key_type, StringType) - self.assertIsInstance(nested_map.value_type, ListType) - self.assertIsInstance(nested_map.value_type.element_type, DoubleType) - - def test_frozen_type_handling(self): - """ - Test handling of frozen collections. - - What this tests: - --------------- - 1. Frozen> - 2. Frozen types are unwrapped - - Why this matters: - ---------------- - - Frozen is a Cassandra concept not in Iceberg - - Inner type should be preserved - """ - frozen_list = self.mapper._map_cql_type("frozen>") - self.assertIsInstance(frozen_list, ListType) - self.assertIsInstance(frozen_list.element_type, StringType) - - def test_field_id_assignment(self): - """ - Test unique field ID assignment. - - What this tests: - --------------- - 1. Sequential field IDs - 2. Unique IDs for nested fields - 3. 
ID counter reset - - Why this matters: - ---------------- - - Field IDs enable schema evolution - - Must be unique within schema - - IDs are permanent for a field - """ - # Reset counter - self.mapper.reset_field_ids() - - # Create mock column metadata - col1 = Mock() - col1.cql_type = "text" - col1.is_primary_key = True - - col2 = Mock() - col2.cql_type = "int" - col2.is_primary_key = False - - col3 = Mock() - col3.cql_type = "list" - col3.is_primary_key = False - - # Map columns - field1 = self.mapper._map_column("id", col1) - field2 = self.mapper._map_column("value", col2) - field3 = self.mapper._map_column("tags", col3) - - # Check field IDs - self.assertEqual(field1.field_id, 1) - self.assertEqual(field2.field_id, 2) - self.assertEqual(field3.field_id, 4) # ID 3 was used for list element - - # List type should have element ID too - self.assertEqual(field3.field_type.element_id, 3) - - def test_primary_key_required_fields(self): - """ - Test that primary key columns are marked as required. - - What this tests: - --------------- - 1. Primary key columns are required (not null) - 2. Non-primary columns are nullable - - Why this matters: - ---------------- - - Primary keys cannot be null in Cassandra - - Affects Iceberg query semantics - - Important for data validation - """ - # Primary key column - pk_col = Mock() - pk_col.cql_type = "text" - pk_col.is_primary_key = True - - pk_field = self.mapper._map_column("id", pk_col) - self.assertTrue(pk_field.required) - - # Regular column - reg_col = Mock() - reg_col.cql_type = "text" - reg_col.is_primary_key = False - - reg_field = self.mapper._map_column("name", reg_col) - self.assertFalse(reg_field.required) - - def test_table_schema_mapping(self): - """ - Test mapping of complete table schema. - - What this tests: - --------------- - 1. Multiple columns mapped correctly - 2. Schema contains all fields - 3. Field order preserved - - Why this matters: - ---------------- - - Complete schema mapping is the main use case - - All columns must be included - - Order affects data files - """ - # Mock table metadata - table_meta = Mock() - - # Mock columns - id_col = Mock() - id_col.cql_type = "uuid" - id_col.is_primary_key = True - - name_col = Mock() - name_col.cql_type = "text" - name_col.is_primary_key = False - - tags_col = Mock() - tags_col.cql_type = "set" - tags_col.is_primary_key = False - - table_meta.columns = { - "id": id_col, - "name": name_col, - "tags": tags_col, - } - - # Map schema - schema = self.mapper.map_table_schema(table_meta) - - # Verify schema - self.assertEqual(len(schema.fields), 3) - - # Check field names and types - field_names = [f.name for f in schema.fields] - self.assertEqual(field_names, ["id", "name", "tags"]) - - # Check types - self.assertIsInstance(schema.fields[0].field_type, StringType) - self.assertIsInstance(schema.fields[1].field_type, StringType) - self.assertIsInstance(schema.fields[2].field_type, ListType) - - def test_unknown_type_fallback(self): - """ - Test that unknown types fall back to string. - - What this tests: - --------------- - 1. Unknown CQL types become strings - 2. No exceptions thrown - - Why this matters: - ---------------- - - Future Cassandra versions may add types - - Graceful degradation is better than failure - """ - unknown_type = self.mapper._map_cql_type("future_type") - self.assertIsInstance(unknown_type, StringType) - - def test_time_type_mapping(self): - """ - Test time type mapping. - - What this tests: - --------------- - 1. Time type maps to LongType - 2. 
Represents nanoseconds since midnight - - Why this matters: - ---------------- - - Time representation differs between systems - - Precision must be preserved - """ - time_type = self.mapper._map_cql_type("time") - self.assertIsInstance(time_type, LongType) - - -if __name__ == "__main__": - unittest.main() diff --git a/libs/async-cassandra-bulk/examples/tests/unit/test_token_ranges.py b/libs/async-cassandra-bulk/examples/tests/unit/test_token_ranges.py deleted file mode 100644 index 1949b0e..0000000 --- a/libs/async-cassandra-bulk/examples/tests/unit/test_token_ranges.py +++ /dev/null @@ -1,320 +0,0 @@ -""" -Unit tests for token range operations. - -What this tests: ---------------- -1. Token range calculation and splitting -2. Proportional distribution of ranges -3. Handling of ring wraparound -4. Replica awareness - -Why this matters: ----------------- -- Correct token ranges ensure complete data coverage -- Proportional splitting ensures balanced workload -- Proper handling prevents missing or duplicate data -- Replica awareness enables data locality - -Additional context: ---------------------------------- -Token ranges in Cassandra use Murmur3 hash with range: --9223372036854775808 to 9223372036854775807 -""" - -from unittest.mock import MagicMock, Mock - -import pytest - -from bulk_operations.token_utils import ( - TokenRange, - TokenRangeSplitter, - discover_token_ranges, - generate_token_range_query, -) - - -class TestTokenRange: - """Test TokenRange data class.""" - - @pytest.mark.unit - def test_token_range_creation(self): - """Test creating a token range.""" - range = TokenRange(start=-9223372036854775808, end=0, replicas=["node1", "node2", "node3"]) - - assert range.start == -9223372036854775808 - assert range.end == 0 - assert range.size == 9223372036854775808 - assert range.replicas == ["node1", "node2", "node3"] - assert 0.49 < range.fraction < 0.51 # About 50% of ring - - @pytest.mark.unit - def test_token_range_wraparound(self): - """Test token range that wraps around the ring.""" - # Range from positive to negative (wraps around) - range = TokenRange(start=9223372036854775800, end=-9223372036854775800, replicas=["node1"]) - - # Size calculation should handle wraparound - expected_size = 16 # Small range wrapping around - assert range.size == expected_size - assert range.fraction < 0.001 # Very small fraction of ring - - @pytest.mark.unit - def test_token_range_full_ring(self): - """Test token range covering entire ring.""" - range = TokenRange( - start=-9223372036854775808, - end=9223372036854775807, - replicas=["node1", "node2", "node3"], - ) - - assert range.size == 18446744073709551615 # 2^64 - 1 - assert range.fraction == 1.0 # 100% of ring - - -class TestTokenRangeSplitter: - """Test token range splitting logic.""" - - @pytest.mark.unit - def test_split_single_range_evenly(self): - """Test splitting a single range into equal parts.""" - splitter = TokenRangeSplitter() - original = TokenRange(start=0, end=1000, replicas=["node1", "node2"]) - - splits = splitter.split_single_range(original, 4) - - assert len(splits) == 4 - # Check splits are contiguous and cover entire range - assert splits[0].start == 0 - assert splits[0].end == 250 - assert splits[1].start == 250 - assert splits[1].end == 500 - assert splits[2].start == 500 - assert splits[2].end == 750 - assert splits[3].start == 750 - assert splits[3].end == 1000 - - # All splits should have same replicas - for split in splits: - assert split.replicas == ["node1", "node2"] - - @pytest.mark.unit - def 
test_split_proportionally(self): - """Test proportional splitting based on range sizes.""" - splitter = TokenRangeSplitter() - - # Create ranges of different sizes - ranges = [ - TokenRange(start=0, end=1000, replicas=["node1"]), # 10% of total - TokenRange(start=1000, end=9000, replicas=["node2"]), # 80% of total - TokenRange(start=9000, end=10000, replicas=["node3"]), # 10% of total - ] - - # Request 10 splits total - splits = splitter.split_proportionally(ranges, 10) - - # Should get approximately 1, 8, 1 splits for each range - node1_splits = [s for s in splits if s.replicas == ["node1"]] - node2_splits = [s for s in splits if s.replicas == ["node2"]] - node3_splits = [s for s in splits if s.replicas == ["node3"]] - - assert len(node1_splits) == 1 - assert len(node2_splits) == 8 - assert len(node3_splits) == 1 - assert len(splits) == 10 - - @pytest.mark.unit - def test_split_with_minimum_size(self): - """Test that small ranges don't get over-split.""" - splitter = TokenRangeSplitter() - - # Very small range - small_range = TokenRange(start=0, end=10, replicas=["node1"]) - - # Request many splits - splits = splitter.split_single_range(small_range, 100) - - # Should not create more splits than makes sense - # (implementation should have minimum split size) - assert len(splits) <= 10 # Assuming minimum split size of 1 - - @pytest.mark.unit - def test_cluster_by_replicas(self): - """Test clustering ranges by their replica sets.""" - splitter = TokenRangeSplitter() - - ranges = [ - TokenRange(start=0, end=100, replicas=["node1", "node2"]), - TokenRange(start=100, end=200, replicas=["node2", "node3"]), - TokenRange(start=200, end=300, replicas=["node1", "node2"]), - TokenRange(start=300, end=400, replicas=["node2", "node3"]), - ] - - clustered = splitter.cluster_by_replicas(ranges) - - # Should have 2 clusters based on replica sets - assert len(clustered) == 2 - - # Find clusters - cluster1 = None - cluster2 = None - for replicas, cluster_ranges in clustered.items(): - if set(replicas) == {"node1", "node2"}: - cluster1 = cluster_ranges - elif set(replicas) == {"node2", "node3"}: - cluster2 = cluster_ranges - - assert cluster1 is not None - assert cluster2 is not None - assert len(cluster1) == 2 - assert len(cluster2) == 2 - - -class TestTokenRangeDiscovery: - """Test discovering token ranges from cluster metadata.""" - - @pytest.mark.unit - async def test_discover_token_ranges(self): - """ - Test discovering token ranges from cluster metadata. - - What this tests: - --------------- - 1. Extraction from Cassandra metadata - 2. All token ranges are discovered - 3. Replica information is captured - 4. 
Async operation works correctly - - Why this matters: - ---------------- - - Must discover all ranges for completeness - - Replica info enables local processing - - Integration point with driver metadata - - Foundation of token-aware operations - """ - # Mock cluster metadata - mock_session = Mock() - mock_cluster = Mock() - mock_metadata = Mock() - mock_token_map = Mock() - - # Set up mock relationships - mock_session._session = Mock() - mock_session._session.cluster = mock_cluster - mock_cluster.metadata = mock_metadata - mock_metadata.token_map = mock_token_map - - # Mock tokens in the ring - from .test_helpers import MockToken - - mock_token1 = MockToken(-9223372036854775808) - mock_token2 = MockToken(0) - mock_token3 = MockToken(9223372036854775807) - mock_token_map.ring = [mock_token1, mock_token2, mock_token3] - - # Mock replicas - mock_token_map.get_replicas = MagicMock( - side_effect=[ - [Mock(address="127.0.0.1"), Mock(address="127.0.0.2")], - [Mock(address="127.0.0.2"), Mock(address="127.0.0.3")], - [Mock(address="127.0.0.3"), Mock(address="127.0.0.1")], # For wraparound - ] - ) - - # Discover ranges - ranges = await discover_token_ranges(mock_session, "test_keyspace") - - assert len(ranges) == 3 # Three tokens create three ranges - assert ranges[0].start == -9223372036854775808 - assert ranges[0].end == 0 - assert ranges[0].replicas == ["127.0.0.1", "127.0.0.2"] - assert ranges[1].start == 0 - assert ranges[1].end == 9223372036854775807 - assert ranges[1].replicas == ["127.0.0.2", "127.0.0.3"] - assert ranges[2].start == 9223372036854775807 - assert ranges[2].end == -9223372036854775808 # Wraparound - assert ranges[2].replicas == ["127.0.0.3", "127.0.0.1"] - - -class TestTokenRangeQueryGeneration: - """Test generating CQL queries with token ranges.""" - - @pytest.mark.unit - def test_generate_basic_token_range_query(self): - """ - Test generating a basic token range query. - - What this tests: - --------------- - 1. Valid CQL syntax generation - 2. Token function usage is correct - 3. Range boundaries use proper operators - 4. 
Fully qualified table names - - Why this matters: - ---------------- - - Query syntax must be valid CQL - - Token function enables range scans - - Boundary operators prevent gaps/overlaps - - Production queries depend on this - """ - range = TokenRange(start=0, end=1000, replicas=["node1"]) - - query = generate_token_range_query( - keyspace="test_ks", table="test_table", partition_keys=["id"], token_range=range - ) - - expected = "SELECT * FROM test_ks.test_table " "WHERE token(id) > 0 AND token(id) <= 1000" - assert query == expected - - @pytest.mark.unit - def test_generate_query_with_multiple_partition_keys(self): - """Test query generation with composite partition key.""" - range = TokenRange(start=-1000, end=1000, replicas=["node1"]) - - query = generate_token_range_query( - keyspace="test_ks", - table="test_table", - partition_keys=["country", "city"], - token_range=range, - ) - - expected = ( - "SELECT * FROM test_ks.test_table " - "WHERE token(country, city) > -1000 AND token(country, city) <= 1000" - ) - assert query == expected - - @pytest.mark.unit - def test_generate_query_with_column_selection(self): - """Test query generation with specific columns.""" - range = TokenRange(start=0, end=1000, replicas=["node1"]) - - query = generate_token_range_query( - keyspace="test_ks", - table="test_table", - partition_keys=["id"], - token_range=range, - columns=["id", "name", "created_at"], - ) - - expected = ( - "SELECT id, name, created_at FROM test_ks.test_table " - "WHERE token(id) > 0 AND token(id) <= 1000" - ) - assert query == expected - - @pytest.mark.unit - def test_generate_query_with_min_token(self): - """Test query generation starting from minimum token.""" - range = TokenRange(start=-9223372036854775808, end=0, replicas=["node1"]) # Min token - - query = generate_token_range_query( - keyspace="test_ks", table="test_table", partition_keys=["id"], token_range=range - ) - - # First range should use >= instead of > - expected = ( - "SELECT * FROM test_ks.test_table " - "WHERE token(id) >= -9223372036854775808 AND token(id) <= 0" - ) - assert query == expected diff --git a/libs/async-cassandra-bulk/examples/tests/unit/test_token_utils.py b/libs/async-cassandra-bulk/examples/tests/unit/test_token_utils.py deleted file mode 100644 index 8fe2de9..0000000 --- a/libs/async-cassandra-bulk/examples/tests/unit/test_token_utils.py +++ /dev/null @@ -1,388 +0,0 @@ -""" -Unit tests for token range utilities. - -What this tests: ---------------- -1. Token range size calculations -2. Range splitting logic -3. Wraparound handling -4. Proportional distribution -5. Replica clustering - -Why this matters: ----------------- -- Ensures data completeness -- Prevents missing rows -- Maintains proper load distribution -- Enables efficient parallel processing - -Additional context: ---------------------------------- -Token ranges in Cassandra use Murmur3 hash which -produces 128-bit values from -2^63 to 2^63-1. -""" - -from unittest.mock import Mock - -import pytest - -from bulk_operations.token_utils import ( - MAX_TOKEN, - MIN_TOKEN, - TOTAL_TOKEN_RANGE, - TokenRange, - TokenRangeSplitter, - discover_token_ranges, - generate_token_range_query, -) - - -class TestTokenRange: - """Test the TokenRange dataclass.""" - - @pytest.mark.unit - def test_token_range_size_normal(self): - """ - Test size calculation for normal ranges. - - What this tests: - --------------- - 1. Size calculation for positive ranges - 2. Size calculation for negative ranges - 3. Basic arithmetic correctness - 4. 
No wraparound edge cases - - Why this matters: - ---------------- - - Token range sizes determine split proportions - - Incorrect sizes lead to unbalanced loads - - Foundation for all range splitting logic - - Critical for even data distribution - """ - range = TokenRange(start=0, end=1000, replicas=["node1"]) - assert range.size == 1000 - - range = TokenRange(start=-1000, end=0, replicas=["node1"]) - assert range.size == 1000 - - @pytest.mark.unit - def test_token_range_size_wraparound(self): - """ - Test size calculation for ranges that wrap around. - - What this tests: - --------------- - 1. Wraparound from MAX_TOKEN to MIN_TOKEN - 2. Correct size calculation across boundaries - 3. Edge case handling for ring topology - 4. Boundary arithmetic correctness - - Why this matters: - ---------------- - - Cassandra's token ring wraps around - - Last range often crosses the boundary - - Incorrect handling causes missing data - - Real clusters always have wraparound ranges - """ - # Range wraps from near max to near min - range = TokenRange(start=MAX_TOKEN - 1000, end=MIN_TOKEN + 1000, replicas=["node1"]) - expected_size = 1000 + 1000 + 1 # 1000 on each side plus the boundary - assert range.size == expected_size - - @pytest.mark.unit - def test_token_range_fraction(self): - """Test fraction calculation.""" - # Quarter of the ring - quarter_size = TOTAL_TOKEN_RANGE // 4 - range = TokenRange(start=0, end=quarter_size, replicas=["node1"]) - assert abs(range.fraction - 0.25) < 0.001 - - -class TestTokenRangeSplitter: - """Test the TokenRangeSplitter class.""" - - @pytest.fixture - def splitter(self): - """Create a TokenRangeSplitter instance.""" - return TokenRangeSplitter() - - @pytest.mark.unit - def test_split_single_range_no_split(self, splitter): - """Test that requesting 1 or 0 splits returns original range.""" - range = TokenRange(start=0, end=1000, replicas=["node1"]) - - result = splitter.split_single_range(range, 1) - assert len(result) == 1 - assert result[0].start == 0 - assert result[0].end == 1000 - - @pytest.mark.unit - def test_split_single_range_even_split(self, splitter): - """Test splitting a range into even parts.""" - range = TokenRange(start=0, end=1000, replicas=["node1"]) - - result = splitter.split_single_range(range, 4) - assert len(result) == 4 - - # Check splits - assert result[0].start == 0 - assert result[0].end == 250 - assert result[1].start == 250 - assert result[1].end == 500 - assert result[2].start == 500 - assert result[2].end == 750 - assert result[3].start == 750 - assert result[3].end == 1000 - - @pytest.mark.unit - def test_split_single_range_small_range(self, splitter): - """Test that very small ranges aren't split.""" - range = TokenRange(start=0, end=2, replicas=["node1"]) - - result = splitter.split_single_range(range, 10) - assert len(result) == 1 # Too small to split - - @pytest.mark.unit - def test_split_proportionally_empty(self, splitter): - """Test proportional splitting with empty input.""" - result = splitter.split_proportionally([], 10) - assert result == [] - - @pytest.mark.unit - def test_split_proportionally_single_range(self, splitter): - """Test proportional splitting with single range.""" - ranges = [TokenRange(start=0, end=1000, replicas=["node1"])] - - result = splitter.split_proportionally(ranges, 4) - assert len(result) == 4 - - @pytest.mark.unit - def test_split_proportionally_multiple_ranges(self, splitter): - """ - Test proportional splitting with ranges of different sizes. - - What this tests: - --------------- - 1. 
Proportional distribution based on size - 2. Larger ranges get more splits - 3. Rounding behavior is reasonable - 4. All input ranges are covered - - Why this matters: - ---------------- - - Uneven token distribution is common - - Load balancing requires proportional splits - - Prevents hotspots in processing - - Mimics real cluster token distributions - """ - ranges = [ - TokenRange(start=0, end=1000, replicas=["node1"]), # Size 1000 - TokenRange(start=1000, end=4000, replicas=["node2"]), # Size 3000 - ] - - result = splitter.split_proportionally(ranges, 4) - - # Should split proportionally: 1 split for first, 3 for second - # But implementation uses round(), so might be slightly different - assert len(result) >= 2 - assert len(result) <= 4 - - @pytest.mark.unit - def test_cluster_by_replicas(self, splitter): - """ - Test clustering ranges by replica sets. - - What this tests: - --------------- - 1. Ranges are grouped by replica nodes - 2. Replica order doesn't affect grouping - 3. All ranges are included in clusters - 4. Unique replica sets are identified - - Why this matters: - ---------------- - - Enables coordinator-local processing - - Reduces network traffic in operations - - Improves performance through locality - - Critical for multi-datacenter efficiency - """ - ranges = [ - TokenRange(start=0, end=100, replicas=["node1", "node2"]), - TokenRange(start=100, end=200, replicas=["node2", "node3"]), - TokenRange(start=200, end=300, replicas=["node1", "node2"]), - TokenRange(start=300, end=400, replicas=["node3", "node1"]), - ] - - clusters = splitter.cluster_by_replicas(ranges) - - # Should have 3 unique replica sets - assert len(clusters) == 3 - - # Check that ranges are properly grouped - key1 = tuple(sorted(["node1", "node2"])) - assert key1 in clusters - assert len(clusters[key1]) == 2 - - -class TestDiscoverTokenRanges: - """Test token range discovery from cluster metadata.""" - - @pytest.mark.unit - async def test_discover_token_ranges_success(self): - """ - Test successful token range discovery. - - What this tests: - --------------- - 1. Token ranges are extracted from metadata - 2. Replica information is preserved - 3. All ranges from token map are returned - 4. 
Async operation completes successfully - - Why this matters: - ---------------- - - Discovery is the foundation of token-aware ops - - Replica awareness enables local reads - - Must handle all Cassandra metadata structures - - Critical for multi-datacenter deployments - """ - # Mock session and cluster - mock_session = Mock() - mock_cluster = Mock() - mock_metadata = Mock() - mock_token_map = Mock() - - # Setup tokens in the ring - from .test_helpers import MockToken - - mock_token1 = MockToken(-1000) - mock_token2 = MockToken(0) - mock_token3 = MockToken(1000) - mock_token_map.ring = [mock_token1, mock_token2, mock_token3] - - # Setup replicas - mock_replica1 = Mock() - mock_replica1.address = "192.168.1.1" - mock_replica2 = Mock() - mock_replica2.address = "192.168.1.2" - - mock_token_map.get_replicas.side_effect = [ - [mock_replica1, mock_replica2], - [mock_replica2, mock_replica1], - [mock_replica1, mock_replica2], # For the third token range - ] - - mock_metadata.token_map = mock_token_map - mock_cluster.metadata = mock_metadata - mock_session._session = Mock() - mock_session._session.cluster = mock_cluster - - # Test discovery - ranges = await discover_token_ranges(mock_session, "test_ks") - - assert len(ranges) == 3 # Three tokens create three ranges - assert ranges[0].start == -1000 - assert ranges[0].end == 0 - assert ranges[0].replicas == ["192.168.1.1", "192.168.1.2"] - assert ranges[1].start == 0 - assert ranges[1].end == 1000 - assert ranges[1].replicas == ["192.168.1.2", "192.168.1.1"] - assert ranges[2].start == 1000 - assert ranges[2].end == -1000 # Wraparound range - assert ranges[2].replicas == ["192.168.1.1", "192.168.1.2"] - - @pytest.mark.unit - async def test_discover_token_ranges_no_token_map(self): - """Test error when token map is not available.""" - mock_session = Mock() - mock_cluster = Mock() - mock_metadata = Mock() - mock_metadata.token_map = None - mock_cluster.metadata = mock_metadata - mock_session._session = Mock() - mock_session._session.cluster = mock_cluster - - with pytest.raises(RuntimeError, match="Token map not available"): - await discover_token_ranges(mock_session, "test_ks") - - -class TestGenerateTokenRangeQuery: - """Test CQL query generation for token ranges.""" - - @pytest.mark.unit - def test_generate_query_all_columns(self): - """Test query generation with all columns.""" - query = generate_token_range_query( - keyspace="test_ks", - table="test_table", - partition_keys=["id"], - token_range=TokenRange(start=0, end=1000, replicas=["node1"]), - ) - - expected = "SELECT * FROM test_ks.test_table " "WHERE token(id) > 0 AND token(id) <= 1000" - assert query == expected - - @pytest.mark.unit - def test_generate_query_specific_columns(self): - """Test query generation with specific columns.""" - query = generate_token_range_query( - keyspace="test_ks", - table="test_table", - partition_keys=["id"], - token_range=TokenRange(start=0, end=1000, replicas=["node1"]), - columns=["id", "name", "value"], - ) - - expected = ( - "SELECT id, name, value FROM test_ks.test_table " - "WHERE token(id) > 0 AND token(id) <= 1000" - ) - assert query == expected - - @pytest.mark.unit - def test_generate_query_minimum_token(self): - """ - Test query generation for minimum token edge case. - - What this tests: - --------------- - 1. MIN_TOKEN uses >= instead of > - 2. Prevents missing first token value - 3. Query syntax is valid CQL - 4. 
Edge case is handled correctly - - Why this matters: - ---------------- - - MIN_TOKEN is a valid token value - - Using > would skip data at MIN_TOKEN - - Common source of missing data bugs - - DSBulk compatibility requires this behavior - """ - query = generate_token_range_query( - keyspace="test_ks", - table="test_table", - partition_keys=["id"], - token_range=TokenRange(start=MIN_TOKEN, end=0, replicas=["node1"]), - ) - - expected = ( - f"SELECT * FROM test_ks.test_table " - f"WHERE token(id) >= {MIN_TOKEN} AND token(id) <= 0" - ) - assert query == expected - - @pytest.mark.unit - def test_generate_query_compound_partition_key(self): - """Test query generation with compound partition key.""" - query = generate_token_range_query( - keyspace="test_ks", - table="test_table", - partition_keys=["id", "type"], - token_range=TokenRange(start=0, end=1000, replicas=["node1"]), - ) - - expected = ( - "SELECT * FROM test_ks.test_table " - "WHERE token(id, type) > 0 AND token(id, type) <= 1000" - ) - assert query == expected diff --git a/libs/async-cassandra-bulk/examples/visualize_tokens.py b/libs/async-cassandra-bulk/examples/visualize_tokens.py deleted file mode 100755 index 98c1c25..0000000 --- a/libs/async-cassandra-bulk/examples/visualize_tokens.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python3 -""" -Visualize token distribution in the Cassandra cluster. - -This script helps understand how vnodes distribute tokens -across the cluster and validates our token range discovery. -""" - -import asyncio -from collections import defaultdict - -from rich.console import Console -from rich.table import Table - -from async_cassandra import AsyncCluster -from bulk_operations.token_utils import MAX_TOKEN, MIN_TOKEN, discover_token_ranges - -console = Console() - - -def analyze_node_distribution(ranges): - """Analyze and display token distribution by node.""" - primary_owner_count = defaultdict(int) - all_replica_count = defaultdict(int) - - for r in ranges: - # First replica is primary owner - if r.replicas: - primary_owner_count[r.replicas[0]] += 1 - for replica in r.replicas: - all_replica_count[replica] += 1 - - # Display node statistics - table = Table(title="Token Distribution by Node") - table.add_column("Node", style="cyan") - table.add_column("Primary Ranges", style="green") - table.add_column("Total Ranges (with replicas)", style="yellow") - table.add_column("Percentage of Ring", style="magenta") - - total_primary = sum(primary_owner_count.values()) - - for node in sorted(all_replica_count.keys()): - primary = primary_owner_count.get(node, 0) - total = all_replica_count.get(node, 0) - percentage = (primary / total_primary * 100) if total_primary > 0 else 0 - - table.add_row(node, str(primary), str(total), f"{percentage:.1f}%") - - console.print(table) - return primary_owner_count - - -def analyze_range_sizes(ranges): - """Analyze and display token range sizes.""" - console.print("\n[bold]Token Range Size Analysis[/bold]") - - range_sizes = [r.size for r in ranges] - avg_size = sum(range_sizes) / len(range_sizes) - min_size = min(range_sizes) - max_size = max(range_sizes) - - console.print(f"Average range size: {avg_size:,.0f}") - console.print(f"Smallest range: {min_size:,}") - console.print(f"Largest range: {max_size:,}") - console.print(f"Size ratio (max/min): {max_size/min_size:.2f}x") - - -def validate_ring_coverage(ranges): - """Validate token ring coverage for gaps.""" - console.print("\n[bold]Token Ring Coverage Validation[/bold]") - - sorted_ranges = sorted(ranges, key=lambda r: 
r.start) - - # Check for gaps - gaps = [] - for i in range(len(sorted_ranges) - 1): - current = sorted_ranges[i] - next_range = sorted_ranges[i + 1] - if current.end != next_range.start: - gaps.append((current.end, next_range.start)) - - if gaps: - console.print(f"[red]โš  Found {len(gaps)} gaps in token ring![/red]") - for gap_start, gap_end in gaps[:5]: # Show first 5 - console.print(f" Gap: {gap_start} to {gap_end}") - else: - console.print("[green]โœ“ No gaps found - complete ring coverage[/green]") - - # Check first and last ranges - if sorted_ranges[0].start == MIN_TOKEN: - console.print("[green]โœ“ First range starts at MIN_TOKEN[/green]") - else: - console.print(f"[red]โš  First range starts at {sorted_ranges[0].start}, not MIN_TOKEN[/red]") - - if sorted_ranges[-1].end == MAX_TOKEN: - console.print("[green]โœ“ Last range ends at MAX_TOKEN[/green]") - else: - console.print(f"[yellow]Last range ends at {sorted_ranges[-1].end}[/yellow]") - - return sorted_ranges - - -def display_sample_ranges(sorted_ranges): - """Display sample token ranges.""" - console.print("\n[bold]Sample Token Ranges (first 5)[/bold]") - sample_table = Table() - sample_table.add_column("Range #", style="cyan") - sample_table.add_column("Start", style="green") - sample_table.add_column("End", style="yellow") - sample_table.add_column("Size", style="magenta") - sample_table.add_column("Replicas", style="blue") - - for i, r in enumerate(sorted_ranges[:5]): - sample_table.add_row( - str(i + 1), str(r.start), str(r.end), f"{r.size:,}", ", ".join(r.replicas) - ) - - console.print(sample_table) - - -async def visualize_token_distribution(): - """Visualize how tokens are distributed across the cluster.""" - - console.print("[cyan]Connecting to Cassandra cluster...[/cyan]") - - async with AsyncCluster(contact_points=["localhost"]) as cluster, cluster.connect() as session: - # Create test keyspace if needed - await session.execute( - """ - CREATE KEYSPACE IF NOT EXISTS token_test - WITH replication = { - 'class': 'SimpleStrategy', - 'replication_factor': 3 - } - """ - ) - - console.print("[green]โœ“ Connected to cluster[/green]\n") - - # Discover token ranges - ranges = await discover_token_ranges(session, "token_test") - - # Analyze distribution - console.print("[bold]Token Range Analysis[/bold]") - console.print(f"Total ranges discovered: {len(ranges)}") - console.print("Expected with 3 nodes ร— 256 vnodes: ~768 ranges\n") - - # Analyze node distribution - primary_owner_count = analyze_node_distribution(ranges) - - # Analyze range sizes - analyze_range_sizes(ranges) - - # Validate ring coverage - sorted_ranges = validate_ring_coverage(ranges) - - # Display sample ranges - display_sample_ranges(sorted_ranges) - - # Vnode insight - console.print("\n[bold]Vnode Configuration Insight[/bold]") - console.print(f"With {len(primary_owner_count)} nodes and {len(ranges)} ranges:") - console.print(f"Average vnodes per node: {len(ranges) / len(primary_owner_count):.1f}") - console.print("This matches the expected 256 vnodes per node configuration.") - - -if __name__ == "__main__": - try: - asyncio.run(visualize_token_distribution()) - except KeyboardInterrupt: - console.print("\n[yellow]Visualization cancelled[/yellow]") - except Exception as e: - console.print(f"\n[red]Error: {e}[/red]") - import traceback - - traceback.print_exc() diff --git a/libs/async-cassandra/pyproject.toml b/libs/async-cassandra/pyproject.toml index d513837..4940021 100644 --- a/libs/async-cassandra/pyproject.toml +++ b/libs/async-cassandra/pyproject.toml 
@@ -62,6 +62,7 @@ test = [ "httpx>=0.24.0", "uvicorn>=0.23.0", "psutil>=5.9.0", + "pyarrow>=10.0.0", ] docs = [ "sphinx>=6.0.0", diff --git a/libs/async-cassandra/tests/integration/test_example_scripts.py b/libs/async-cassandra/tests/integration/test_example_scripts.py index 7ed2629..2b67a0f 100644 --- a/libs/async-cassandra/tests/integration/test_example_scripts.py +++ b/libs/async-cassandra/tests/integration/test_example_scripts.py @@ -91,13 +91,15 @@ async def test_streaming_basic_example(self, cassandra_cluster): # Verify expected output patterns # The examples use logging which outputs to stderr output = result.stderr if result.stderr else result.stdout - assert "Basic Streaming Example" in output + assert "BASIC STREAMING EXAMPLE" in output assert "Inserted 100000 test events" in output or "Inserted 100,000 test events" in output - assert "Streaming completed:" in output + assert "Streaming completed!" in output assert "Total events: 100,000" in output or "Total events: 100000" in output - assert "Filtered Streaming Example" in output - assert "Page-Based Streaming Example (True Async Paging)" in output - assert "Pages are fetched asynchronously" in output + assert "FILTERED STREAMING EXAMPLE" in output + assert "PAGE-BASED STREAMING EXAMPLE (True Async Paging)" in output + assert ( + "Pages are fetched ON-DEMAND" in output or "Pages were fetched asynchronously" in output + ) # Verify keyspace was cleaned up async with AsyncCluster(["localhost"]) as cluster: @@ -152,8 +154,8 @@ async def test_export_large_table_example(self, cassandra_cluster, tmp_path): # Verify expected output (might be in stdout or stderr due to logging) output = result.stdout + result.stderr - assert "Created 5000 sample products" in output - assert "Export completed:" in output + assert "Created 5,000 sample products" in output + assert "EXPORT COMPLETED SUCCESSFULLY!" in output assert "Rows exported: 5,000" in output assert f"Output directory: {export_dir}" in output @@ -235,16 +237,16 @@ async def test_context_manager_safety_demo(self, cassandra_cluster): # Verify all demonstrations ran (might be in stdout or stderr due to logging) output = result.stdout + result.stderr - assert "Demonstrating Query Error Safety" in output + assert "QUERY ERROR SAFETY DEMONSTRATION" in output assert "Query failed as expected" in output - assert "Session still works after error" in output + assert "Session is healthy!" 
in output - assert "Demonstrating Streaming Error Safety" in output + assert "STREAMING ERROR SAFETY DEMONSTRATION" in output assert "Streaming failed as expected" in output assert "Successfully streamed" in output - assert "Demonstrating Context Manager Isolation" in output - assert "Demonstrating Concurrent Safety" in output + assert "CONTEXT MANAGER ISOLATION DEMONSTRATION" in output + assert "CONCURRENT OPERATIONS SAFETY DEMONSTRATION" in output # Verify key takeaways are shown assert "Query errors don't close sessions" in output @@ -285,15 +287,19 @@ async def test_metrics_simple_example(self, cassandra_cluster): # Verify metrics output (might be in stdout or stderr due to logging) output = result.stdout + result.stderr - assert "Query Metrics Example" in output or "async-cassandra Metrics Example" in output - assert "Connection Health Monitoring" in output - assert "Error Tracking Example" in output or "Expected error recorded" in output - assert "Performance Summary" in output + assert "ASYNC-CASSANDRA METRICS COLLECTION EXAMPLE" in output + assert "CONNECTION HEALTH MONITORING" in output + assert "ERROR TRACKING DEMONSTRATION" in output or "Expected error captured" in output + assert "PERFORMANCE METRICS SUMMARY" in output # Verify statistics are shown assert "Total queries:" in output or "Query Metrics:" in output assert "Success rate:" in output or "Success Rate:" in output - assert "Average latency:" in output or "Average Duration:" in output + assert ( + "Average latency:" in output + or "Average Duration:" in output + or "Query Performance:" in output + ) @pytest.mark.timeout(240) # Override default timeout for this test (lots of data) async def test_realtime_processing_example(self, cassandra_cluster): @@ -333,15 +339,19 @@ async def test_realtime_processing_example(self, cassandra_cluster): output = result.stdout + result.stderr # Check that setup completed - assert "Setting up sensor data" in output - assert "Sample data inserted" in output + assert "Setting up IoT sensor data simulation" in output + assert "Sample data setup complete" in output # Check that processing occurred - assert "Processing Historical Data" in output or "Processing historical data" in output - assert "Processing completed" in output or "readings processed" in output + assert "PROCESSING HISTORICAL DATA" in output or "Processing Historical Data" in output + assert ( + "Processing completed" in output + or "readings processed" in output + or "Analysis complete!" in output + ) # Check that real-time simulation ran - assert "Simulating Real-Time Processing" in output or "Processing cycle" in output + assert "SIMULATING REAL-TIME PROCESSING" in output or "Processing cycle" in output # Verify cleanup assert "Cleaning up" in output @@ -436,11 +446,12 @@ async def test_export_to_parquet_example(self, cassandra_cluster, tmp_path): output = result.stderr if result.stderr else result.stdout assert "Setting up test data" in output assert "Test data setup complete" in output - assert "Example 1: Export Entire Table" in output - assert "Example 2: Export Filtered Data" in output - assert "Example 3: Export with Different Compression" in output - assert "Export completed successfully!" 
in output - assert "Verifying Exported Files" in output + assert "EXPORT SUMMARY" in output + assert "SNAPPY compression:" in output + assert "GZIP compression:" in output + assert "LZ4 compression:" in output + assert "Three exports completed:" in output + assert "VERIFYING EXPORTED PARQUET FILES" in output assert f"Output directory: {export_dir}" in output # Verify Parquet files were created (look recursively in subdirectories)
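
The `pyarrow>=10.0.0` test dependency added to `pyproject.toml` above is what lets this example test open the exported files directly rather than only matching log output, and Parquet exporters commonly split their output into per-compression or per-partition subdirectories, which is why the final check has to look recursively. A minimal sketch of what such a verification might look like, assuming pyarrow and a pathlib-based layout (the helper name, arguments, and column set below are illustrative, not taken from the actual test file):

```python
# Hypothetical helper: recursively locate and validate exported Parquet files.
from pathlib import Path

import pyarrow.parquet as pq


def verify_parquet_outputs(export_dir: Path, expected_columns: set[str]) -> int:
    """Return the total row count across every Parquet file under export_dir."""
    parquet_files = sorted(export_dir.rglob("*.parquet"))
    assert parquet_files, f"No Parquet files found under {export_dir}"

    total_rows = 0
    for path in parquet_files:
        table = pq.read_table(path)  # reading also validates the Parquet footer
        assert expected_columns <= set(table.column_names), f"Missing columns in {path}"
        total_rows += table.num_rows
    return total_rows


# A test could then assert on the aggregate, for example:
# assert verify_parquet_outputs(export_dir, {"id", "name"}) > 0
```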