Skip to content

Commit 58c4936

Browse files
author
Bob Strahan
committed
Fix S3 vectors Knowledge Base creation with correct API parameters and bucket name validation
1 parent 68fd8ac commit 58c4936

File tree

2 files changed

+274
-11
lines changed

2 files changed

+274
-11
lines changed

options/bedrockkb/src/s3_vectors_manager/handler.py

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -250,17 +250,18 @@ def create_knowledge_base_s3_vectors(bedrock_agent_client, name, description, ro
250250
"""Create Knowledge Base with S3 Vectors using Bedrock Agent API."""
251251
try:
252252
logger.info(f"Creating Knowledge Base: {name} with S3 Vectors")
253+
logger.info(f"Using bucket ARN: {bucket_arn}, index name: {index_name}")
253254

254-
# Build S3 Vectors configuration
255+
# Build S3 Vectors configuration - use correct parameter names per API documentation
255256
s3_vectors_config = {
256-
'bucketArn': bucket_arn,
257+
'vectorBucketArn': bucket_arn, # Correct parameter name
257258
'indexName': index_name
258259
}
259260

260-
# Add KMS encryption if provided
261+
# Note: KMS encryption is handled at the S3 vector bucket level, not in Knowledge Base config
262+
# The error showed that kmsKeyArn is not supported in s3VectorsConfiguration
261263
if kms_key_arn:
262-
s3_vectors_config['kmsKeyArn'] = kms_key_arn
263-
logger.info(f"Using KMS encryption with key: {kms_key_arn}")
264+
logger.info(f"KMS encryption configured at bucket level with key: {kms_key_arn}")
264265

265266
response = bedrock_agent_client.create_knowledge_base(
266267
name=name,
@@ -324,6 +325,9 @@ def get_knowledge_base_info(bedrock_agent_client, kb_id):
324325
def create_s3_vector_resources(s3vectors_client, bucket_name, index_name, embedding_model, kms_key_arn=None):
325326
"""Create S3 Vector bucket with optional KMS encryption. Bedrock will manage the index automatically."""
326327
try:
328+
# Get region from client for ARN construction
329+
region = s3vectors_client.meta.region_name
330+
327331
# Create vector bucket with optional encryption
328332
logger.info(f"Creating vector bucket: {bucket_name}")
329333

@@ -342,22 +346,36 @@ def create_s3_vector_resources(s3vectors_client, bucket_name, index_name, embedd
342346
bucket_response = s3vectors_client.create_vector_bucket(**create_bucket_params)
343347
logger.info(f"Created vector bucket: {bucket_name}")
344348

345-
# Get bucket ARN from response or construct it
349+
# Get bucket ARN from response or construct it properly using SANITIZED bucket name
346350
bucket_arn = bucket_response.get('BucketArn')
347351
if not bucket_arn:
348-
# Construct bucket ARN if not returned
349-
bucket_arn = f"arn:aws:s3vectors:{s3vectors_client.meta.region_name}:{s3vectors_client.meta.service_model.metadata['signingName']}:bucket/{bucket_name}"
352+
# Construct bucket ARN using proper format with account ID and SANITIZED bucket name
353+
try:
354+
# Get account ID for ARN construction
355+
sts_client = boto3.client('sts', region_name=region)
356+
account_id = sts_client.get_caller_identity()['Account']
357+
bucket_arn = f"arn:aws:s3vectors:{region}:{account_id}:bucket/{bucket_name}"
358+
logger.info(f"Constructed bucket ARN with sanitized name: {bucket_arn}")
359+
except Exception as arn_error:
360+
logger.error(f"Could not construct bucket ARN: {arn_error}")
361+
raise arn_error
350362

351363
logger.info(f"Vector bucket ARN: {bucket_arn}")
364+
logger.info(f"Sanitized bucket name used consistently: {bucket_name}")
365+
366+
# Validate bucket name one more time before returning
367+
if not is_valid_s3_bucket_name(bucket_name):
368+
raise ValueError(f"Sanitized bucket name is still invalid: {bucket_name}")
352369

353370
# Note: For Bedrock Knowledge Base integration, Bedrock will automatically
354371
# create and manage the vector index when the Knowledge Base is created.
355372
# We don't create the index manually here.
356373

357374
return {
358-
'BucketName': bucket_name,
359-
'BucketArn': bucket_arn,
360-
'IndexName': index_name, # This will be used by Bedrock
375+
'BucketName': bucket_name, # Return sanitized name
376+
'BucketArn': bucket_arn, # Return proper ARN with sanitized name
377+
'SanitizedBucketName': bucket_name, # Explicit sanitized name for KB creation
378+
'IndexName': index_name, # This will be used by Bedrock
361379
'Status': 'Created'
362380
}
363381

@@ -371,6 +389,18 @@ def create_s3_vector_resources(s3vectors_client, bucket_name, index_name, embedd
371389
raise
372390

373391

392+
def is_valid_s3_bucket_name(bucket_name):
393+
"""Validate that bucket name meets S3 requirements."""
394+
import re
395+
if not bucket_name or len(bucket_name) < 3 or len(bucket_name) > 63:
396+
return False
397+
if not re.match(r'^[a-z0-9][a-z0-9\-]*[a-z0-9]$', bucket_name):
398+
return False
399+
if '--' in bucket_name: # No consecutive hyphens
400+
return False
401+
return True
402+
403+
374404
def delete_s3_vector_resources(s3vectors_client, bucket_name, index_name):
375405
"""Delete S3 Vector bucket. Note: Bedrock manages the index lifecycle."""
376406
try:
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
# S3 Vectors Implementation Comparison: PR #59 vs Our Implementation
2+
3+
## Executive Summary
4+
5+
Both implementations successfully add S3 Vectors support to the GenAI IDP Accelerator, but take fundamentally different architectural approaches. **PR #59** uses a separate template with manual vector index creation, while **our implementation** enhances the existing bedrockkb template with Bedrock-managed indices.
6+
7+
## Detailed Technical Comparison
8+
9+
### Architecture Approach
10+
11+
| Aspect | PR #59 Approach | Our Implementation | Winner |
12+
|--------|----------------|-------------------|---------|
13+
| **Template Strategy** | Separate `options/s3-vectors-kb/template.yaml` | Enhanced existing `options/bedrockkb/template.yaml` | **Ours** |
14+
| **Resource Management** | Separate stack for S3 Vectors | Conditional resources in single template | **Ours** |
15+
| **Vector Store Selection** | Deploy different templates | Single parameter choice | **Ours** |
16+
| **Maintenance Overhead** | Two templates to maintain | Single template with conditions | **Ours** |
17+
18+
### S3 Vectors Integration
19+
20+
| Aspect | PR #59 Approach | Our Implementation | Analysis |
21+
|--------|----------------|-------------------|----------|
22+
| **Index Creation** | Manual via `s3vectors.create_index()` | Bedrock-managed (automatic) | **Ours is more appropriate** |
23+
| **Index Parameters** | Hard-coded dimension=1024, cosine | Let Bedrock decide based on embedding model | **Ours is more flexible** |
24+
| **API Usage** | Low-level S3 Vectors operations | High-level Bedrock integration | **Ours is more appropriate** |
25+
| **Metadata Keys** | Manual configuration required | Bedrock handles automatically | **Ours is simpler** |
26+
27+
### Code Quality & Maintainability
28+
29+
| Aspect | PR #59 | Our Implementation | Winner |
30+
|--------|--------|-------------------|---------|
31+
| **Custom Client Wrapper** | Created `S3VectorsClient` wrapper class | Direct boto3 usage | **Ours (simpler)** |
32+
| **Error Handling** | Basic try/catch | Bulletproof multi-layer with PhysicalResourceId | **Ours** |
33+
| **CloudFormation Integration** | Basic cfnresponse | Comprehensive custom resource patterns | **Ours** |
34+
| **Bucket Name Handling** | No sanitization | Comprehensive sanitization logic | **Ours** |
35+
36+
### Configuration Management
37+
38+
| Aspect | PR #59 | Our Implementation | Analysis |
39+
|--------|--------|-------------------|----------|
40+
| **Config Integration** | Added `s3Vectors` section to all pattern configs | CloudFormation parameters only | **PR #59 more comprehensive** |
41+
| **UI Integration** | Added KnowledgeBaseStatus component | No UI changes | **PR #59 better** |
42+
| **Default Values** | Hard-coded in multiple places | Centralized in CloudFormation | **Ours cleaner** |
43+
44+
## Detailed Analysis
45+
46+
### PR #59 Strengths
47+
48+
1. **Comprehensive Configuration Integration**
49+
- Added `s3Vectors.filterableMetadataKeys` to all pattern configuration files
50+
- Provides granular control over metadata filtering
51+
- UI integration with status indicators and gating logic
52+
53+
2. **User Experience Enhancements**
54+
- Created `KnowledgeBaseStatus` UI component showing backend type
55+
- Added performance expectations in UI (2-10 second response times)
56+
- Enhanced documentation with backend comparison table
57+
58+
3. **Flexibility for Advanced Users**
59+
- Exposes vector dimension and distance metric configuration
60+
- Allows customization of non-filterable metadata keys
61+
- Provides fine-grained control over vector index creation
62+
63+
### PR #59 Weaknesses & Problems
64+
65+
1. **❌ CRITICAL: Wrong API Usage Pattern**
66+
```python
67+
# PR #59 approach - INCORRECT for Bedrock integration
68+
s3vectors_client.create_index(
69+
vector_bucket_name,
70+
vector_index_name,
71+
dimension=1024, # Hard-coded, should match embedding model
72+
distance_metric='cosine',
73+
non_filterable_metadata_keys=["text_content", "s3_uri"]
74+
)
75+
```
76+
**Problem**: This creates a low-level S3 Vectors index unsuitable for Bedrock Knowledge Base integration. Bedrock expects to manage the index automatically.
77+
78+
2. **❌ Architectural Complexity**
79+
- Maintains two separate templates (`bedrockkb` and `s3-vectors-kb`)
80+
- Requires users to choose different templates rather than parameters
81+
- Increases maintenance overhead and potential for drift
82+
83+
3. **❌ Hard-coded Parameters**
84+
- Vector dimension fixed at 1024 (should match embedding model)
85+
- Distance metric fixed at 'cosine' (should be configurable)
86+
- Embedding model assumptions built into index creation
87+
88+
4. **❌ CloudFormation Resource Management Issues**
89+
```python
90+
# Physical resource ID is not robust
91+
physical_resource_id = f"{props.get('VectorBucketName', 'unknown')}-{props.get('VectorIndexName', 'unknown')}"
92+
```
93+
**Problem**: Simple string concatenation doesn't follow CloudFormation best practices
94+
95+
5. **❌ Missing Enterprise Features**
96+
- No bucket name sanitization (will fail with uppercase stack names)
97+
- Basic exception handling (no bulletproof CloudFormation response guarantee)
98+
- Limited KMS encryption integration
99+
100+
### Our Implementation Strengths
101+
102+
1. **✅ Correct Bedrock Integration**
103+
```python
104+
# Our approach - CORRECT for Bedrock
105+
# Only create S3 vector bucket, let Bedrock manage the index
106+
s3vectors_client.create_vector_bucket(
107+
vectorBucketName=bucket_name,
108+
encryptionConfiguration={'sseType': 'aws:kms', 'kmsKeyArn': kms_key_arn}
109+
)
110+
```
111+
112+
2. **✅ Unified Architecture**
113+
- Single enhanced `bedrockkb` template supports both vector stores
114+
- Parameter-based selection rather than template selection
115+
- Consistent resource patterns and outputs
116+
117+
3. **✅ Enterprise Production Features**
118+
- Bulletproof exception handling with guaranteed CloudFormation responses
119+
- Robust PhysicalResourceId lifecycle management
120+
- Comprehensive bucket name sanitization
121+
- Complete KMS encryption integration
122+
123+
4. **✅ CloudFormation Best Practices**
124+
- Proper conditional resource creation
125+
- Unified resource reference system solving dependency issues
126+
- Clean parameter passing and validation
127+
128+
### Our Implementation Weaknesses
129+
130+
1. **❌ Limited Configuration Flexibility**
131+
- No exposure of advanced S3 Vectors parameters (dimension, distance metric)
132+
- No integration with pattern configuration files
133+
- Less granular control for advanced users
134+
135+
2. **❌ No UI Integration**
136+
- No status indicators for S3 Vectors backend
137+
- No user feedback about expected performance characteristics
138+
- Users must rely on CloudFormation outputs for status
139+
140+
3. **❌ Less User Documentation**
141+
- PR #59 has more comprehensive backend comparison documentation
142+
- Missing performance expectations and use case guidance in UI
143+
144+
## Critical Technical Issues in PR #59
145+
146+
### 1. **Incompatible Index Creation Approach**
147+
```python
148+
# PR #59 - WRONG for Bedrock Knowledge Base
149+
s3vectors_client.create_index(
150+
vector_bucket_name,
151+
vector_index_name,
152+
dimension=1024, # Fixed dimension
153+
distance_metric='cosine',
154+
metadataConfiguration={"nonFilterableMetadataKeys": non_filterable_keys}
155+
)
156+
157+
# Then tries to use indexArn in Knowledge Base creation
158+
storageConfiguration={
159+
'type': 'S3_VECTORS',
160+
's3VectorsConfiguration': {
161+
'indexArn': index_arn # This won't work with Bedrock KB
162+
}
163+
}
164+
```
165+
166+
**Problem**: This approach creates a standalone S3 Vectors index that's incompatible with Bedrock Knowledge Base requirements. Bedrock Knowledge Base expects to control the vector index lifecycle and configuration.
167+
168+
### 2. **Incorrect Knowledge Base Configuration**
169+
```python
170+
# PR #59 attempts this configuration - WILL FAIL
171+
's3VectorsConfiguration': {
172+
'indexArn': index_arn # Bedrock API doesn't accept indexArn
173+
}
174+
```
175+
176+
**Problem**: The Bedrock Agent API for Knowledge Base creation requires `vectorBucketArn` and `indexName`, not `indexArn`. This configuration will fail.
177+
178+
### 3. **Resource Lifecycle Issues**
179+
- Creates vector indices manually but doesn't properly manage their deletion
180+
- Knowledge Base and S3 Vectors resources have dependency mismatches
181+
- No proper cleanup order (should delete Knowledge Base before vector resources)
182+
183+
## Recommendations
184+
185+
### Hybrid Approach - Best of Both Worlds
186+
187+
1. **Keep Our Core Architecture** (Unified template, Bedrock-managed indices)
188+
2. **Add PR #59's User Experience Features**:
189+
- KnowledgeBaseStatus UI component
190+
- Enhanced documentation with backend comparison
191+
- Performance expectations in UI
192+
193+
3. **Add PR #59's Configuration Flexibility** (where appropriate):
194+
- Expose distance metric parameter (for bucket creation)
195+
- Allow custom filterable metadata keys
196+
- Integration with pattern configuration files
197+
198+
### Implementation Priority
199+
200+
**High Priority (Fix PR #59 Critical Issues):**
201+
- [ ] Fix index creation approach (remove manual index creation)
202+
- [ ] Correct Knowledge Base API parameters (vectorBucketArn vs indexArn)
203+
- [ ] Add bucket name sanitization
204+
- [ ] Improve exception handling
205+
206+
**Medium Priority (Enhance Our Implementation):**
207+
- [ ] Add KnowledgeBaseStatus UI component from PR #59
208+
- [ ] Integrate s3Vectors configuration into pattern configs
209+
- [ ] Expose advanced parameters (distance metric, dimension)
210+
211+
**Low Priority (Polish):**
212+
- [ ] Enhanced documentation with backend comparison
213+
- [ ] Performance expectations in UI
214+
- [ ] Advanced metadata key configuration
215+
216+
## Conclusion
217+
218+
**Our implementation is technically superior** for production use due to:
219+
- Correct Bedrock Knowledge Base integration approach
220+
- Enterprise-grade reliability and error handling
221+
- Proper CloudFormation resource management
222+
- Unified architecture reducing complexity
223+
224+
**PR #59 has better user experience** features:
225+
- Comprehensive UI integration
226+
- Enhanced documentation
227+
- Configuration flexibility
228+
229+
The **ideal solution** would combine our robust technical foundation with PR #59's user experience enhancements, while fixing PR #59's critical technical issues around index creation and API usage.
230+
231+
## Recommendation
232+
233+
**Do not merge PR #59 as-is** due to critical technical issues that will cause deployment failures. Instead, enhance our implementation with the user experience features from PR #59 after fixing the technical problems.

0 commit comments

Comments
 (0)