
Commit 86b90ce

Updated tests and enhanced kill caller with an offset
Signed-off-by: Varun Deep Saini <varun.23bcs10048@ms.sst.scaler.com>
1 parent baf371e commit 86b90ce

File tree

26 files changed (+672, -90 lines changed)
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
bundle:
  name: wal-chain-test

resources:
  jobs:
    # Linear chain: job_01 -> job_02 -> ... -> job_10
    # Execution order: job_01 first, job_10 last
    job_01:
      name: "job-01"
      description: "first in chain"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_02:
      name: "job-02"
      description: "depends on ${resources.jobs.job_01.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_03:
      name: "job-03"
      description: "depends on ${resources.jobs.job_02.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_04:
      name: "job-04"
      description: "depends on ${resources.jobs.job_03.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_05:
      name: "job-05"
      description: "depends on ${resources.jobs.job_04.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_06:
      name: "job-06"
      description: "depends on ${resources.jobs.job_05.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_07:
      name: "job-07"
      description: "depends on ${resources.jobs.job_06.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_08:
      name: "job-08"
      description: "depends on ${resources.jobs.job_07.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_09:
      name: "job-09"
      description: "depends on ${resources.jobs.job_08.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0
    job_10:
      name: "job-10"
      description: "depends on ${resources.jobs.job_09.id}"
      tasks:
        - task_key: "task"
          spark_python_task:
            python_file: ./test.py
          new_cluster:
            spark_version: 15.4.x-scala2.12
            node_type_id: i3.xlarge
            num_workers: 0

acceptance/bundle/deploy/wal/chain-10-jobs/out.test.toml

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default.
Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
=== First deploy (crashes on job_10) ===

>>> errcode [CLI] bundle deploy
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files...
Deploying resources...
[PROCESS_KILLED]

Exit code: [KILLED]

=== WAL content after crash ===
{"lineage":"[UUID]","serial": [SERIAL]}
{"k":"resources.jobs.job_01","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"first in chain","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-01","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]}}}
{"k":"resources.jobs.job_02","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-02","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_01","label":"${resources.jobs.job_01.id}"}]}}
{"k":"resources.jobs.job_03","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-03","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_02","label":"${resources.jobs.job_02.id}"}]}}
{"k":"resources.jobs.job_04","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-04","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_03","label":"${resources.jobs.job_03.id}"}]}}
{"k":"resources.jobs.job_05","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-05","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_04","label":"${resources.jobs.job_04.id}"}]}}
{"k":"resources.jobs.job_06","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-06","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_05","label":"${resources.jobs.job_05.id}"}]}}
{"k":"resources.jobs.job_07","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-07","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_06","label":"${resources.jobs.job_06.id}"}]}}
{"k":"resources.jobs.job_08","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-08","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_07","label":"${resources.jobs.job_07.id}"}]}}
{"k":"resources.jobs.job_09","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/state/metadata.json"},"description":"depends on 1001","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"job-09","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files/test.py"},"task_key":"task"}]},"depends_on":[{"node":"resources.jobs.job_08","label":"${resources.jobs.job_08.id}"}]}}

=== Number of jobs saved in WAL ===
9

=== Bundle summary (reads from WAL) ===
Name: wal-chain-test
Target: default
Workspace:
  User: [USERNAME]
  Path: /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default
Resources:
  Jobs:
    job_01:
      Name: job-01
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_02:
      Name: job-02
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_03:
      Name: job-03
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_04:
      Name: job-04
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_05:
      Name: job-05
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_06:
      Name: job-06
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_07:
      Name: job-07
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_08:
      Name: job-08
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_09:
      Name: job-09
      URL: [DATABRICKS_URL]/jobs/1001?o=[NUMID]
    job_10:
      Name: job-10
      URL: (not deployed)

=== Second deploy (recovery) ===

>>> [CLI] bundle deploy --force-lock
Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-chain-test/default/files...
Deploying resources...
Updating deployment state...
Deployment complete!

=== WAL after successful deploy ===
WAL deleted (expected)
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
echo "=== First deploy (crashes on job_10) ==="
trace errcode $CLI bundle deploy

echo ""
echo "=== WAL content after crash ==="
cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "No WAL file"

echo ""
echo "=== Number of jobs saved in WAL ==="
grep -c '"k":"resources.jobs' .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "0"

echo ""
echo "=== Bundle summary (reads from WAL) ==="
$CLI bundle summary

echo ""
echo "=== Second deploy (recovery) ==="
trace $CLI bundle deploy --force-lock

echo ""
echo "=== WAL after successful deploy ==="
cat .databricks/bundle/default/resources.json.wal 2>/dev/null || echo "WAL deleted (expected)"
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
# Placeholder for Spark task
print("Hello from test job")
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
# Linear chain: job_01 -> job_02 -> ... -> job_10
# Let first 9 jobs/create succeed, then kill on the 10th

[[Server]]
Pattern = "POST /api/2.2/jobs/create"
KillCallerOffset = 9
KillCaller = 1
Response.Body = '{"job_id": 1001}'

[[Server]]
Pattern = "POST /api/2.2/jobs/reset"
Response.Body = '{}'

[[Server]]
Pattern = "GET /api/2.2/jobs/get"
Response.Body = '{"job_id": 1001, "settings": {"name": "test-job"}}'

# Strip single-node cluster warnings
[[Repls]]
Old = '(?s)Warning: Single node cluster is not correctly configured.*?ResourceClass: SingleNode\n \n\n'
New = ''
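
The new KillCallerOffset knob used above is the headline of this commit: with KillCaller = 1 and KillCallerOffset = 9, the stub server lets the first nine POST /api/2.2/jobs/create calls succeed and kills the CLI process on the tenth, which is what leaves exactly nine jobs in the WAL. A minimal Go sketch of that counting semantics follows; it is an illustration under that assumption, not the acceptance harness's actual code, and killPolicy/shouldKill are hypothetical names.

package main

import (
    "fmt"
    "sync/atomic"
)

// killPolicy is a hypothetical model of KillCaller + KillCallerOffset:
// the first `offset` matching requests are answered normally, and the
// next matching request triggers the kill of the calling process.
type killPolicy struct {
    offset int64 // matching requests allowed to succeed before killing
    seen   int64 // matching requests observed so far
}

// shouldKill reports whether the current matching request is the one
// on which the caller gets killed.
func (p *killPolicy) shouldKill() bool {
    return atomic.AddInt64(&p.seen, 1) > p.offset
}

func main() {
    p := &killPolicy{offset: 9}
    for i := 1; i <= 10; i++ {
        if p.shouldKill() {
            fmt.Printf("jobs/create call %d: kill caller (simulated crash)\n", i)
        } else {
            fmt.Printf("jobs/create call %d: respond with job_id 1001\n", i)
        }
    }
}

Under this reading, jobs 01-09 land in the WAL and job_10 is left undeployed, matching the expected output above.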

acceptance/bundle/deploy/wal/crash-after-create/databricks.yml

Lines changed: 12 additions & 0 deletions
@@ -5,6 +5,7 @@ resources:
   jobs:
     job_a:
       name: "test-job-a"
+      description: "first job"
       tasks:
         - task_key: "task-a"
           spark_python_task:
@@ -13,3 +14,14 @@ resources:
             spark_version: 15.4.x-scala2.12
             node_type_id: i3.xlarge
             num_workers: 0
+    job_b:
+      name: "test-job-b"
+      description: "depends on ${resources.jobs.job_a.id}"
+      tasks:
+        - task_key: "task-b"
+          spark_python_task:
+            python_file: ./test.py
+          new_cluster:
+            spark_version: 15.4.x-scala2.12
+            node_type_id: i3.xlarge
+            num_workers: 0

acceptance/bundle/deploy/wal/crash-after-create/output.txt

Lines changed: 65 additions & 13 deletions
@@ -1,14 +1,66 @@
-=== Creating state directory ===
-=== Creating WAL file (simulating crash after job create) ===
-=== WAL content before deploy ===
-{"lineage":"test-lineage-123","serial": [SERIAL]}
-{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"name":"test-job-a"}}}
-=== Deploy (should recover from WAL) ===
-
->>> [CLI] bundle deploy
-Warning: Single node cluster is not correctly configured
-  at resources.jobs.job_a.tasks[0].new_cluster
-  in databricks.yml:13:13
+=== First deploy (crashes after job_a create, before job_b) ===
+
+>>> errcode [CLI] bundle deploy
+Warning: [SINGLE_NODE_WARNING]
+
+num_workers should be 0 only for single-node clusters. To create a
+valid single node cluster please ensure that the following properties
+are correctly set in the cluster specification:
+
+  spark_conf:
+    spark.databricks.cluster.profile: singleNode
+    spark.master: local[*]
+
+  custom_tags:
+    ResourceClass: SingleNode
+
+
+Warning: [SINGLE_NODE_WARNING]
+
+num_workers should be 0 only for single-node clusters. To create a
+valid single node cluster please ensure that the following properties
+are correctly set in the cluster specification:
+
+  spark_conf:
+    spark.databricks.cluster.profile: singleNode
+    spark.master: local[*]
+
+  custom_tags:
+    ResourceClass: SingleNode
+
+
+Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files...
+Deploying resources...
+[PROCESS_KILLED]
+
+Exit code: [KILLED]
+=== WAL should exist after crash ===
+WAL exists (expected)
+{"lineage":"[UUID]","serial": [SERIAL]}
+{"k":"resources.jobs.job_a","v":{"__id__": "[ID]","state":{"deployment":{"kind":"BUNDLE","metadata_file_path":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/state/metadata.json"},"description":"first job","edit_mode":"UI_LOCKED","format":"MULTI_TASK","max_concurrent_runs":1,"name":"test-job-a","queue":{"enabled":true},"tasks":[{"new_cluster":{"node_type_id":"[NODE_TYPE_ID]","num_workers":0,"spark_version":"15.4.x-scala2.12"},"spark_python_task":{"python_file":"/Workspace/Users/[USERNAME]/.bundle/wal-crash-test/default/files/test.py"},"task_key":"task-a"}]}}}
+=== State file after crash (should be empty) ===
+{
+  "serial": [SERIAL],
+  "state_keys": []
+}
+=== Second deploy (should recover from WAL and complete) ===
+
+>>> [CLI] bundle deploy --force-lock
+Warning: [SINGLE_NODE_WARNING]
+
+num_workers should be 0 only for single-node clusters. To create a
+valid single node cluster please ensure that the following properties
+are correctly set in the cluster specification:
+
+  spark_conf:
+    spark.databricks.cluster.profile: singleNode
+    spark.master: local[*]
+
+  custom_tags:
+    ResourceClass: SingleNode
+
+
+Warning: [SINGLE_NODE_WARNING]
 
 num_workers should be 0 only for single-node clusters. To create a
 valid single node cluster please ensure that the following properties
@@ -28,10 +80,10 @@ Updating deployment state...
 Deployment complete!
 === State file after recovery ===
 {
-  "lineage": "test-lineage-123",
   "serial": [SERIAL],
   "state_keys": [
-    "resources.jobs.job_a"
+    "resources.jobs.job_a",
+    "resources.jobs.job_b"
   ]
 }
 === WAL file after successful deploy ===

acceptance/bundle/deploy/wal/crash-after-create/script

Lines changed: 14 additions & 12 deletions
@@ -1,20 +1,22 @@
-echo "=== Creating state directory ==="
-mkdir -p .databricks/bundle/default
+echo "=== First deploy (crashes after job_a create, before job_b) ==="
+trace errcode $CLI bundle deploy
 
-echo "=== Creating WAL file (simulating crash after job create) ==="
-cat > .databricks/bundle/default/resources.json.wal << 'EOF'
-{"lineage":"test-lineage-123","serial":1}
-{"k":"resources.jobs.job_a","v":{"__id__":"1001","state":{"name":"test-job-a"}}}
-EOF
+echo "=== WAL should exist after crash ==="
+if [ -f ".databricks/bundle/default/resources.json.wal" ]; then
+  echo "WAL exists (expected)"
+  cat .databricks/bundle/default/resources.json.wal
+else
+  echo "WAL missing (unexpected)"
+fi
 
-echo "=== WAL content before deploy ==="
-cat .databricks/bundle/default/resources.json.wal
+echo "=== State file after crash (should be empty) ==="
+cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}'
 
-echo "=== Deploy (should recover from WAL) ==="
-trace $CLI bundle deploy
+echo "=== Second deploy (should recover from WAL and complete) ==="
+trace $CLI bundle deploy --force-lock
 
 echo "=== State file after recovery ==="
-cat .databricks/bundle/default/resources.json | jq -S '{lineage: .lineage, serial: .serial, state_keys: (.state | keys)}'
+cat .databricks/bundle/default/resources.json | jq -S '{serial: .serial, state_keys: (.state | keys)}'
 
 echo "=== WAL file after successful deploy ==="
 if [ -f ".databricks/bundle/default/resources.json.wal" ]; then