Skip to content

Commit 9b79f46

Browse files
CSTACKEX-18_2: adding VM snapshot logic
1 parent 3f0019a commit 9b79f46

File tree

4 files changed

+1228
-0
lines changed

4 files changed

+1228
-0
lines changed

plugins/storage/volume/ontap/pom.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,16 @@
8484
<artifactId>cloud-engine-storage-volume</artifactId>
8585
<version>${project.version}</version>
8686
</dependency>
87+
<dependency>
88+
<groupId>org.apache.cloudstack</groupId>
89+
<artifactId>cloud-engine-storage-snapshot</artifactId>
90+
<version>${project.version}</version>
91+
</dependency>
92+
<dependency>
93+
<groupId>org.apache.cloudstack</groupId>
94+
<artifactId>cloud-server</artifactId>
95+
<version>${project.version}</version>
96+
</dependency>
8797
<dependency>
8898
<groupId>io.swagger</groupId>
8999
<artifactId>swagger-annotations</artifactId>
Lines changed: 382 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,382 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.cloudstack.storage.vmsnapshot;
20+
21+
import java.util.ArrayList;
22+
import java.util.List;
23+
import java.util.Map;
24+
import java.util.concurrent.TimeUnit;
25+
26+
import javax.naming.ConfigurationException;
27+
28+
import org.apache.cloudstack.engine.subsystem.api.storage.SnapshotInfo;
29+
import org.apache.cloudstack.engine.subsystem.api.storage.StrategyPriority;
30+
import org.apache.cloudstack.engine.subsystem.api.storage.VMSnapshotOptions;
31+
import org.apache.cloudstack.engine.subsystem.api.storage.VolumeInfo;
32+
import org.apache.cloudstack.storage.datastore.db.StoragePoolVO;
33+
import org.apache.cloudstack.storage.to.VolumeObjectTO;
34+
import org.apache.cloudstack.storage.utils.Constants;
35+
import org.apache.commons.collections.CollectionUtils;
36+
import org.apache.logging.log4j.LogManager;
37+
import org.apache.logging.log4j.Logger;
38+
39+
import com.cloud.agent.api.CreateVMSnapshotAnswer;
40+
import com.cloud.agent.api.CreateVMSnapshotCommand;
41+
import com.cloud.agent.api.FreezeThawVMAnswer;
42+
import com.cloud.agent.api.FreezeThawVMCommand;
43+
import com.cloud.agent.api.VMSnapshotTO;
44+
import com.cloud.event.EventTypes;
45+
import com.cloud.exception.AgentUnavailableException;
46+
import com.cloud.exception.OperationTimedoutException;
47+
import com.cloud.hypervisor.Hypervisor;
48+
import com.cloud.storage.GuestOSVO;
49+
import com.cloud.storage.VolumeVO;
50+
import com.cloud.uservm.UserVm;
51+
import com.cloud.utils.exception.CloudRuntimeException;
52+
import com.cloud.utils.fsm.NoTransitionException;
53+
import com.cloud.vm.VirtualMachine;
54+
import com.cloud.vm.snapshot.VMSnapshot;
55+
import com.cloud.vm.snapshot.VMSnapshotDetailsVO;
56+
import com.cloud.vm.snapshot.VMSnapshotVO;
57+
58+
/**
59+
* VM Snapshot strategy for NetApp ONTAP managed storage.
60+
*
61+
* <p>This strategy handles VM-level (instance) snapshots for VMs whose volumes
62+
* reside on ONTAP managed primary storage using the NFS protocol. It uses the
63+
* QEMU guest agent to freeze/thaw the VM file systems for consistency, and
64+
* delegates per-volume snapshot creation to the existing CloudStack snapshot
65+
* framework which routes to {@code StorageSystemSnapshotStrategy} →
66+
* {@code OntapPrimaryDatastoreDriver.takeSnapshot()} (ONTAP file clone).</p>
67+
*
68+
* <h3>Flow:</h3>
69+
* <ol>
70+
* <li>Freeze the VM via QEMU guest agent ({@code fsfreeze})</li>
71+
* <li>For each attached volume, create a storage-level snapshot (ONTAP file clone)</li>
72+
* <li>Thaw the VM</li>
73+
* <li>Record VM snapshot ↔ volume snapshot mappings in {@code vm_snapshot_details}</li>
74+
* </ol>
75+
*
76+
* <h3>Strategy Selection:</h3>
77+
* <p>Returns {@code StrategyPriority.HIGHEST} when:</p>
78+
* <ul>
79+
* <li>Hypervisor is KVM</li>
80+
* <li>Snapshot type is Disk-only (no memory)</li>
81+
* <li>All VM volumes are on ONTAP managed NFS primary storage</li>
82+
* </ul>
83+
*/
84+
public class OntapVMSnapshotStrategy extends StorageVMSnapshotStrategy {
85+
86+
private static final Logger logger = LogManager.getLogger(OntapVMSnapshotStrategy.class);
87+
88+
@Override
89+
public boolean configure(String name, Map<String, Object> params) throws ConfigurationException {
90+
return super.configure(name, params);
91+
}
92+
93+
// ──────────────────────────────────────────────────────────────────────────
94+
// Strategy Selection
95+
// ──────────────────────────────────────────────────────────────────────────
96+
97+
@Override
98+
public StrategyPriority canHandle(VMSnapshot vmSnapshot) {
99+
VMSnapshotVO vmSnapshotVO = (VMSnapshotVO) vmSnapshot;
100+
101+
// For existing (non-Allocated) snapshots, check if we created them
102+
if (!VMSnapshot.State.Allocated.equals(vmSnapshotVO.getState())) {
103+
List<VMSnapshotDetailsVO> vmSnapshotDetails = vmSnapshotDetailsDao.findDetails(vmSnapshot.getId(), STORAGE_SNAPSHOT);
104+
if (CollectionUtils.isEmpty(vmSnapshotDetails)) {
105+
return StrategyPriority.CANT_HANDLE;
106+
}
107+
// Verify the volumes are still on ONTAP storage
108+
if (allVolumesOnOntapManagedStorage(vmSnapshot.getVmId())) {
109+
return StrategyPriority.HIGHEST;
110+
}
111+
return StrategyPriority.CANT_HANDLE;
112+
}
113+
114+
// For new snapshots, check if Disk-only and all volumes on ONTAP
115+
if (vmSnapshotVO.getType() != VMSnapshot.Type.Disk) {
116+
logger.debug("ONTAP VM snapshot strategy cannot handle memory snapshots for VM [{}]", vmSnapshot.getVmId());
117+
return StrategyPriority.CANT_HANDLE;
118+
}
119+
120+
if (allVolumesOnOntapManagedStorage(vmSnapshot.getVmId())) {
121+
return StrategyPriority.HIGHEST;
122+
}
123+
124+
return StrategyPriority.CANT_HANDLE;
125+
}
126+
127+
@Override
128+
public StrategyPriority canHandle(Long vmId, Long rootPoolId, boolean snapshotMemory) {
129+
if (snapshotMemory) {
130+
logger.debug("ONTAP VM snapshot strategy cannot handle memory snapshots for VM [{}]", vmId);
131+
return StrategyPriority.CANT_HANDLE;
132+
}
133+
134+
if (allVolumesOnOntapManagedStorage(vmId)) {
135+
return StrategyPriority.HIGHEST;
136+
}
137+
138+
return StrategyPriority.CANT_HANDLE;
139+
}
140+
141+
/**
142+
* Checks whether all volumes of a VM reside on ONTAP managed primary storage.
143+
*/
144+
private boolean allVolumesOnOntapManagedStorage(long vmId) {
145+
UserVm userVm = userVmDao.findById(vmId);
146+
if (userVm == null) {
147+
logger.debug("VM with id [{}] not found", vmId);
148+
return false;
149+
}
150+
151+
if (!Hypervisor.HypervisorType.KVM.equals(userVm.getHypervisorType())) {
152+
logger.debug("ONTAP VM snapshot strategy only supports KVM hypervisor, VM [{}] uses [{}]",
153+
vmId, userVm.getHypervisorType());
154+
return false;
155+
}
156+
157+
if (!VirtualMachine.State.Running.equals(userVm.getState())) {
158+
logger.debug("ONTAP VM snapshot strategy requires a running VM, VM [{}] is in state [{}]",
159+
vmId, userVm.getState());
160+
return false;
161+
}
162+
163+
List<VolumeVO> volumes = volumeDao.findByInstance(vmId);
164+
if (volumes == null || volumes.isEmpty()) {
165+
logger.debug("No volumes found for VM [{}]", vmId);
166+
return false;
167+
}
168+
169+
for (VolumeVO volume : volumes) {
170+
if (volume.getPoolId() == null) {
171+
return false;
172+
}
173+
StoragePoolVO pool = storagePool.findById(volume.getPoolId());
174+
if (pool == null) {
175+
return false;
176+
}
177+
if (!pool.isManaged()) {
178+
logger.debug("Volume [{}] is on non-managed storage pool [{}], not ONTAP",
179+
volume.getId(), pool.getName());
180+
return false;
181+
}
182+
if (!Constants.ONTAP_PLUGIN_NAME.equals(pool.getStorageProviderName())) {
183+
logger.debug("Volume [{}] is on managed pool [{}] with provider [{}], not ONTAP",
184+
volume.getId(), pool.getName(), pool.getStorageProviderName());
185+
return false;
186+
}
187+
}
188+
189+
logger.debug("All volumes of VM [{}] are on ONTAP managed storage, this strategy can handle", vmId);
190+
return true;
191+
}
192+
193+
// ──────────────────────────────────────────────────────────────────────────
194+
// Take VM Snapshot
195+
// ──────────────────────────────────────────────────────────────────────────
196+
197+
/**
198+
* Takes a VM-level snapshot by freezing the VM, creating per-volume snapshots
199+
* on ONTAP storage (file clones), and then thawing the VM.
200+
*
201+
* <p>The quiesce option is always {@code true} for ONTAP to ensure filesystem
202+
* consistency across all volumes. The QEMU guest agent must be installed and
203+
* running inside the guest VM.</p>
204+
*/
205+
@Override
206+
public VMSnapshot takeVMSnapshot(VMSnapshot vmSnapshot) {
207+
Long hostId = vmSnapshotHelper.pickRunningHost(vmSnapshot.getVmId());
208+
UserVm userVm = userVmDao.findById(vmSnapshot.getVmId());
209+
VMSnapshotVO vmSnapshotVO = (VMSnapshotVO) vmSnapshot;
210+
211+
CreateVMSnapshotAnswer answer = null;
212+
FreezeThawVMAnswer freezeAnswer = null;
213+
FreezeThawVMCommand thawCmd = null;
214+
FreezeThawVMAnswer thawAnswer = null;
215+
List<SnapshotInfo> forRollback = new ArrayList<>();
216+
long startFreeze = 0;
217+
218+
try {
219+
vmSnapshotHelper.vmSnapshotStateTransitTo(vmSnapshotVO, VMSnapshot.Event.CreateRequested);
220+
} catch (NoTransitionException e) {
221+
throw new CloudRuntimeException(e.getMessage());
222+
}
223+
224+
boolean result = false;
225+
try {
226+
GuestOSVO guestOS = guestOSDao.findById(userVm.getGuestOSId());
227+
List<VolumeObjectTO> volumeTOs = vmSnapshotHelper.getVolumeTOList(userVm.getId());
228+
229+
long prev_chain_size = 0;
230+
long virtual_size = 0;
231+
232+
// Build snapshot parent chain
233+
VMSnapshotTO current = null;
234+
VMSnapshotVO currentSnapshot = vmSnapshotDao.findCurrentSnapshotByVmId(userVm.getId());
235+
if (currentSnapshot != null) {
236+
current = vmSnapshotHelper.getSnapshotWithParents(currentSnapshot);
237+
}
238+
239+
// For ONTAP managed NFS, always quiesce the VM for filesystem consistency
240+
boolean quiescevm = true;
241+
VMSnapshotOptions options = vmSnapshotVO.getOptions();
242+
if (options != null && !options.needQuiesceVM()) {
243+
logger.info("Quiesce option was set to false, but overriding to true for ONTAP managed storage " +
244+
"to ensure filesystem consistency across all volumes");
245+
}
246+
247+
VMSnapshotTO target = new VMSnapshotTO(vmSnapshot.getId(), vmSnapshot.getName(),
248+
vmSnapshot.getType(), null, vmSnapshot.getDescription(), false, current, quiescevm);
249+
250+
if (current == null) {
251+
vmSnapshotVO.setParent(null);
252+
} else {
253+
vmSnapshotVO.setParent(current.getId());
254+
}
255+
256+
CreateVMSnapshotCommand ccmd = new CreateVMSnapshotCommand(
257+
userVm.getInstanceName(), userVm.getUuid(), target, volumeTOs, guestOS.getDisplayName());
258+
259+
logger.info("Creating ONTAP VM Snapshot for VM [{}] with quiesce=true", userVm.getInstanceName());
260+
261+
// Prepare volume info list
262+
List<VolumeInfo> volumeInfos = new ArrayList<>();
263+
for (VolumeObjectTO volumeObjectTO : volumeTOs) {
264+
volumeInfos.add(volumeDataFactory.getVolume(volumeObjectTO.getId()));
265+
virtual_size += volumeObjectTO.getSize();
266+
VolumeVO volumeVO = volumeDao.findById(volumeObjectTO.getId());
267+
prev_chain_size += volumeVO.getVmSnapshotChainSize() == null ? 0 : volumeVO.getVmSnapshotChainSize();
268+
}
269+
270+
// ── Step 1: Freeze the VM ──
271+
FreezeThawVMCommand freezeCommand = new FreezeThawVMCommand(userVm.getInstanceName());
272+
freezeCommand.setOption(FreezeThawVMCommand.FREEZE);
273+
freezeAnswer = (FreezeThawVMAnswer) agentMgr.send(hostId, freezeCommand);
274+
startFreeze = System.nanoTime();
275+
276+
thawCmd = new FreezeThawVMCommand(userVm.getInstanceName());
277+
thawCmd.setOption(FreezeThawVMCommand.THAW);
278+
279+
if (freezeAnswer == null || !freezeAnswer.getResult()) {
280+
String detail = (freezeAnswer != null) ? freezeAnswer.getDetails() : "no response from agent";
281+
throw new CloudRuntimeException("Could not freeze VM [" + userVm.getInstanceName() +
282+
"] for ONTAP snapshot. Ensure qemu-guest-agent is installed and running. Details: " + detail);
283+
}
284+
285+
logger.info("VM [{}] frozen successfully via QEMU guest agent", userVm.getInstanceName());
286+
287+
// ── Step 2: Create per-volume snapshots (ONTAP file clones) ──
288+
try {
289+
for (VolumeInfo vol : volumeInfos) {
290+
long startSnapshot = System.nanoTime();
291+
292+
SnapshotInfo snapInfo = createDiskSnapshot(vmSnapshot, forRollback, vol);
293+
294+
if (snapInfo == null) {
295+
throw new CloudRuntimeException("Could not take ONTAP snapshot for volume id=" + vol.getId());
296+
}
297+
298+
logger.info("ONTAP snapshot for volume [{}] (id={}) completed in {} ms",
299+
vol.getName(), vol.getId(),
300+
TimeUnit.MILLISECONDS.convert(System.nanoTime() - startSnapshot, TimeUnit.NANOSECONDS));
301+
}
302+
} finally {
303+
// ── Step 3: Thaw the VM (always, even on error) ──
304+
try {
305+
thawAnswer = (FreezeThawVMAnswer) agentMgr.send(hostId, thawCmd);
306+
if (thawAnswer != null && thawAnswer.getResult()) {
307+
logger.info("VM [{}] thawed successfully. Total freeze duration: {} ms",
308+
userVm.getInstanceName(),
309+
TimeUnit.MILLISECONDS.convert(System.nanoTime() - startFreeze, TimeUnit.NANOSECONDS));
310+
} else {
311+
logger.warn("Failed to thaw VM [{}]: {}", userVm.getInstanceName(),
312+
(thawAnswer != null) ? thawAnswer.getDetails() : "no response");
313+
}
314+
} catch (Exception thawEx) {
315+
logger.error("Exception while thawing VM [{}]: {}", userVm.getInstanceName(), thawEx.getMessage(), thawEx);
316+
}
317+
}
318+
319+
// ── Step 4: Finalize ──
320+
answer = new CreateVMSnapshotAnswer(ccmd, true, "");
321+
answer.setVolumeTOs(volumeTOs);
322+
323+
processAnswer(vmSnapshotVO, userVm, answer, null);
324+
logger.info("ONTAP VM Snapshot [{}] created successfully for VM [{}]",
325+
vmSnapshot.getName(), userVm.getInstanceName());
326+
327+
long new_chain_size = 0;
328+
for (VolumeObjectTO volumeTo : answer.getVolumeTOs()) {
329+
publishUsageEvent(EventTypes.EVENT_VM_SNAPSHOT_CREATE, vmSnapshot, userVm, volumeTo);
330+
new_chain_size += volumeTo.getSize();
331+
}
332+
publishUsageEvent(EventTypes.EVENT_VM_SNAPSHOT_ON_PRIMARY, vmSnapshot, userVm,
333+
new_chain_size - prev_chain_size, virtual_size);
334+
335+
result = true;
336+
return vmSnapshot;
337+
338+
} catch (OperationTimedoutException e) {
339+
logger.error("ONTAP VM Snapshot [{}] timed out: {}", vmSnapshot.getName(), e.getMessage());
340+
throw new CloudRuntimeException("Creating Instance Snapshot: " + vmSnapshot.getName() + " timed out: " + e.getMessage());
341+
} catch (AgentUnavailableException e) {
342+
logger.error("ONTAP VM Snapshot [{}] failed, agent unavailable: {}", vmSnapshot.getName(), e.getMessage());
343+
throw new CloudRuntimeException("Creating Instance Snapshot: " + vmSnapshot.getName() + " failed: " + e.getMessage());
344+
} catch (CloudRuntimeException e) {
345+
throw e;
346+
} finally {
347+
if (!result) {
348+
// Rollback all disk snapshots created so far
349+
for (SnapshotInfo snapshotInfo : forRollback) {
350+
try {
351+
rollbackDiskSnapshot(snapshotInfo);
352+
} catch (Exception rollbackEx) {
353+
logger.error("Failed to rollback snapshot [{}]: {}", snapshotInfo.getId(), rollbackEx.getMessage());
354+
}
355+
}
356+
357+
// Ensure VM is thawed if we haven't done so
358+
if (thawAnswer == null && freezeAnswer != null && freezeAnswer.getResult()) {
359+
try {
360+
logger.info("Thawing VM [{}] during error cleanup", userVm.getInstanceName());
361+
thawAnswer = (FreezeThawVMAnswer) agentMgr.send(hostId, thawCmd);
362+
} catch (Exception ex) {
363+
logger.error("Could not thaw VM during cleanup: {}", ex.getMessage());
364+
}
365+
}
366+
367+
// Clean up VM snapshot details and transition state
368+
try {
369+
List<VMSnapshotDetailsVO> vmSnapshotDetails = vmSnapshotDetailsDao.listDetails(vmSnapshot.getId());
370+
for (VMSnapshotDetailsVO detail : vmSnapshotDetails) {
371+
if (STORAGE_SNAPSHOT.equals(detail.getName())) {
372+
vmSnapshotDetailsDao.remove(detail.getId());
373+
}
374+
}
375+
vmSnapshotHelper.vmSnapshotStateTransitTo(vmSnapshot, VMSnapshot.Event.OperationFailed);
376+
} catch (NoTransitionException e1) {
377+
logger.error("Cannot set VM Snapshot state to OperationFailed: {}", e1.getMessage());
378+
}
379+
}
380+
}
381+
}
382+
}

0 commit comments

Comments
 (0)