|
| 1 | +/* |
| 2 | + * Licensed to the Apache Software Foundation (ASF) under one |
| 3 | + * or more contributor license agreements. See the NOTICE file |
| 4 | + * distributed with this work for additional information |
| 5 | + * regarding copyright ownership. The ASF licenses this file |
| 6 | + * to you under the Apache License, Version 2.0 (the |
| 7 | + * "License"); you may not use this file except in compliance |
| 8 | + * with the License. You may obtain a copy of the License at |
| 9 | + * |
| 10 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | + * |
| 12 | + * Unless required by applicable law or agreed to in writing, |
| 13 | + * software distributed under the License is distributed on an |
| 14 | + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 15 | + * KIND, either express or implied. See the License for the |
| 16 | + * specific language governing permissions and limitations |
| 17 | + * under the License. |
| 18 | + */ |
| 19 | +package org.apache.cloudstack.storage.vmsnapshot; |
| 20 | + |
| 21 | +import java.util.ArrayList; |
| 22 | +import java.util.List; |
| 23 | +import java.util.Map; |
| 24 | +import java.util.concurrent.TimeUnit; |
| 25 | + |
| 26 | +import javax.naming.ConfigurationException; |
| 27 | + |
| 28 | +import org.apache.cloudstack.engine.subsystem.api.storage.SnapshotInfo; |
| 29 | +import org.apache.cloudstack.engine.subsystem.api.storage.StrategyPriority; |
| 30 | +import org.apache.cloudstack.engine.subsystem.api.storage.VMSnapshotOptions; |
| 31 | +import org.apache.cloudstack.engine.subsystem.api.storage.VolumeInfo; |
| 32 | +import org.apache.cloudstack.storage.datastore.db.StoragePoolVO; |
| 33 | +import org.apache.cloudstack.storage.to.VolumeObjectTO; |
| 34 | +import org.apache.cloudstack.storage.utils.Constants; |
| 35 | +import org.apache.commons.collections.CollectionUtils; |
| 36 | +import org.apache.logging.log4j.LogManager; |
| 37 | +import org.apache.logging.log4j.Logger; |
| 38 | + |
| 39 | +import com.cloud.agent.api.CreateVMSnapshotAnswer; |
| 40 | +import com.cloud.agent.api.CreateVMSnapshotCommand; |
| 41 | +import com.cloud.agent.api.FreezeThawVMAnswer; |
| 42 | +import com.cloud.agent.api.FreezeThawVMCommand; |
| 43 | +import com.cloud.agent.api.VMSnapshotTO; |
| 44 | +import com.cloud.event.EventTypes; |
| 45 | +import com.cloud.exception.AgentUnavailableException; |
| 46 | +import com.cloud.exception.OperationTimedoutException; |
| 47 | +import com.cloud.hypervisor.Hypervisor; |
| 48 | +import com.cloud.storage.GuestOSVO; |
| 49 | +import com.cloud.storage.VolumeVO; |
| 50 | +import com.cloud.uservm.UserVm; |
| 51 | +import com.cloud.utils.exception.CloudRuntimeException; |
| 52 | +import com.cloud.utils.fsm.NoTransitionException; |
| 53 | +import com.cloud.vm.VirtualMachine; |
| 54 | +import com.cloud.vm.snapshot.VMSnapshot; |
| 55 | +import com.cloud.vm.snapshot.VMSnapshotDetailsVO; |
| 56 | +import com.cloud.vm.snapshot.VMSnapshotVO; |
| 57 | + |
| 58 | +/** |
| 59 | + * VM Snapshot strategy for NetApp ONTAP managed storage. |
| 60 | + * |
| 61 | + * <p>This strategy handles VM-level (instance) snapshots for VMs whose volumes |
| 62 | + * reside on ONTAP managed primary storage using the NFS protocol. It uses the |
| 63 | + * QEMU guest agent to freeze/thaw the VM file systems for consistency, and |
| 64 | + * delegates per-volume snapshot creation to the existing CloudStack snapshot |
| 65 | + * framework which routes to {@code StorageSystemSnapshotStrategy} → |
| 66 | + * {@code OntapPrimaryDatastoreDriver.takeSnapshot()} (ONTAP file clone).</p> |
| 67 | + * |
| 68 | + * <h3>Flow:</h3> |
| 69 | + * <ol> |
| 70 | + * <li>Freeze the VM via QEMU guest agent ({@code fsfreeze})</li> |
| 71 | + * <li>For each attached volume, create a storage-level snapshot (ONTAP file clone)</li> |
| 72 | + * <li>Thaw the VM</li> |
| 73 | + * <li>Record VM snapshot ↔ volume snapshot mappings in {@code vm_snapshot_details}</li> |
| 74 | + * </ol> |
| 75 | + * |
| 76 | + * <h3>Strategy Selection:</h3> |
| 77 | + * <p>Returns {@code StrategyPriority.HIGHEST} when:</p> |
| 78 | + * <ul> |
| 79 | + * <li>Hypervisor is KVM</li> |
| 80 | + * <li>Snapshot type is Disk-only (no memory)</li> |
| 81 | + * <li>All VM volumes are on ONTAP managed NFS primary storage</li> |
| 82 | + * </ul> |
| 83 | + */ |
| 84 | +public class OntapVMSnapshotStrategy extends StorageVMSnapshotStrategy { |
| 85 | + |
| 86 | + private static final Logger logger = LogManager.getLogger(OntapVMSnapshotStrategy.class); |
| 87 | + |
| 88 | + @Override |
| 89 | + public boolean configure(String name, Map<String, Object> params) throws ConfigurationException { |
| 90 | + return super.configure(name, params); |
| 91 | + } |
| 92 | + |
| 93 | + // ────────────────────────────────────────────────────────────────────────── |
| 94 | + // Strategy Selection |
| 95 | + // ────────────────────────────────────────────────────────────────────────── |
| 96 | + |
| 97 | + @Override |
| 98 | + public StrategyPriority canHandle(VMSnapshot vmSnapshot) { |
| 99 | + VMSnapshotVO vmSnapshotVO = (VMSnapshotVO) vmSnapshot; |
| 100 | + |
| 101 | + // For existing (non-Allocated) snapshots, check if we created them |
| 102 | + if (!VMSnapshot.State.Allocated.equals(vmSnapshotVO.getState())) { |
| 103 | + List<VMSnapshotDetailsVO> vmSnapshotDetails = vmSnapshotDetailsDao.findDetails(vmSnapshot.getId(), STORAGE_SNAPSHOT); |
| 104 | + if (CollectionUtils.isEmpty(vmSnapshotDetails)) { |
| 105 | + return StrategyPriority.CANT_HANDLE; |
| 106 | + } |
| 107 | + // Verify the volumes are still on ONTAP storage |
| 108 | + if (allVolumesOnOntapManagedStorage(vmSnapshot.getVmId())) { |
| 109 | + return StrategyPriority.HIGHEST; |
| 110 | + } |
| 111 | + return StrategyPriority.CANT_HANDLE; |
| 112 | + } |
| 113 | + |
| 114 | + // For new snapshots, check if Disk-only and all volumes on ONTAP |
| 115 | + if (vmSnapshotVO.getType() != VMSnapshot.Type.Disk) { |
| 116 | + logger.debug("ONTAP VM snapshot strategy cannot handle memory snapshots for VM [{}]", vmSnapshot.getVmId()); |
| 117 | + return StrategyPriority.CANT_HANDLE; |
| 118 | + } |
| 119 | + |
| 120 | + if (allVolumesOnOntapManagedStorage(vmSnapshot.getVmId())) { |
| 121 | + return StrategyPriority.HIGHEST; |
| 122 | + } |
| 123 | + |
| 124 | + return StrategyPriority.CANT_HANDLE; |
| 125 | + } |
| 126 | + |
| 127 | + @Override |
| 128 | + public StrategyPriority canHandle(Long vmId, Long rootPoolId, boolean snapshotMemory) { |
| 129 | + if (snapshotMemory) { |
| 130 | + logger.debug("ONTAP VM snapshot strategy cannot handle memory snapshots for VM [{}]", vmId); |
| 131 | + return StrategyPriority.CANT_HANDLE; |
| 132 | + } |
| 133 | + |
| 134 | + if (allVolumesOnOntapManagedStorage(vmId)) { |
| 135 | + return StrategyPriority.HIGHEST; |
| 136 | + } |
| 137 | + |
| 138 | + return StrategyPriority.CANT_HANDLE; |
| 139 | + } |
| 140 | + |
| 141 | + /** |
| 142 | + * Checks whether all volumes of a VM reside on ONTAP managed primary storage. |
| 143 | + */ |
| 144 | + private boolean allVolumesOnOntapManagedStorage(long vmId) { |
| 145 | + UserVm userVm = userVmDao.findById(vmId); |
| 146 | + if (userVm == null) { |
| 147 | + logger.debug("VM with id [{}] not found", vmId); |
| 148 | + return false; |
| 149 | + } |
| 150 | + |
| 151 | + if (!Hypervisor.HypervisorType.KVM.equals(userVm.getHypervisorType())) { |
| 152 | + logger.debug("ONTAP VM snapshot strategy only supports KVM hypervisor, VM [{}] uses [{}]", |
| 153 | + vmId, userVm.getHypervisorType()); |
| 154 | + return false; |
| 155 | + } |
| 156 | + |
| 157 | + if (!VirtualMachine.State.Running.equals(userVm.getState())) { |
| 158 | + logger.debug("ONTAP VM snapshot strategy requires a running VM, VM [{}] is in state [{}]", |
| 159 | + vmId, userVm.getState()); |
| 160 | + return false; |
| 161 | + } |
| 162 | + |
| 163 | + List<VolumeVO> volumes = volumeDao.findByInstance(vmId); |
| 164 | + if (volumes == null || volumes.isEmpty()) { |
| 165 | + logger.debug("No volumes found for VM [{}]", vmId); |
| 166 | + return false; |
| 167 | + } |
| 168 | + |
| 169 | + for (VolumeVO volume : volumes) { |
| 170 | + if (volume.getPoolId() == null) { |
| 171 | + return false; |
| 172 | + } |
| 173 | + StoragePoolVO pool = storagePool.findById(volume.getPoolId()); |
| 174 | + if (pool == null) { |
| 175 | + return false; |
| 176 | + } |
| 177 | + if (!pool.isManaged()) { |
| 178 | + logger.debug("Volume [{}] is on non-managed storage pool [{}], not ONTAP", |
| 179 | + volume.getId(), pool.getName()); |
| 180 | + return false; |
| 181 | + } |
| 182 | + if (!Constants.ONTAP_PLUGIN_NAME.equals(pool.getStorageProviderName())) { |
| 183 | + logger.debug("Volume [{}] is on managed pool [{}] with provider [{}], not ONTAP", |
| 184 | + volume.getId(), pool.getName(), pool.getStorageProviderName()); |
| 185 | + return false; |
| 186 | + } |
| 187 | + } |
| 188 | + |
| 189 | + logger.debug("All volumes of VM [{}] are on ONTAP managed storage, this strategy can handle", vmId); |
| 190 | + return true; |
| 191 | + } |
| 192 | + |
| 193 | + // ────────────────────────────────────────────────────────────────────────── |
| 194 | + // Take VM Snapshot |
| 195 | + // ────────────────────────────────────────────────────────────────────────── |
| 196 | + |
| 197 | + /** |
| 198 | + * Takes a VM-level snapshot by freezing the VM, creating per-volume snapshots |
| 199 | + * on ONTAP storage (file clones), and then thawing the VM. |
| 200 | + * |
| 201 | + * <p>The quiesce option is always {@code true} for ONTAP to ensure filesystem |
| 202 | + * consistency across all volumes. The QEMU guest agent must be installed and |
| 203 | + * running inside the guest VM.</p> |
| 204 | + */ |
| 205 | + @Override |
| 206 | + public VMSnapshot takeVMSnapshot(VMSnapshot vmSnapshot) { |
| 207 | + Long hostId = vmSnapshotHelper.pickRunningHost(vmSnapshot.getVmId()); |
| 208 | + UserVm userVm = userVmDao.findById(vmSnapshot.getVmId()); |
| 209 | + VMSnapshotVO vmSnapshotVO = (VMSnapshotVO) vmSnapshot; |
| 210 | + |
| 211 | + CreateVMSnapshotAnswer answer = null; |
| 212 | + FreezeThawVMAnswer freezeAnswer = null; |
| 213 | + FreezeThawVMCommand thawCmd = null; |
| 214 | + FreezeThawVMAnswer thawAnswer = null; |
| 215 | + List<SnapshotInfo> forRollback = new ArrayList<>(); |
| 216 | + long startFreeze = 0; |
| 217 | + |
| 218 | + try { |
| 219 | + vmSnapshotHelper.vmSnapshotStateTransitTo(vmSnapshotVO, VMSnapshot.Event.CreateRequested); |
| 220 | + } catch (NoTransitionException e) { |
| 221 | + throw new CloudRuntimeException(e.getMessage()); |
| 222 | + } |
| 223 | + |
| 224 | + boolean result = false; |
| 225 | + try { |
| 226 | + GuestOSVO guestOS = guestOSDao.findById(userVm.getGuestOSId()); |
| 227 | + List<VolumeObjectTO> volumeTOs = vmSnapshotHelper.getVolumeTOList(userVm.getId()); |
| 228 | + |
| 229 | + long prev_chain_size = 0; |
| 230 | + long virtual_size = 0; |
| 231 | + |
| 232 | + // Build snapshot parent chain |
| 233 | + VMSnapshotTO current = null; |
| 234 | + VMSnapshotVO currentSnapshot = vmSnapshotDao.findCurrentSnapshotByVmId(userVm.getId()); |
| 235 | + if (currentSnapshot != null) { |
| 236 | + current = vmSnapshotHelper.getSnapshotWithParents(currentSnapshot); |
| 237 | + } |
| 238 | + |
| 239 | + // For ONTAP managed NFS, always quiesce the VM for filesystem consistency |
| 240 | + boolean quiescevm = true; |
| 241 | + VMSnapshotOptions options = vmSnapshotVO.getOptions(); |
| 242 | + if (options != null && !options.needQuiesceVM()) { |
| 243 | + logger.info("Quiesce option was set to false, but overriding to true for ONTAP managed storage " + |
| 244 | + "to ensure filesystem consistency across all volumes"); |
| 245 | + } |
| 246 | + |
| 247 | + VMSnapshotTO target = new VMSnapshotTO(vmSnapshot.getId(), vmSnapshot.getName(), |
| 248 | + vmSnapshot.getType(), null, vmSnapshot.getDescription(), false, current, quiescevm); |
| 249 | + |
| 250 | + if (current == null) { |
| 251 | + vmSnapshotVO.setParent(null); |
| 252 | + } else { |
| 253 | + vmSnapshotVO.setParent(current.getId()); |
| 254 | + } |
| 255 | + |
| 256 | + CreateVMSnapshotCommand ccmd = new CreateVMSnapshotCommand( |
| 257 | + userVm.getInstanceName(), userVm.getUuid(), target, volumeTOs, guestOS.getDisplayName()); |
| 258 | + |
| 259 | + logger.info("Creating ONTAP VM Snapshot for VM [{}] with quiesce=true", userVm.getInstanceName()); |
| 260 | + |
| 261 | + // Prepare volume info list |
| 262 | + List<VolumeInfo> volumeInfos = new ArrayList<>(); |
| 263 | + for (VolumeObjectTO volumeObjectTO : volumeTOs) { |
| 264 | + volumeInfos.add(volumeDataFactory.getVolume(volumeObjectTO.getId())); |
| 265 | + virtual_size += volumeObjectTO.getSize(); |
| 266 | + VolumeVO volumeVO = volumeDao.findById(volumeObjectTO.getId()); |
| 267 | + prev_chain_size += volumeVO.getVmSnapshotChainSize() == null ? 0 : volumeVO.getVmSnapshotChainSize(); |
| 268 | + } |
| 269 | + |
| 270 | + // ── Step 1: Freeze the VM ── |
| 271 | + FreezeThawVMCommand freezeCommand = new FreezeThawVMCommand(userVm.getInstanceName()); |
| 272 | + freezeCommand.setOption(FreezeThawVMCommand.FREEZE); |
| 273 | + freezeAnswer = (FreezeThawVMAnswer) agentMgr.send(hostId, freezeCommand); |
| 274 | + startFreeze = System.nanoTime(); |
| 275 | + |
| 276 | + thawCmd = new FreezeThawVMCommand(userVm.getInstanceName()); |
| 277 | + thawCmd.setOption(FreezeThawVMCommand.THAW); |
| 278 | + |
| 279 | + if (freezeAnswer == null || !freezeAnswer.getResult()) { |
| 280 | + String detail = (freezeAnswer != null) ? freezeAnswer.getDetails() : "no response from agent"; |
| 281 | + throw new CloudRuntimeException("Could not freeze VM [" + userVm.getInstanceName() + |
| 282 | + "] for ONTAP snapshot. Ensure qemu-guest-agent is installed and running. Details: " + detail); |
| 283 | + } |
| 284 | + |
| 285 | + logger.info("VM [{}] frozen successfully via QEMU guest agent", userVm.getInstanceName()); |
| 286 | + |
| 287 | + // ── Step 2: Create per-volume snapshots (ONTAP file clones) ── |
| 288 | + try { |
| 289 | + for (VolumeInfo vol : volumeInfos) { |
| 290 | + long startSnapshot = System.nanoTime(); |
| 291 | + |
| 292 | + SnapshotInfo snapInfo = createDiskSnapshot(vmSnapshot, forRollback, vol); |
| 293 | + |
| 294 | + if (snapInfo == null) { |
| 295 | + throw new CloudRuntimeException("Could not take ONTAP snapshot for volume id=" + vol.getId()); |
| 296 | + } |
| 297 | + |
| 298 | + logger.info("ONTAP snapshot for volume [{}] (id={}) completed in {} ms", |
| 299 | + vol.getName(), vol.getId(), |
| 300 | + TimeUnit.MILLISECONDS.convert(System.nanoTime() - startSnapshot, TimeUnit.NANOSECONDS)); |
| 301 | + } |
| 302 | + } finally { |
| 303 | + // ── Step 3: Thaw the VM (always, even on error) ── |
| 304 | + try { |
| 305 | + thawAnswer = (FreezeThawVMAnswer) agentMgr.send(hostId, thawCmd); |
| 306 | + if (thawAnswer != null && thawAnswer.getResult()) { |
| 307 | + logger.info("VM [{}] thawed successfully. Total freeze duration: {} ms", |
| 308 | + userVm.getInstanceName(), |
| 309 | + TimeUnit.MILLISECONDS.convert(System.nanoTime() - startFreeze, TimeUnit.NANOSECONDS)); |
| 310 | + } else { |
| 311 | + logger.warn("Failed to thaw VM [{}]: {}", userVm.getInstanceName(), |
| 312 | + (thawAnswer != null) ? thawAnswer.getDetails() : "no response"); |
| 313 | + } |
| 314 | + } catch (Exception thawEx) { |
| 315 | + logger.error("Exception while thawing VM [{}]: {}", userVm.getInstanceName(), thawEx.getMessage(), thawEx); |
| 316 | + } |
| 317 | + } |
| 318 | + |
| 319 | + // ── Step 4: Finalize ── |
| 320 | + answer = new CreateVMSnapshotAnswer(ccmd, true, ""); |
| 321 | + answer.setVolumeTOs(volumeTOs); |
| 322 | + |
| 323 | + processAnswer(vmSnapshotVO, userVm, answer, null); |
| 324 | + logger.info("ONTAP VM Snapshot [{}] created successfully for VM [{}]", |
| 325 | + vmSnapshot.getName(), userVm.getInstanceName()); |
| 326 | + |
| 327 | + long new_chain_size = 0; |
| 328 | + for (VolumeObjectTO volumeTo : answer.getVolumeTOs()) { |
| 329 | + publishUsageEvent(EventTypes.EVENT_VM_SNAPSHOT_CREATE, vmSnapshot, userVm, volumeTo); |
| 330 | + new_chain_size += volumeTo.getSize(); |
| 331 | + } |
| 332 | + publishUsageEvent(EventTypes.EVENT_VM_SNAPSHOT_ON_PRIMARY, vmSnapshot, userVm, |
| 333 | + new_chain_size - prev_chain_size, virtual_size); |
| 334 | + |
| 335 | + result = true; |
| 336 | + return vmSnapshot; |
| 337 | + |
| 338 | + } catch (OperationTimedoutException e) { |
| 339 | + logger.error("ONTAP VM Snapshot [{}] timed out: {}", vmSnapshot.getName(), e.getMessage()); |
| 340 | + throw new CloudRuntimeException("Creating Instance Snapshot: " + vmSnapshot.getName() + " timed out: " + e.getMessage()); |
| 341 | + } catch (AgentUnavailableException e) { |
| 342 | + logger.error("ONTAP VM Snapshot [{}] failed, agent unavailable: {}", vmSnapshot.getName(), e.getMessage()); |
| 343 | + throw new CloudRuntimeException("Creating Instance Snapshot: " + vmSnapshot.getName() + " failed: " + e.getMessage()); |
| 344 | + } catch (CloudRuntimeException e) { |
| 345 | + throw e; |
| 346 | + } finally { |
| 347 | + if (!result) { |
| 348 | + // Rollback all disk snapshots created so far |
| 349 | + for (SnapshotInfo snapshotInfo : forRollback) { |
| 350 | + try { |
| 351 | + rollbackDiskSnapshot(snapshotInfo); |
| 352 | + } catch (Exception rollbackEx) { |
| 353 | + logger.error("Failed to rollback snapshot [{}]: {}", snapshotInfo.getId(), rollbackEx.getMessage()); |
| 354 | + } |
| 355 | + } |
| 356 | + |
| 357 | + // Ensure VM is thawed if we haven't done so |
| 358 | + if (thawAnswer == null && freezeAnswer != null && freezeAnswer.getResult()) { |
| 359 | + try { |
| 360 | + logger.info("Thawing VM [{}] during error cleanup", userVm.getInstanceName()); |
| 361 | + thawAnswer = (FreezeThawVMAnswer) agentMgr.send(hostId, thawCmd); |
| 362 | + } catch (Exception ex) { |
| 363 | + logger.error("Could not thaw VM during cleanup: {}", ex.getMessage()); |
| 364 | + } |
| 365 | + } |
| 366 | + |
| 367 | + // Clean up VM snapshot details and transition state |
| 368 | + try { |
| 369 | + List<VMSnapshotDetailsVO> vmSnapshotDetails = vmSnapshotDetailsDao.listDetails(vmSnapshot.getId()); |
| 370 | + for (VMSnapshotDetailsVO detail : vmSnapshotDetails) { |
| 371 | + if (STORAGE_SNAPSHOT.equals(detail.getName())) { |
| 372 | + vmSnapshotDetailsDao.remove(detail.getId()); |
| 373 | + } |
| 374 | + } |
| 375 | + vmSnapshotHelper.vmSnapshotStateTransitTo(vmSnapshot, VMSnapshot.Event.OperationFailed); |
| 376 | + } catch (NoTransitionException e1) { |
| 377 | + logger.error("Cannot set VM Snapshot state to OperationFailed: {}", e1.getMessage()); |
| 378 | + } |
| 379 | + } |
| 380 | + } |
| 381 | + } |
| 382 | +} |
0 commit comments