Coverage for watcher/decision_engine/strategy/strategies/host_maintenance.py: 88%

102 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-17 12:22 +0000

1# -*- encoding: utf-8 -*- 

2# Copyright (c) 2017 chinac.com 

3# 

4# Authors: suzhengwei<suzhengwei@chinac.com> 

5# 

6# Licensed under the Apache License, Version 2.0 (the "License"); 

7# you may not use this file except in compliance with the License. 

8# You may obtain a copy of the License at 

9# 

10# http://www.apache.org/licenses/LICENSE-2.0 

11# 

12# Unless required by applicable law or agreed to in writing, software 

13# distributed under the License is distributed on an "AS IS" BASIS, 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 

15# implied. 

16# See the License for the specific language governing permissions and 

17# limitations under the License. 

18# 

19 

20from oslo_log import log 

21from watcher._i18n import _ 

22from watcher.common import exception 

23from watcher.decision_engine.model import element 

24from watcher.decision_engine.strategy.strategies import base 

25 

26LOG = log.getLogger(__name__) 

27 

28 

class HostMaintenance(base.HostMaintenanceBaseStrategy):
    """[PoC]Host Maintenance

    *Description*

    It is a migration strategy for the maintenance of one compute node,
    without interrupting the user's applications.
    If a backup node is given, the strategy first tries to
    migrate all instances from the maintenance node to
    the backup node. If the backup node is not provided (or does not
    have enough free vcpu/memory), it migrates all instances relying on
    nova-scheduler.

    *Requirements*

    * You must have at least 2 physical compute nodes to run this strategy.

    *Limitations*

    - This is a proof of concept that is not meant to be used in production
    - It migrates all instances from one host to other hosts. It's better to
      execute such strategy when load is not heavy, and use this algorithm
      with `ONESHOT` audit.
    - It assumes that cold and live migrations are possible.
    """

    # Action type names passed to ``self.solution.add_action``.
    INSTANCE_MIGRATION = "migrate"
    CHANGE_NOVA_SERVICE_STATE = "change_nova_service_state"

    def __init__(self, config, osc=None):
        """Forward ``config`` and ``osc`` unchanged to the base strategy."""
        super(HostMaintenance, self).__init__(config, osc)

    @classmethod
    def get_name(cls):
        """Return the identifier of this strategy."""
        return "host_maintenance"

    @classmethod
    def get_display_name(cls):
        """Return the translated, human readable name of this strategy."""
        return _("Host Maintenance Strategy")

    @classmethod
    def get_translatable_display_name(cls):
        """Return the untranslated, human readable name of this strategy."""
        return "Host Maintenance Strategy"

    @classmethod
    def get_schema(cls):
        """Return the schema describing the strategy input parameters.

        ``maintenance_node`` is required, ``backup_node`` is optional;
        both are declared as strings.
        """
        return {
            "properties": {
                "maintenance_node": {
                    "description": "The name of the compute node which "
                                   "need maintenance",
                    "type": "string",
                },
                "backup_node": {
                    "description": "The name of the compute node which "
                                   "will backup the maintenance node.",
                    "type": "string",
                },
            },
            "required": ["maintenance_node"],
        }

    def get_instance_state_str(self, instance):
        """Get instance state in string format

        :param instance: object whose ``state`` attribute is either a
                         ``str`` or an ``element.InstanceState`` member
        :return: the string itself, or the ``value`` of the enum member
        :raises: exception.WatcherException if ``state`` has another type
        """
        state = instance.state
        if isinstance(state, str):
            return state
        if isinstance(state, element.InstanceState):
            return state.value

        LOG.error('Unexpected instance state type, '
                  'state=%(state)s, state_type=%(st)s.',
                  {'state': state, 'st': type(state)})
        raise exception.WatcherException

    def get_node_status_str(self, node):
        """Get node status in string format

        :param node: object whose ``status`` attribute is either a
                     ``str`` or an ``element.ServiceState`` member
        :return: the string itself, or the ``value`` of the enum member
        :raises: exception.WatcherException if ``status`` has another type
        """
        status = node.status
        if isinstance(status, str):
            return status
        if isinstance(status, element.ServiceState):
            return status.value

        LOG.error('Unexpected node status type, '
                  'status=%(status)s, status_type=%(st)s.',
                  {'status': status, 'st': type(status)})
        raise exception.WatcherException

    def get_node_capacity(self, node):
        """Collect cpu, ram and disk capacity of a node.

        :param node: node object
        :return: dict(cpu(cores), ram(MB), disk(GB)); the values are read
                 from ``vcpu_capacity``, ``memory_mb_capacity`` and
                 ``disk_gb_capacity`` respectively
        """
        return {
            'cpu': node.vcpu_capacity,
            'ram': node.memory_mb_capacity,
            'disk': node.disk_gb_capacity,
        }

    def host_fits(self, source_node, destination_node):
        """check host fits

        Compare the resources used on ``source_node`` with the resources
        free on ``destination_node``. Only the ``vcpu`` and ``memory``
        entries are compared.

        :param source_node: node object whose instances would move
        :param destination_node: node object that would receive them
        :return: True if, for both metrics, the amount used on the source
                 does not exceed the amount free on the destination
        """
        used = self.compute_model.get_node_used_resources(source_node)
        free = self.compute_model.get_node_free_resources(destination_node)
        return all(used[metric] <= free[metric]
                   for metric in ('vcpu', 'memory'))

    def add_action_enable_compute_node(self, node):
        """Add an action for node enabler into the solution.

        :param node: node object; its ``uuid`` is the action resource id
                     and its ``hostname`` the ``resource_name`` parameter
        """
        params = {'state': element.ServiceState.ENABLED.value,
                  'resource_name': node.hostname}
        self.solution.add_action(
            action_type=self.CHANGE_NOVA_SERVICE_STATE,
            resource_id=node.uuid,
            input_parameters=params)

    def add_action_maintain_compute_node(self, node):
        """Add an action for node maintenance into the solution.

        The action disables the service and records
        ``self.REASON_FOR_MAINTAINING`` as the reason.
        NOTE(review): ``REASON_FOR_MAINTAINING`` is not defined in this
        class; presumably it is inherited from the base strategy.

        :param node: node object; its ``uuid`` is the action resource id
                     and its ``hostname`` the ``resource_name`` parameter
        """
        params = {'state': element.ServiceState.DISABLED.value,
                  'disabled_reason': self.REASON_FOR_MAINTAINING,
                  'resource_name': node.hostname}
        self.solution.add_action(
            action_type=self.CHANGE_NOVA_SERVICE_STATE,
            resource_id=node.uuid,
            input_parameters=params)

    def enable_compute_node_if_disabled(self, node):
        """Add an enable action for ``node`` unless it is already enabled.

        :param node: node object
        :raises: exception.WatcherException if the node status has an
                 unexpected type (see ``get_node_status_str``)
        """
        if self.get_node_status_str(node) != (
                element.ServiceState.ENABLED.value):
            self.add_action_enable_compute_node(node)

    def instance_migration(self, instance, src_node, des_node=None):
        """Add an action for instance migration into the solution.

        Instances in the ACTIVE state are live migrated, all others are
        cold migrated.

        :param instance: instance object
        :param src_node: node object
        :param des_node: node object. if None, the instance will be
                         migrated relying on nova-scheduler
        :return: None
        :raises: exception.WatcherException if the instance state has an
                 unexpected type (see ``get_instance_state_str``)
        """
        is_active = (self.get_instance_state_str(instance) ==
                     element.InstanceState.ACTIVE.value)
        params = {'migration_type': 'live' if is_active else 'cold',
                  'source_node': src_node.uuid,
                  'resource_name': instance.name}
        # Without a destination the parameter is left out entirely.
        if des_node:
            params['destination_node'] = des_node.hostname
        self.solution.add_action(action_type=self.INSTANCE_MIGRATION,
                                 resource_id=instance.uuid,
                                 input_parameters=params)

    def host_migration(self, source_node, destination_node=None):
        """host migration

        Migrate all instances from source_node to destination_node.
        Active instances use "live-migrate",
        and other instances use "cold-migrate"

        :param source_node: node object to evacuate
        :param destination_node: node object. if None, no destination is
                                 set on the migration actions
        """
        for instance in self.compute_model.get_node_instances(source_node):
            self.instance_migration(instance, source_node, destination_node)

    def safe_maintain(self, maintenance_node, backup_node=None):
        """safe maintain one compute node

        Migrate all instances of the maintenance_node to the backup host.

        It compares the resources used on the maintaining node with the
        resources free on the backup node (see ``host_fits``). If the
        backup node fits, it enables the backup node when needed, adds the
        action that puts the maintaining node into maintenance, and adds
        the migrations to the solution.

        :param maintenance_node: node object to put into maintenance
        :param backup_node: node object, or None when no backup is given
        :return: True if the actions were added to the solution, False if
                 no backup node was given or it does not fit (in which
                 case the solution is left untouched)
        """
        if not backup_node or not self.host_fits(maintenance_node,
                                                 backup_node):
            return False

        self.enable_compute_node_if_disabled(backup_node)
        self.add_action_maintain_compute_node(maintenance_node)
        self.host_migration(maintenance_node, backup_node)
        return True

    def try_maintain(self, maintenance_node):
        """try to maintain one compute node

        It firstly adds the action that puts the maintenance_node into
        maintenance. Then it adds a migration for every instance of the
        maintenance node without a destination, relying on nova-scheduler.

        :param maintenance_node: node object to put into maintenance
        """
        self.add_action_maintain_compute_node(maintenance_node)
        self.host_migration(maintenance_node)

    def pre_execute(self):
        """Pre-execution phase; delegates to ``self._pre_execute()``."""
        self._pre_execute()

    def do_execute(self, audit=None):
        """Build the solution for the requested maintenance.

        Reads ``maintenance_node`` and the optional ``backup_node`` from
        ``self.input_parameters`` and resolves them through
        ``self.compute_model.get_node_by_name``.

        :param audit: not used by this method
        """
        LOG.info(_('Executing Host Maintenance Migration Strategy'))

        maintenance_node = self.input_parameters.get('maintenance_node')
        backup_node = self.input_parameters.get('backup_node')

        # if no VMs in the maintenance_node, just maintain the compute node
        src_node = self.compute_model.get_node_by_name(maintenance_node)
        if not self.compute_model.get_node_instances(src_node):
            # Skip the action when the node already carries our reason.
            if src_node.disabled_reason != self.REASON_FOR_MAINTAINING:
                self.add_action_maintain_compute_node(src_node)
            return

        des_node = (self.compute_model.get_node_by_name(backup_node)
                    if backup_node else None)

        if self.safe_maintain(src_node, des_node):
            return

        if des_node:
            # The requested backup node is being ignored; say so instead
            # of silently falling back to the scheduler.
            LOG.info('Backup node %(backup)s does not have enough free '
                     'resources for the instances of %(node)s, '
                     'relying on nova-scheduler instead.',
                     {'backup': backup_node, 'node': maintenance_node})
        self.try_maintain(src_node)

    def post_execute(self):
        """Post-execution phase

        This can be used to compute the global efficacy. Currently it
        only logs the solution actions and the compute model at debug
        level.
        """
        LOG.debug(self.solution.actions)
        LOG.debug(self.compute_model.to_string())