Coverage for watcher/decision_engine/strategy/strategies/outlet_temp_control.py: 80%

106 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-17 12:22 +0000

1# -*- encoding: utf-8 -*- 

2# Copyright (c) 2015 Intel Corp 

3# 

4# Authors: Junjie-Huang <junjie.huang@intel.com> 

5# 

6# Licensed under the Apache License, Version 2.0 (the "License"); 

7# you may not use this file except in compliance with the License. 

8# You may obtain a copy of the License at 

9# 

10# http://www.apache.org/licenses/LICENSE-2.0 

11# 

12# Unless required by applicable law or agreed to in writing, software 

13# distributed under the License is distributed on an "AS IS" BASIS, 

14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 

15# implied. 

16# See the License for the specific language governing permissions and 

17# limitations under the License. 

18# 

19 

20""" 

21*Good Thermal Strategy* 

22 

23Towards software-defined infrastructure, power and thermal 

24intelligence is being adopted to optimize workloads, which can help 

25improve efficiency, reduce power, as well as improve datacenter PUE 

26and lower operation costs in the data center. 

27Outlet (Exhaust Air) Temperature is one of the important thermal 

28telemetries to measure thermal/workload status of server. 

29 

30This strategy makes decisions to migrate workloads to the hosts with good 

31thermal condition (lowest outlet temperature) when the outlet temperature 

32of source hosts reaches a configurable threshold. 

33""" 

34 

35from oslo_log import log 

36 

37from watcher._i18n import _ 

38from watcher.common import exception 

39from watcher.decision_engine.model import element 

40from watcher.decision_engine.strategy.strategies import base 

41 

42 

43LOG = log.getLogger(__name__) 

44 

45 

class OutletTempControl(base.ThermalOptimizationBaseStrategy):
    """[PoC] Outlet temperature control using live migration

    *Description*

    It is a migration strategy based on the outlet temperature of compute
    hosts. It generates solutions to move a workload whenever a server's
    outlet temperature is higher than the specified threshold.

    *Requirements*

    * Hardware: All computer hosts should support IPMI and PTAS technology
    * Software: Ceilometer component ceilometer-agent-ipmi running
      in each compute host, and Ceilometer API can report such telemetry
      ``hardware.ipmi.node.outlet_temperature`` successfully.
    * You must have at least 2 physical compute hosts to run this strategy.

    *Limitations*

    - This is a proof of concept that is not meant to be used in production
    - We cannot forecast how many servers should be migrated. This is the
      reason why we only plan a single virtual machine migration at a time.
      So it's better to use this algorithm with `CONTINUOUS` audits.
    - It assumes that live migrations are possible

    *Spec URL*

    https://github.com/openstack/watcher-specs/blob/master/specs/mitaka/implemented/outlet-temperature-based-strategy.rst
    """

    # Action type of the actions this strategy adds to the solution
    MIGRATION = "migrate"

    # Datasource metric queried for every compute node
    DATASOURCE_METRICS = ['host_outlet_temp']

    def __init__(self, config, osc=None):
        """Outlet temperature control using live migration

        :param config: A mapping containing the configuration of this strategy
        :type config: dict
        :param osc: an OpenStackClients object, defaults to None
        :type osc: :py:class:`~.OpenStackClients` instance, optional
        """
        super(OutletTempControl, self).__init__(config, osc)

    @classmethod
    def get_name(cls):
        """Return the identifier of this strategy."""
        return "outlet_temperature"

    @classmethod
    def get_display_name(cls):
        """Return the translated, human readable name of this strategy."""
        return _("Outlet temperature based strategy")

    @classmethod
    def get_translatable_display_name(cls):
        """Return the untranslated, human readable name of this strategy."""
        return "Outlet temperature based strategy"

    @property
    def period(self):
        """Value of the ``period`` input parameter (30 when not provided)."""
        return self.input_parameters.get('period', 30)

    @property
    def granularity(self):
        """Value of the ``granularity`` input parameter (300 if missing)."""
        return self.input_parameters.get('granularity', 300)

    @classmethod
    def get_schema(cls):
        """Return the schema describing the input parameters.

        :returns: a dict declaring the ``threshold``, ``period`` and
                  ``granularity`` parameters with their defaults
        """
        # Mandatory default setting for each element
        return {
            "properties": {
                "threshold": {
                    "description": "temperature threshold for migration",
                    "type": "number",
                    "default": 35.0
                },
                "period": {
                    "description": "The time interval in seconds for "
                                   "getting statistic aggregation",
                    "type": "number",
                    "default": 30
                },
                "granularity": {
                    "description": "The time between two measures in an "
                                   "aggregated timeseries of a metric.",
                    "type": "number",
                    "default": 300
                },
            },
        }

    def get_available_compute_nodes(self):
        """Return the compute nodes that are online and enabled.

        :returns: the entries of ``compute_model.get_all_compute_nodes()``
                  whose ``state`` is ONLINE and whose ``status`` is ENABLED
        :rtype: dict
        """
        default_node_scope = [element.ServiceState.ENABLED.value]
        return {uuid: cn for uuid, cn in
                self.compute_model.get_all_compute_nodes().items()
                if cn.state == element.ServiceState.ONLINE.value and
                cn.status in default_node_scope}

    def group_hosts_by_outlet_temp(self):
        """Group hosts based on outlet temp meters

        Reads ``self.threshold``, which is set by :meth:`pre_execute`.

        :returns: a ``(hosts_need_release, hosts_target)`` tuple of lists.
                  Every item is a dict with the keys ``compute_node`` and
                  ``outlet_temp``. Nodes whose temperature is greater than
                  or equal to the threshold go into the first list, the
                  others into the second one.
        """
        nodes = self.get_available_compute_nodes()
        hosts_need_release = []
        hosts_target = []
        metric_name = 'host_outlet_temp'
        for node in nodes.values():
            outlet_temp = self.datasource_backend.statistic_aggregation(
                resource=node,
                resource_type='compute_node',
                meter_name=metric_name,
                period=self.period,
                granularity=self.granularity,
            )

            # some hosts may not have outlet temp meters, remove from target
            if outlet_temp is None:
                LOG.warning("%s: no outlet temp data", node.uuid)
                continue

            LOG.debug("%(resource)s: outlet temperature %(temp)f",
                      {'resource': node.uuid, 'temp': outlet_temp})
            instance_data = {'compute_node': node, 'outlet_temp': outlet_temp}
            if outlet_temp >= self.threshold:
                # mark the node to release resources
                hosts_need_release.append(instance_data)
            else:
                hosts_target.append(instance_data)
        return hosts_need_release, hosts_target

    def choose_instance_to_migrate(self, hosts):
        """Pick up an active instance to migrate from provided hosts

        Hosts are examined in the order given; the first instance that is
        neither excluded (``watcher_exclude``) nor in a non-active state is
        selected.

        :param hosts: list of dicts holding a ``compute_node`` key
        :returns: a ``(source_node, instance)`` tuple, or None when no
                  suitable instance was found
        """
        for instance_data in hosts:
            mig_source_node = instance_data['compute_node']
            instances_of_src = self.compute_model.get_node_instances(
                mig_source_node)
            for instance in instances_of_src:
                try:
                    # NOTE: skip exclude instance when migrating
                    if instance.watcher_exclude:
                        LOG.debug("Instance is excluded by scope, "
                                  "skipped: %s", instance.uuid)
                        continue
                    # select the first active instance to migrate
                    if (instance.state !=
                            element.InstanceState.ACTIVE.value):
                        LOG.info("Instance not active, skipped: %s",
                                 instance.uuid)
                        continue
                    return mig_source_node, instance
                except exception.InstanceNotFound as e:
                    LOG.exception(e)
                    LOG.info("Instance not found")

        return None

    def filter_dest_servers(self, hosts, instance_to_migrate):
        """Only return hosts with sufficient available resources

        :param hosts: list of dicts holding a ``compute_node`` key
        :param instance_to_migrate: instance whose ``vcpus``, ``disk`` and
                                    ``memory`` have to fit on the host
        :returns: the items of ``hosts`` whose free ``vcpu``, ``disk`` and
                  ``memory`` are all at least what the instance requires,
                  in their original order
        """
        required_cores = instance_to_migrate.vcpus
        required_disk = instance_to_migrate.disk
        required_memory = instance_to_migrate.memory

        # filter nodes without enough resource
        dest_servers = []
        for instance_data in hosts:
            host = instance_data['compute_node']
            # available
            free_res = self.compute_model.get_node_free_resources(host)
            if (free_res['vcpu'] >= required_cores and
                    free_res['disk'] >= required_disk and
                    free_res['memory'] >= required_memory):
                dest_servers.append(instance_data)

        return dest_servers

    def pre_execute(self):
        """Run the common pre-execution step and read the threshold."""
        self._pre_execute()
        # the migration plan will be triggered when the outlet temperature
        # reaches threshold
        self.threshold = self.input_parameters.threshold
        # %s rather than %d: the threshold is a "number" in the schema and
        # %d would log a fractional value truncated.
        LOG.info("Outlet temperature strategy threshold=%s",
                 self.threshold)

    def do_execute(self, audit=None):
        """Plan at most one live migration away from the hottest host.

        :param audit: not used by this strategy
        :returns: ``self.solution``; a migrate action is added to it only
                  when a source instance and a destination host were found
                  and the compute model accepted the migration
        """
        hosts_need_release, hosts_target = self.group_hosts_by_outlet_temp()

        if not hosts_need_release:
            # TODO(zhenzanz): return something right if there's no hot servers
            LOG.debug("No hosts require optimization")
            return self.solution

        if not hosts_target:
            LOG.warning("No hosts under outlet temp threshold found")
            return self.solution

        # look at the servers with the highest outlet temperature first
        hosts_need_release = sorted(hosts_need_release,
                                    reverse=True,
                                    key=lambda x: (x["outlet_temp"]))

        instance_to_migrate = self.choose_instance_to_migrate(
            hosts_need_release)
        if instance_to_migrate is None:
            return self.solution

        mig_source_node, instance_src = instance_to_migrate
        # keep only the targets able to hold the instance's cpu cores,
        # memory and disk needs
        dest_servers = self.filter_dest_servers(hosts_target, instance_src)
        if not dest_servers:
            # TODO(zhenzanz): maybe to warn that there's no resource
            # for instance.
            LOG.info("No proper target host could be found")
            return self.solution

        # always use the host with the lowest outlet temperature; min()
        # returns the first minimum, like a stable sort followed by [0]
        coolest = min(dest_servers, key=lambda x: (x["outlet_temp"]))
        mig_destination_node = coolest['compute_node']
        # generate solution to migrate the instance to the dest server
        if self.compute_model.migrate_instance(
                instance_src, mig_source_node, mig_destination_node):
            parameters = {'migration_type': 'live',
                          'source_node': mig_source_node.uuid,
                          'destination_node': mig_destination_node.uuid,
                          'resource_name': instance_src.name}
            self.solution.add_action(action_type=self.MIGRATION,
                                     resource_id=instance_src.uuid,
                                     input_parameters=parameters)

        # return the solution on this path as well, like every early exit
        return self.solution

    def post_execute(self):
        """Attach the compute model to the solution and log the model."""
        self.solution.model = self.compute_model
        # TODO(v-francoise): Add the indicators to the solution

        LOG.debug(self.compute_model.to_string())