Coverage for watcher/decision_engine/strategy/strategies/outlet_temp_control.py: 80%
106 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-17 12:22 +0000
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-17 12:22 +0000
1# -*- encoding: utf-8 -*-
2# Copyright (c) 2015 Intel Corp
3#
4# Authors: Junjie-Huang <junjie.huang@intel.com>
5#
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15# implied.
16# See the License for the specific language governing permissions and
17# limitations under the License.
18#
"""
*Good Thermal Strategy*

As infrastructure becomes software defined, power and thermal
intelligence is being adopted to optimize workload placement, which
helps improve efficiency, reduce power consumption, improve datacenter
PUE and lower operating costs in the data center.
Outlet (Exhaust Air) Temperature is one of the important thermal
telemetries used to measure the thermal/workload status of a server.

This strategy decides to migrate workloads to the hosts with good
thermal condition (lowest outlet temperature) when the outlet temperature
of a source host reaches a configurable threshold.
"""
35from oslo_log import log
37from watcher._i18n import _
38from watcher.common import exception
39from watcher.decision_engine.model import element
40from watcher.decision_engine.strategy.strategies import base
43LOG = log.getLogger(__name__)
class OutletTempControl(base.ThermalOptimizationBaseStrategy):
    """[PoC] Outlet temperature control using live migration

    *Description*

    It is a migration strategy based on the outlet temperature of compute
    hosts. It generates solutions to move a workload whenever a server's
    outlet temperature reaches or exceeds the specified threshold.

    *Requirements*

    * Hardware: All compute hosts should support IPMI and PTAS technology
    * Software: Ceilometer component ceilometer-agent-ipmi running
      in each compute host, and Ceilometer API can report such telemetry
      ``hardware.ipmi.node.outlet_temperature`` successfully.
    * You must have at least 2 physical compute hosts to run this strategy.

    *Limitations*

    - This is a proof of concept that is not meant to be used in production
    - We cannot forecast how many servers should be migrated. This is the
      reason why we only plan a single virtual machine migration at a time.
      So it's better to use this algorithm with `CONTINUOUS` audits.
    - It assumes that live migrations are possible

    *Spec URL*

    https://github.com/openstack/watcher-specs/blob/master/specs/mitaka/implemented/outlet-temperature-based-strategy.rst
    """

    # Action type recorded in the solution for every planned migration
    MIGRATION = "migrate"

    # Datasource metric name(s) declared by this strategy; the same name is
    # used as the meter queried in group_hosts_by_outlet_temp()
    DATASOURCE_METRICS = ['host_outlet_temp']

    def __init__(self, config, osc=None):
        """Outlet temperature control using live migration

        :param config: A mapping containing the configuration of this strategy
        :type config: dict
        :param osc: an OpenStackClients object, defaults to None
        :type osc: :py:class:`~.OpenStackClients` instance, optional
        """
        super(OutletTempControl, self).__init__(config, osc)

    @classmethod
    def get_name(cls):
        """Return the identifier of this strategy."""
        return "outlet_temperature"

    @classmethod
    def get_display_name(cls):
        """Return the translated, human readable name of this strategy."""
        return _("Outlet temperature based strategy")

    @classmethod
    def get_translatable_display_name(cls):
        """Return the untranslated, human readable name of this strategy."""
        return "Outlet temperature based strategy"

    @property
    def period(self):
        """Aggregation period read from the input parameters (default 30)."""
        return self.input_parameters.get('period', 30)

    @classmethod
    def get_schema(cls):
        """Return the schema describing the strategy input parameters."""
        # Mandatory default setting for each element
        return {
            "properties": {
                "threshold": {
                    "description": "temperature threshold for migration",
                    "type": "number",
                    "default": 35.0
                },
                "period": {
                    "description": "The time interval in seconds for "
                                   "getting statistic aggregation",
                    "type": "number",
                    "default": 30
                },
                "granularity": {
                    "description": "The time between two measures in an "
                                   "aggregated timeseries of a metric.",
                    "type": "number",
                    "default": 300
                },
            },
        }

    @property
    def granularity(self):
        """Granularity read from the input parameters (default 300)."""
        return self.input_parameters.get('granularity', 300)

    def get_available_compute_nodes(self):
        """Return the compute nodes that are both online and enabled.

        :return: the entries of ``compute_model.get_all_compute_nodes()``
                 whose state is ONLINE and whose status is ENABLED
        :rtype: dict
        """
        default_node_scope = [element.ServiceState.ENABLED.value]
        return {uuid: cn for uuid, cn in
                self.compute_model.get_all_compute_nodes().items()
                if cn.state == element.ServiceState.ONLINE.value and
                cn.status in default_node_scope}

    def group_hosts_by_outlet_temp(self):
        """Group hosts based on outlet temp meters

        Reads ``self.threshold``, which is assigned in :meth:`pre_execute`.

        :return: a ``(hosts_need_release, hosts_target)`` tuple; each item
                 is a list of dicts with the keys ``compute_node`` and
                 ``outlet_temp``. Hosts at or above the threshold go to the
                 first list, the remaining ones to the second.
        :rtype: tuple
        """
        nodes = self.get_available_compute_nodes()
        hosts_need_release = []
        hosts_target = []
        metric_name = 'host_outlet_temp'
        for node in nodes.values():
            outlet_temp = self.datasource_backend.statistic_aggregation(
                resource=node,
                resource_type='compute_node',
                meter_name=metric_name,
                period=self.period,
                granularity=self.granularity,
            )

            # some hosts may not have outlet temp meters, remove from target
            if outlet_temp is None:
                LOG.warning("%s: no outlet temp data", node.uuid)
                continue

            LOG.debug("%(resource)s: outlet temperature %(temp)f",
                      {'resource': node.uuid, 'temp': outlet_temp})
            instance_data = {'compute_node': node, 'outlet_temp': outlet_temp}
            if outlet_temp >= self.threshold:
                # mark the node to release resources
                hosts_need_release.append(instance_data)
            else:
                hosts_target.append(instance_data)
        return hosts_need_release, hosts_target

    def choose_instance_to_migrate(self, hosts):
        """Pick up an active instance to migrate from provided hosts

        Hosts are visited in the given order and the first instance that is
        neither excluded nor inactive is selected.

        :param hosts: list of dicts holding a ``compute_node`` entry
        :return: a ``(source_node, instance)`` tuple, or None when no
                 suitable instance exists on any of the hosts
        """
        for instance_data in hosts:
            mig_source_node = instance_data['compute_node']
            instances_of_src = self.compute_model.get_node_instances(
                mig_source_node)
            for instance in instances_of_src:
                try:
                    # NOTE: skip exclude instance when migrating
                    if instance.watcher_exclude:
                        LOG.debug("Instance is excluded by scope, "
                                  "skipped: %s", instance.uuid)
                        continue
                    # select the first active instance to migrate
                    if (instance.state !=
                            element.InstanceState.ACTIVE.value):
                        LOG.info("Instance not active, skipped: %s",
                                 instance.uuid)
                        continue
                    return mig_source_node, instance
                except exception.InstanceNotFound as e:
                    # best effort: log and move on to the next instance
                    LOG.exception(e)
                    LOG.info("Instance not found")

        return None

    def filter_dest_servers(self, hosts, instance_to_migrate):
        """Only return hosts with sufficient available resources

        :param hosts: list of dicts holding a ``compute_node`` entry
        :param instance_to_migrate: instance whose ``vcpus``, ``disk`` and
                                    ``memory`` must fit on the destination
        :return: the entries of ``hosts`` (original order kept) whose free
                 vcpu, disk and memory all cover the instance needs
        :rtype: list
        """
        required_cores = instance_to_migrate.vcpus
        required_disk = instance_to_migrate.disk
        required_memory = instance_to_migrate.memory

        # filter nodes without enough resource
        dest_servers = []
        for instance_data in hosts:
            host = instance_data['compute_node']
            # available
            free_res = self.compute_model.get_node_free_resources(host)
            if (free_res['vcpu'] >= required_cores and
                    free_res['disk'] >= required_disk and
                    free_res['memory'] >= required_memory):
                dest_servers.append(instance_data)

        return dest_servers

    def pre_execute(self):
        """Run the common pre-execution step and load the threshold."""
        self._pre_execute()
        # the migration plan will be triggered when the outlet temperature
        # reaches threshold
        self.threshold = self.input_parameters.threshold
        # %s instead of %d: the threshold is a float and %d would log it
        # truncated (e.g. 34.3 reported as 34)
        LOG.info("Outlet temperature strategy threshold=%s",
                 self.threshold)

    def do_execute(self, audit=None):
        """Plan at most one live migration away from the hottest host.

        :param audit: not used by this strategy
        :return: ``self.solution`` on the early exits (nothing to optimize,
                 no target host, no candidate instance, no host with enough
                 resources); None once the migration step has been reached
        """
        hosts_need_release, hosts_target = self.group_hosts_by_outlet_temp()

        if not hosts_need_release:
            # TODO(zhenzanz): return something right if there's no hot servers
            LOG.debug("No hosts require optimization")
            return self.solution

        if not hosts_target:
            LOG.warning("No hosts under outlet temp threshold found")
            return self.solution

        # choose the server with highest outlet t
        hosts_need_release = sorted(hosts_need_release,
                                    reverse=True,
                                    key=lambda x: (x["outlet_temp"]))

        instance_to_migrate = self.choose_instance_to_migrate(
            hosts_need_release)
        # calculate the instance's cpu cores,memory,disk needs
        if instance_to_migrate is None:
            return self.solution

        mig_source_node, instance_src = instance_to_migrate
        dest_servers = self.filter_dest_servers(hosts_target, instance_src)
        if not dest_servers:
            # TODO(zhenzanz): maybe to warn that there's no resource
            # for instance.
            LOG.info("No proper target host could be found")
            return self.solution

        # always use the host with the lowest outlet temperature; min()
        # keeps the first of equally cold hosts, like a stable sort would
        coolest = min(dest_servers, key=lambda x: (x["outlet_temp"]))
        mig_destination_node = coolest['compute_node']
        # generate solution to migrate the instance to the dest server,
        if self.compute_model.migrate_instance(
                instance_src, mig_source_node, mig_destination_node):
            parameters = {'migration_type': 'live',
                          'source_node': mig_source_node.uuid,
                          'destination_node': mig_destination_node.uuid,
                          'resource_name': instance_src.name}
            self.solution.add_action(action_type=self.MIGRATION,
                                     resource_id=instance_src.uuid,
                                     input_parameters=parameters)

    def post_execute(self):
        """Attach the compute model to the solution and log the model."""
        self.solution.model = self.compute_model
        # TODO(v-francoise): Add the indicators to the solution

        LOG.debug(self.compute_model.to_string())