Coverage for watcher/decision_engine/strategy/strategies/noisy_neighbor.py: 83%
125 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-17 12:22 +0000
1# -*- encoding: utf-8 -*-
2# Copyright (c) 2017 Intel Corp
3#
4# Authors: Prudhvi Rao Shedimbi <prudhvi.rao.shedimbi@intel.com>
5#
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15# implied.
16# See the License for the specific language governing permissions and
17# limitations under the License.
18#
19from debtcollector import removals
20from oslo_config import cfg
21from oslo_log import log
22import warnings
24from watcher._i18n import _
25from watcher.decision_engine.strategy.strategies import base
# Module-level logger and the global oslo.config handle.
LOG = log.getLogger(__name__)
CONF = cfg.CONF

# Emit every occurrence of a warning instead of only the first one, so the
# DeprecationWarning raised by the @removals.removed_class decorator below is
# always surfaced.  NOTE(review): this changes the warning filter for the
# whole process, not just this module — confirm that is intended.
warnings.simplefilter('always')
@removals.removed_class("NoisyNeighbor", version="2025.2",
                        removal_version="2026.2")
class NoisyNeighbor(base.NoisyNeighborBaseStrategy):
    """Noisy Neighbor strategy using live migration

    *Description*

    This strategy can identify and migrate a Noisy Neighbor -
    a low priority VM that negatively affects performance of
    a high priority VM in terms of IPC by over utilizing
    Last Level Cache.

    *Requirements*

    To enable LLC metric, latest Intel server with CMT support is required.

    *Limitations*

    This is a proof of concept that is not meant to be used in production

    *Spec URL*

    http://specs.openstack.org/openstack/watcher-specs/specs/pike/implemented/noisy_neighbor_strategy.html
    """

    # Action type emitted into the solution for each chosen migration.
    MIGRATION = "migrate"

    # Metrics this strategy requires from the configured datasource.
    DATASOURCE_METRICS = ['instance_l3_cache_usage']

    # Priority assumed for instances without 'watcher-priority' metadata.
    DEFAULT_WATCHER_PRIORITY = 5

    def __init__(self, config, osc=None):
        """Initialize the strategy.

        :param config: strategy configuration (oslo.config group)
        :param osc: an OpenStack client helper, or None to build one lazily
        """
        super(NoisyNeighbor, self).__init__(config, osc)

        self.meter_name = 'instance_l3_cache_usage'

    @classmethod
    def get_name(cls):
        """Return the unique strategy entry-point name."""
        return "noisy_neighbor"

    @classmethod
    def get_display_name(cls):
        """Return the translated human-readable strategy name."""
        return _("Noisy Neighbor")

    @classmethod
    def get_translatable_display_name(cls):
        """Return the untranslated display name (translation key)."""
        return "Noisy Neighbor"

    @classmethod
    def get_schema(cls):
        """Return the JSON schema of the strategy's input parameters."""
        # Mandatory default setting for each element
        return {
            "properties": {
                "cache_threshold": {
                    "description": "Performance drop in L3_cache threshold "
                                   "for migration",
                    "type": "number",
                    "default": 35.0
                },
                "period": {
                    "description": "Aggregate time period of "
                                   "ceilometer and gnocchi",
                    "type": "number",
                    "default": 100.0
                },
            },
        }

    def get_current_and_previous_cache(self, instance):
        """Fetch current and previous L3 cache usage for an instance.

        The "previous" value is extrapolated: given the mean over the last
        period (curr) and the mean over the last two periods (m2), the mean
        of the earlier period is ``2 * m2 - curr``.

        :param instance: instance model element to query
        :return: (current, previous) tuple, or (None, None) on datasource
                 failure
        """
        try:
            curr_cache = self.datasource_backend.get_instance_l3_cache_usage(
                instance, self.meter_name, self.period,
                'mean', granularity=300)
            previous_cache = 2 * (
                self.datasource_backend.get_instance_l3_cache_usage(
                    instance, self.meter_name, 2 * self.period,
                    'mean', granularity=300)) - curr_cache

        except Exception as exc:
            # Best-effort: a failing datasource only skips this instance.
            LOG.exception(exc)
            return None, None

        return curr_cache, previous_cache

    def find_priority_instance(self, instance):
        """Return the instance if its L3 cache usage dropped past threshold.

        A priority instance is one whose current cache usage fell below
        ``(1 - cache_threshold/100) * previous`` — i.e. it is suffering.

        :param instance: candidate instance
        :return: the instance, or None if it is not suffering or metrics
                 are unavailable
        """
        current_cache, previous_cache = \
            self.get_current_and_previous_cache(instance)

        if None in (current_cache, previous_cache):
            LOG.warning("Datasource unable to pick L3 Cache "
                        "values. Skipping the instance")
            return None

        if (current_cache < (1 - (self.cache_threshold / 100.0)) *
                previous_cache):
            return instance
        else:
            return None

    def find_noisy_instance(self, instance):
        """Return the instance if its L3 cache usage grew past threshold.

        A noisy instance is one whose current cache usage rose above
        ``(1 + cache_threshold/100) * previous`` — i.e. it is hogging LLC.

        :param instance: candidate instance
        :return: the instance, or None if it is not noisy or metrics are
                 unavailable
        """
        noisy_current_cache, noisy_previous_cache = \
            self.get_current_and_previous_cache(instance)

        if None in (noisy_current_cache, noisy_previous_cache):
            LOG.warning("Datasource unable to pick "
                        "L3 Cache. Skipping the instance")
            return None

        if (noisy_current_cache > (1 + (self.cache_threshold / 100.0)) *
                noisy_previous_cache):
            return instance
        else:
            return None

    def group_hosts(self):
        """Partition compute nodes into sources and migration targets.

        For each node hosting more than one instance, look for a suffering
        high-priority instance and, scanning from the low-priority end, a
        noisy instance on the same node.

        :return: tuple ``(hosts_need_release, hosts_target)`` where the
                 first maps node uuid -> {'priority_vm', 'noisy_vm'} and
                 the second lists nodes usable as migration destinations
        """
        nodes = self.compute_model.get_all_compute_nodes()
        hosts_need_release = {}
        hosts_target = []

        for node in nodes.values():
            instances_of_node = self.compute_model.get_node_instances(node)
            node_instance_count = len(instances_of_node)

            # Flag that tells us whether to skip the node or not. If True,
            # the node is skipped. Will be true if we find a noisy instance or
            # when potential priority instance will be same as potential noisy
            # instance
            loop_break_flag = False

            if node_instance_count > 1:

                instance_priority_list = list(instances_of_node)

                # If there is no metadata regarding watcher-priority, it takes
                # DEFAULT_WATCHER_PRIORITY as priority.
                # BUGFIX: the previous key built the tuple
                # (metadata_value_or_None, DEFAULT_WATCHER_PRIORITY), which
                # never applied the default and made instances without the
                # metadata sort on None (a TypeError against real values on
                # Python 3).  Pass the default to dict.get instead.
                # NOTE(review): metadata values are presumably strings while
                # the default is an int — confirm callers set comparable
                # values.
                instance_priority_list.sort(key=lambda a: a.get(
                    'metadata').get('watcher-priority',
                                    self.DEFAULT_WATCHER_PRIORITY))

                # Low-priority instances first when scanning for noise.
                instance_priority_list_reverse = list(
                    reversed(instance_priority_list))

                for potential_priority_instance in instance_priority_list:

                    priority_instance = self.find_priority_instance(
                        potential_priority_instance)

                    if priority_instance is not None:

                        for potential_noisy_instance in (
                                instance_priority_list_reverse):
                            # The same instance cannot be both suffering
                            # and noisy; the lists have met in the middle.
                            if (potential_noisy_instance ==
                                    potential_priority_instance):
                                loop_break_flag = True
                                break

                            noisy_instance = self.find_noisy_instance(
                                potential_noisy_instance)

                            if noisy_instance is not None:
                                hosts_need_release[node.uuid] = {
                                    'priority_vm': potential_priority_instance,
                                    'noisy_vm': potential_noisy_instance}
                                LOG.debug("Priority VM found: %s",
                                          potential_priority_instance.uuid)
                                LOG.debug("Noisy VM found: %s",
                                          potential_noisy_instance.uuid)
                                loop_break_flag = True
                                break

                    # No need to check other instances in the node
                    if loop_break_flag is True:
                        break

            if node.uuid not in hosts_need_release:
                hosts_target.append(node)

        return hosts_need_release, hosts_target

    def filter_dest_servers(self, hosts, instance_to_migrate):
        """Return the hosts with enough free resources for the instance.

        :param hosts: candidate destination nodes
        :param instance_to_migrate: instance whose vcpus/disk/memory must fit
        :return: list of nodes that can accommodate the instance
        """
        required_cores = instance_to_migrate.vcpus
        required_disk = instance_to_migrate.disk
        required_memory = instance_to_migrate.memory

        dest_servers = []
        for host in hosts:
            free_res = self.compute_model.get_node_free_resources(host)
            if (free_res['vcpu'] >= required_cores and free_res['disk'] >=
                    required_disk and free_res['memory'] >= required_memory):
                dest_servers.append(host)

        return dest_servers

    def pre_execute(self):
        """Run the common pre-execution checks of the base strategy."""
        self._pre_execute()

    def do_execute(self, audit=None):
        """Build the solution: at most one live migration of a noisy VM.

        :param audit: the audit being executed (unused here)
        """
        self.cache_threshold = self.input_parameters.cache_threshold
        self.period = self.input_parameters.period

        hosts_need_release, hosts_target = self.group_hosts()

        if len(hosts_need_release) == 0:
            LOG.debug("No hosts require optimization")
            return

        if len(hosts_target) == 0:
            LOG.debug("No hosts available to migrate")
            return

        # NOTE(review): the key compares instance model objects directly —
        # confirm they define a meaningful ordering.
        mig_source_node_name = max(hosts_need_release.keys(), key=lambda a:
                                   hosts_need_release[a]['priority_vm'])
        instance_to_migrate = hosts_need_release[mig_source_node_name][
            'noisy_vm']

        if instance_to_migrate is None:
            return

        dest_servers = self.filter_dest_servers(hosts_target,
                                                instance_to_migrate)

        if len(dest_servers) == 0:
            LOG.info("No proper target host could be found")
            return

        # Destination node will be the first available node in the list.
        mig_destination_node = dest_servers[0]
        mig_source_node = self.compute_model.get_node_by_uuid(
            mig_source_node_name)

        if self.compute_model.migrate_instance(instance_to_migrate,
                                               mig_source_node,
                                               mig_destination_node):
            parameters = {'migration_type': 'live',
                          'source_node': mig_source_node.uuid,
                          'destination_node': mig_destination_node.uuid,
                          'resource_name': instance_to_migrate.name}
            self.solution.add_action(action_type=self.MIGRATION,
                                     resource_id=instance_to_migrate.uuid,
                                     input_parameters=parameters)

    def post_execute(self):
        """Attach the compute model to the solution and log it."""
        self.solution.model = self.compute_model

        LOG.debug(self.compute_model.to_string())