File 0001-More-accurate-agent-restart-state-transfer.patch of Package openstack-neutron
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/agent/rpc.py neutron-9.4.2.dev21/neutron/agent/rpc.py
*** neutron-9.4.2.dev21-backup/neutron/agent/rpc.py 2019-07-23 09:28:37.651174881 -0700
--- neutron-9.4.2.dev21/neutron/agent/rpc.py 2019-07-23 09:36:23.133973449 -0700
***************
*** 173,184 ****
return {'devices': succeeded_devices, 'failed_devices': failed_devices}
def update_device_list(self, context, devices_up, devices_down,
! agent_id, host):
try:
cctxt = self.client.prepare(version='1.5')
res = cctxt.call(context, 'update_device_list',
devices_up=devices_up, devices_down=devices_down,
! agent_id=agent_id, host=host)
except oslo_messaging.UnsupportedVersion:
#TODO(rossella_s): Remove this failback logic in M
dev_up = self._device_list_rpc_call_with_failed_dev(
--- 173,185 ----
return {'devices': succeeded_devices, 'failed_devices': failed_devices}
def update_device_list(self, context, devices_up, devices_down,
! agent_id, host, agent_restarted=False):
try:
cctxt = self.client.prepare(version='1.5')
res = cctxt.call(context, 'update_device_list',
devices_up=devices_up, devices_down=devices_down,
! agent_id=agent_id, host=host,
! agent_restarted=agent_restarted)
except oslo_messaging.UnsupportedVersion:
#TODO(rossella_s): Remove this failback logic in M
dev_up = self._device_list_rpc_call_with_failed_dev(
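The hunk above only threads a new keyword through the RPC client. Below is a minimal,
self-contained sketch of the contract it establishes; FakeRPC is a hypothetical stub
standing in for the oslo.messaging client, and only the keyword's name and its False
default come from the patch itself:

    # Hedged sketch: FakeRPC is not a neutron or oslo.messaging API.
    class FakeRPC(object):
        def call(self, method, **kwargs):
            return kwargs  # echo what would go on the wire

    def update_device_list(rpc, devices_up, devices_down, agent_id, host,
                           agent_restarted=False):
        # New keyword with a False default: pre-patch call sites keep
        # working unchanged, and the flag always appears in the payload.
        return rpc.call('update_device_list',
                        devices_up=devices_up, devices_down=devices_down,
                        agent_id=agent_id, host=host,
                        agent_restarted=agent_restarted)

    payload = update_device_list(FakeRPC(), ['p1'], [], 'agent-1', 'host-1')
    assert payload['agent_restarted'] is False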
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/plugins/ml2/drivers/l2pop/mech_driver.py neutron-9.4.2.dev21/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
*** neutron-9.4.2.dev21-backup/neutron/plugins/ml2/drivers/l2pop/mech_driver.py 2019-07-23 09:28:37.651174881 -0700
--- neutron-9.4.2.dev21/neutron/plugins/ml2/drivers/l2pop/mech_driver.py 2019-07-23 09:36:23.133973449 -0700
***************
*** 234,239 ****
--- 234,252 ----
return agents
+ def agent_restarted(self, context):
+ agent_host = context.host
+ session = db_api.get_session()
+ agent = l2pop_db.get_agent_by_host(session, agent_host)
+ if l2pop_db.get_agent_uptime(agent) < cfg.CONF.l2pop.agent_boot_time:
+ LOG.warning(_LW("Agent on host '%s' did not supply "
+ "'agent_restarted'information in RPC message, "
+ "determined it restarted based on deprecated "
+ "'agent_boot_time' config option."),
+ agent_host)
+ return True
+ return False
+
def update_port_down(self, context):
port = context.current
agent_host = context.host
***************
*** 251,257 ****
self.L2populationAgentNotify.remove_fdb_entries(
self.rpc_ctx, fdb_entries)
! def update_port_up(self, context):
port = context.current
agent_host = context.host
session = db_api.get_session()
--- 264,270 ----
self.L2populationAgentNotify.remove_fdb_entries(
self.rpc_ctx, fdb_entries)
! def update_port_up(self, context, agent_restarted=None):
port = context.current
agent_host = context.host
session = db_api.get_session()
***************
*** 277,284 ****
# with high concurrency more than 1 port may be activated on an agent
# at the same time (like VM port + a DVR port) so checking for 1 or 2
is_first_port = agent_active_ports in (1, 2)
! if is_first_port or (l2pop_db.get_agent_uptime(agent) <
! cfg.CONF.l2pop.agent_boot_time):
# First port(s) activated on current agent in this network,
# we have to provide it with the whole list of fdb entries
agent_fdb_entries = self._create_agent_fdb(session,
--- 290,299 ----
# with high concurrency more than 1 port may be activated on an agent
# at the same time (like VM port + a DVR port) so checking for 1 or 2
is_first_port = agent_active_ports in (1, 2)
! if agent_restarted is None:
! # Only for backward compatibility; will be removed.
! agent_restarted = self.agent_restarted(context)
! if is_first_port or agent_restarted:
# First port(s) activated on current agent in this network,
# we have to provide it with the whole list of fdb entries
agent_fdb_entries = self._create_agent_fdb(session,
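The decision in update_port_up now has three inputs: first-port detection, the explicit
flag from the agent, and the deprecated uptime heuristic as a fallback. A runnable sketch
of that logic under stated assumptions: agent_uptime and boot_time are plain numbers
standing in for l2pop_db.get_agent_uptime() and cfg.CONF.l2pop.agent_boot_time:

    def needs_full_fdb(agent_active_ports, agent_restarted,
                       agent_uptime, boot_time):
        # A VM port and a DVR port can come up together, hence (1, 2).
        is_first_port = agent_active_ports in (1, 2)
        if agent_restarted is None:
            # Old agents never send the flag; fall back to the deprecated
            # uptime heuristic, just as agent_restarted() does above.
            agent_restarted = agent_uptime < boot_time
        return is_first_port or agent_restarted

    assert needs_full_fdb(1, None, 300, 180) is True   # first port wins
    assert needs_full_fdb(5, True, 300, 180) is True   # explicit restart
    assert needs_full_fdb(5, None, 60, 180) is True    # uptime heuristic
    assert needs_full_fdb(5, None, 300, 180) is False  # steady state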
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py neutron-9.4.2.dev21/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
*** neutron-9.4.2.dev21-backup/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py 2019-07-23 09:28:37.651174881 -0700
--- neutron-9.4.2.dev21/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py 2019-07-23 09:36:23.133973449 -0700
***************
*** 896,904 ****
LOG.debug("Setting status for %s to DOWN", device)
devices_down.append(device)
if devices_up or devices_down:
devices_set = self.plugin_rpc.update_device_list(
self.context, devices_up, devices_down, self.agent_id,
! self.conf.host)
failed_devices = (devices_set.get('failed_devices_up') +
devices_set.get('failed_devices_down'))
if failed_devices:
--- 896,910 ----
LOG.debug("Setting status for %s to DOWN", device)
devices_down.append(device)
if devices_up or devices_down:
+ # When iter_num == 0, the ovs-agent is doing its initialization
+ # work. L2 pop needs this precise knowledge to notify the agent to
+ # refresh the tunnel-related flows. Otherwise, those flows would be
+ # cleaned up as stale because they carry a different cookie id.
+ agent_restarted = self.iter_num == 0
devices_set = self.plugin_rpc.update_device_list(
self.context, devices_up, devices_down, self.agent_id,
! self.conf.host, agent_restarted=agent_restarted)
failed_devices = (devices_set.get('failed_devices_up') +
devices_set.get('failed_devices_down'))
if failed_devices:
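A small sketch of the restart signal this hunk derives; AgentLoop is a hypothetical
stand-in for the OVS agent object, and only the iter_num == 0 test mirrors the patch:

    class AgentLoop(object):
        def __init__(self):
            self.iter_num = 0

        def restarted(self):
            # True only during the first rpc_loop iteration after a
            # (re)start; no clock or config timeout involved, unlike
            # the old agent_boot_time check.
            return self.iter_num == 0

    loop = AgentLoop()
    assert loop.restarted() is True
    loop.iter_num += 1
    assert loop.restarted() is False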
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/plugins/ml2/rpc.py neutron-9.4.2.dev21/neutron/plugins/ml2/rpc.py
*** neutron-9.4.2.dev21-backup/neutron/plugins/ml2/rpc.py 2019-07-23 09:28:37.655174905 -0700
--- neutron-9.4.2.dev21/neutron/plugins/ml2/rpc.py 2019-07-23 09:37:05.894230531 -0700
***************
*** 240,245 ****
--- 240,246 ----
agent_id = kwargs.get('agent_id')
device = kwargs.get('device')
host = kwargs.get('host')
+ agent_restarted = kwargs.pop('agent_restarted', None)
LOG.debug("Device %(device)s up at agent %(agent_id)s",
{'device': device, 'agent_id': agent_id})
plugin = manager.NeutronManager.get_plugin()
***************
*** 265,271 ****
else:
self.update_port_status_to_active(port, rpc_context, port_id, host)
self.notify_l2pop_port_wiring(port_id, rpc_context,
! n_const.PORT_STATUS_ACTIVE, host)
def update_port_status_to_active(self, port, rpc_context, port_id, host):
plugin = manager.NeutronManager.get_plugin()
--- 266,273 ----
else:
self.update_port_status_to_active(port, rpc_context, port_id, host)
self.notify_l2pop_port_wiring(port_id, rpc_context,
! n_const.PORT_STATUS_ACTIVE, host,
! agent_restarted)
def update_port_status_to_active(self, port, rpc_context, port_id, host):
plugin = manager.NeutronManager.get_plugin()
***************
*** 289,295 ****
provisioning_blocks.L2_AGENT_ENTITY)
def notify_l2pop_port_wiring(self, port_id, rpc_context,
! status, host):
"""Notify the L2pop driver that a port has been wired/unwired.
The L2pop driver uses this notification to broadcast forwarding
--- 291,297 ----
provisioning_blocks.L2_AGENT_ENTITY)
def notify_l2pop_port_wiring(self, port_id, rpc_context,
! status, host, agent_restarted=None):
"""Notify the L2pop driver that a port has been wired/unwired.
The L2pop driver uses this notification to broadcast forwarding
***************
*** 300,324 ****
'l2population')
if not l2pop_driver:
return
port_context = plugin.get_bound_port_context(
! rpc_context, port_id)
if not port_context:
# port deleted
return
port = port_context.current
! if (status == n_const.PORT_STATUS_ACTIVE and
port[portbindings.HOST_ID] != host and
! not l3_hamode_db.is_ha_router_port(rpc_context,
! port['device_owner'],
port['device_id'])):
# don't setup ACTIVE forwarding entries unless bound to this
! # host or if it's an HA port (which is special-cased in the
! # mech driver)
return
port_context.current['status'] = status
port_context.current[portbindings.HOST_ID] = host
if status == n_const.PORT_STATUS_ACTIVE:
! l2pop_driver.obj.update_port_up(port_context)
else:
l2pop_driver.obj.update_port_down(port_context)
--- 302,338 ----
'l2population')
if not l2pop_driver:
return
+ port = ml2_db.get_port(rpc_context.session, port_id)
+ if not port:
+ return
port_context = plugin.get_bound_port_context(
! rpc_context, port_id, host)
if not port_context:
# port deleted
return
+ # NOTE: DVR ports are already handled and updated through l2pop,
+ # so we don't need to update them again here. However, l2pop does
+ # not handle DVR ports during a neutron-*-agent restart, so we
+ # need to handle them here.
+ if agent_restarted is None:
+ agent_restarted = l2pop_driver.obj.agent_restarted(port_context)
+ if (port['device_owner'] == n_const.DEVICE_OWNER_DVR_INTERFACE and
+ not agent_restarted):
+ return
port = port_context.current
! if (port['device_owner'] != n_const.DEVICE_OWNER_DVR_INTERFACE and
! status == n_const.PORT_STATUS_ACTIVE and
port[portbindings.HOST_ID] != host and
! not l3_hamode_db.is_ha_router_port(port['device_owner'],
port['device_id'])):
# don't setup ACTIVE forwarding entries unless bound to this
! # host or if it's an HA or DVR port (which is special-cased in
! # the mech driver)
return
port_context.current['status'] = status
port_context.current[portbindings.HOST_ID] = host
if status == n_const.PORT_STATUS_ACTIVE:
! l2pop_driver.obj.update_port_up(port_context, agent_restarted)
else:
l2pop_driver.obj.update_port_down(port_context)
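The pop() with a None default in update_device_up is what lets the server distinguish
an old agent (flag never sent) from a new one that explicitly sent False: only None
triggers the deprecated uptime fallback in the l2pop driver. A hedged sketch of that
tri-state contract, with an illustrative helper that is not part of neutron:

    def read_restart_flag(**kwargs):
        # None means "old agent, flag absent"; True/False means the
        # agent stated its restart status explicitly.
        agent_restarted = kwargs.pop('agent_restarted', None)
        if agent_restarted is None:
            return 'fall back to agent_boot_time heuristic'
        return 'trust the agent: restarted=%s' % agent_restarted

    assert read_restart_flag(device='d1') == \
        'fall back to agent_boot_time heuristic'
    assert read_restart_flag(device='d1', agent_restarted=True) == \
        'trust the agent: restarted=True'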
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/tests/functional/agent/l2/base.py neutron-9.4.2.dev21/neutron/tests/functional/agent/l2/base.py
*** neutron-9.4.2.dev21-backup/neutron/tests/functional/agent/l2/base.py 2019-07-23 09:28:37.655174905 -0700
--- neutron-9.4.2.dev21/neutron/tests/functional/agent/l2/base.py 2019-07-23 09:36:23.137973472 -0700
***************
*** 269,275 ****
return ports
def _mock_update_device(self, context, devices_up, devices_down, agent_id,
! host=None):
dev_up = []
dev_down = []
for port in self.ports:
--- 269,275 ----
return ports
def _mock_update_device(self, context, devices_up, devices_down, agent_id,
! host=None, agent_restarted=False):
dev_up = []
dev_down = []
for port in self.ports:
***************
*** 313,319 ****
def _prepare_failed_dev_up_trigger(self, agent):
def mock_failed_devices_up(context, devices_up, devices_down,
! agent_id, host=None):
failed_devices = []
devices = list(devices_up)
# first port fails
--- 313,320 ----
def _prepare_failed_dev_up_trigger(self, agent):
def mock_failed_devices_up(context, devices_up, devices_down,
! agent_id, host=None,
! agent_restarted=False):
failed_devices = []
devices = list(devices_up)
# first port fails
***************
*** 334,340 ****
def _prepare_failed_dev_down_trigger(self, agent):
def mock_failed_devices_down(context, devices_up, devices_down,
! agent_id, host=None):
# first port fails
failed_port_id = self.ports[0]['id']
failed_devices_down = []
--- 335,342 ----
def _prepare_failed_dev_down_trigger(self, agent):
def mock_failed_devices_down(context, devices_up, devices_down,
! agent_id, host=None,
! agent_restarted=False):
# first port fails
failed_port_id = self.ports[0]['id']
failed_devices_down = []
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
*** neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py 2019-07-23 09:28:37.655174905 -0700
--- neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py 2019-07-23 09:36:23.137973472 -0700
***************
*** 343,348 ****
--- 343,391 ----
self.mock_fanout.assert_called_with(
mock.ANY, 'remove_fdb_entries', expected)
+ def _test_ovs_agent_restarted_with_dvr_port(
+ self, agent_boot_timeout=True, agent_restarted=False):
+ plugin = directory.get_plugin()
+ self._setup_l3()
+ router = self._create_router(distributed=True)
+ with mock.patch.object(l2pop_mech_driver.L2populationMechanismDriver,
+ 'agent_restarted',
+ return_value=agent_boot_timeout):
+ with self.subnet(network=self._network,
+ enable_dhcp=False) as snet:
+ with self.port(
+ subnet=snet,
+ device_owner=constants.DEVICE_OWNER_DVR_INTERFACE)\
+ as port:
+ port_id = port['port']['id']
+ plugin.update_distributed_port_binding(self.adminContext,
+ port_id, {'port': {portbindings.HOST_ID: HOST_4,
+ 'device_id': router['id']}})
+ port = self._show('ports', port_id)
+ self.assertEqual(portbindings.VIF_TYPE_DISTRIBUTED,
+ port['port'][portbindings.VIF_TYPE])
+ self.callbacks.update_device_up(
+ self.adminContext,
+ agent_id=HOST_4,
+ device=port_id,
+ host=HOST_4,
+ agent_restarted=agent_restarted)
+ fanout_expected = {port['port']['network_id']: {
+ 'network_type': u'vxlan',
+ 'ports': {
+ u'20.0.0.4': [('00:00:00:00:00:00', '0.0.0.0')]},
+ 'segment_id': 1}}
+ self.mock_fanout.assert_called_with(mock.ANY,
+ 'add_fdb_entries',
+ fanout_expected)
+
+ def test_ovs_agent_restarted_with_dvr_port_boot_config_timeout(self):
+ self._test_ovs_agent_restarted_with_dvr_port()
+
+ def test_ovs_agent_restarted_with_dvr_port_rpc_send_timeout(self):
+ self._test_ovs_agent_restarted_with_dvr_port(
+ agent_boot_timeout=False, agent_restarted=True)
+
def test_ha_agents_get_other_fdb(self):
# First network port is added on HOST4, then HA router port is
# added on HOST and HOST2.
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
*** neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py 2019-07-23 09:28:37.655174905 -0700
--- neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py 2019-07-23 09:36:23.137973472 -0700
***************
*** 715,721 ****
self.agent._bind_devices(port_details)
update_devices.assert_called_once_with(mock.ANY, devices_up,
devices_down,
! mock.ANY, mock.ANY)
def _test_arp_spoofing(self, enable_prevent_arp_spoofing):
self.agent.prevent_arp_spoofing = enable_prevent_arp_spoofing
--- 715,722 ----
self.agent._bind_devices(port_details)
update_devices.assert_called_once_with(mock.ANY, devices_up,
devices_down,
! mock.ANY, mock.ANY,
! agent_restarted=True)
def _test_arp_spoofing(self, enable_prevent_arp_spoofing):
self.agent.prevent_arp_spoofing = enable_prevent_arp_spoofing
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/test_rpc.py neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/test_rpc.py
*** neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/test_rpc.py 2019-07-23 09:28:37.659174929 -0700
--- neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/test_rpc.py 2019-07-23 09:36:23.141973497 -0700
***************
*** 443,448 ****
--- 443,449 ----
devices_down=['fake_device3', 'fake_device4'],
agent_id='fake_agent_id',
host='fake_host',
+ agent_restarted=False,
version='1.5')
def test_update_device_list_unsupported(self):
diff -crB --new-file neutron-9.4.2.dev21-backup/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml neutron-9.4.2.dev21/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml
*** neutron-9.4.2.dev21-backup/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml 1969-12-31 16:00:00.000000000 -0800
--- neutron-9.4.2.dev21/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml 2019-07-23 09:36:23.141973497 -0700
***************
*** 0 ****
--- 1,27 ----
+ ---
+ critical:
+ - |
+ The neutron-openvswitch-agent can sometimes spend too much time handling
+ a large number of ports, exceeding its timeout value, ``agent_boot_time``,
+ for L2 population. Because of this, some flow update operations will not
+ be triggered, resulting in lost flows during agent restart, especially
+ host-to-host vxlan tunnel flows: the original tunnel flows are treated
+ as stale because they carry a different cookie ID, and the agent's
+ first RPC loop runs a stale-flow clean-up procedure that deletes them,
+ leading to a loss of connectivity.
+ Please ensure that all neutron-server and neutron-openvswitch-agent
+ binaries are upgraded for the changes to take effect, after which
+ the L2 population ``agent_boot_time`` config option will no longer
+ be used.
+ fixes:
+ - |
+ The neutron-openvswitch-agent was changed to notify the neutron-server
+ in its first RPC loop that it has restarted. This signals neutron-server
+ to provide updated L2 population information to correctly program FDB
+ entries, ensuring connectivity to instances is not interrupted.
+ This fixes the following bugs:
+ `1794991 <https://bugs.launchpad.net/neutron/+bug/1794991>`_,
+ `1799178 <https://bugs.launchpad.net/neutron/+bug/1799178>`_,
+ `1813703 <https://bugs.launchpad.net/neutron/+bug/1813703>`_,
+ `1813714 <https://bugs.launchpad.net/neutron/+bug/1813714>`_,
+ `1813715 <https://bugs.launchpad.net/neutron/+bug/1813715>`_.
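
A toy illustration of the stale-flow cleanup the note above describes; the dict-based
flow table is purely illustrative and not the OVS agent API, only the per-run cookie
idea comes from the note:

    import uuid

    flows = {('vxlan', 'tun-to-host2'): 'cookie-old'}  # survived a restart
    run_cookie = 'cookie-%s' % uuid.uuid4().hex[:8]    # fresh cookie per run

    # Without the agent_restarted notification nobody re-adds the tunnel
    # flow, so the stale-cleanup pass in the first RPC loop drops it:
    stale = [key for key, cookie in flows.items() if cookie != run_cookie]
    for key in stale:
        del flows[key]
    assert flows == {}  # connectivity lost until something recreates it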