File 0001-More-accurate-agent-restart-state-transfer.patch of Package openstack-neutron

diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/agent/rpc.py neutron-9.4.2.dev21/neutron/agent/rpc.py
*** neutron-9.4.2.dev21-backup/neutron/agent/rpc.py	2019-07-23 09:28:37.651174881 -0700
--- neutron-9.4.2.dev21/neutron/agent/rpc.py	2019-07-23 09:36:23.133973449 -0700
***************
*** 173,184 ****
          return {'devices': succeeded_devices, 'failed_devices': failed_devices}
  
      def update_device_list(self, context, devices_up, devices_down,
!                            agent_id, host):
          try:
              cctxt = self.client.prepare(version='1.5')
              res = cctxt.call(context, 'update_device_list',
                               devices_up=devices_up, devices_down=devices_down,
!                              agent_id=agent_id, host=host)
          except oslo_messaging.UnsupportedVersion:
              #TODO(rossella_s): Remove this failback logic in M
              dev_up = self._device_list_rpc_call_with_failed_dev(
--- 173,185 ----
          return {'devices': succeeded_devices, 'failed_devices': failed_devices}
  
      def update_device_list(self, context, devices_up, devices_down,
!                            agent_id, host, agent_restarted=False):
          try:
              cctxt = self.client.prepare(version='1.5')
              res = cctxt.call(context, 'update_device_list',
                               devices_up=devices_up, devices_down=devices_down,
!                              agent_id=agent_id, host=host,
!                              agent_restarted=agent_restarted)
          except oslo_messaging.UnsupportedVersion:
              #TODO(rossella_s): Remove this failback logic in M
              dev_up = self._device_list_rpc_call_with_failed_dev(
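
The hunk above follows oslo.messaging's version-fallback pattern: the call is first attempted against RPC version 1.5 with the new ``agent_restarted`` keyword, and an ``UnsupportedVersion`` raised by an older server triggers the legacy path. A minimal standalone sketch of the pattern (the helper name is hypothetical, and the module's real fallback switches to per-device RPC calls rather than the plain retry shown here):

    import oslo_messaging

    def call_update_device_list(client, context, **kwargs):
        # Try the current RPC version first, passing the new kwarg.
        cctxt = client.prepare(version='1.5')
        try:
            return cctxt.call(context, 'update_device_list', **kwargs)
        except oslo_messaging.UnsupportedVersion:
            # An older server cannot accept the new argument; drop it and
            # retry (the patched module instead falls back to the legacy
            # per-device calls shown in the surrounding code).
            kwargs.pop('agent_restarted', None)
            return cctxt.call(context, 'update_device_list', **kwargs)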
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/plugins/ml2/drivers/l2pop/mech_driver.py neutron-9.4.2.dev21/neutron/plugins/ml2/drivers/l2pop/mech_driver.py
*** neutron-9.4.2.dev21-backup/neutron/plugins/ml2/drivers/l2pop/mech_driver.py	2019-07-23 09:28:37.651174881 -0700
--- neutron-9.4.2.dev21/neutron/plugins/ml2/drivers/l2pop/mech_driver.py	2019-07-23 09:36:23.133973449 -0700
***************
*** 234,239 ****
--- 234,252 ----
  
          return agents
  
+     def agent_restarted(self, context):
+         agent_host = context.host
+         session = db_api.get_session()
+         agent = l2pop_db.get_agent_by_host(session, agent_host)
+         if l2pop_db.get_agent_uptime(agent) < cfg.CONF.l2pop.agent_boot_time:
+             LOG.warning(_LW("Agent on host '%s' did not supply "
+                             "'agent_restarted' information in RPC message, "
+                             "determined it restarted based on deprecated "
+                             "'agent_boot_time' config option."),
+                         agent_host)
+             return True
+         return False
+ 
      def update_port_down(self, context):
          port = context.current
          agent_host = context.host
***************
*** 251,257 ****
              self.L2populationAgentNotify.remove_fdb_entries(
                  self.rpc_ctx, fdb_entries)
  
!     def update_port_up(self, context):
          port = context.current
          agent_host = context.host
          session = db_api.get_session()
--- 264,270 ----
              self.L2populationAgentNotify.remove_fdb_entries(
                  self.rpc_ctx, fdb_entries)
  
!     def update_port_up(self, context, agent_restarted=None):
          port = context.current
          agent_host = context.host
          session = db_api.get_session()
***************
*** 277,284 ****
          # with high concurrency more than 1 port may be activated on an agent
          # at the same time (like VM port + a DVR port) so checking for 1 or 2
          is_first_port = agent_active_ports in (1, 2)
!         if is_first_port or (l2pop_db.get_agent_uptime(agent) <
!                              cfg.CONF.l2pop.agent_boot_time):
              # First port(s) activated on current agent in this network,
              # we have to provide it with the whole list of fdb entries
              agent_fdb_entries = self._create_agent_fdb(session,
--- 290,299 ----
          # with high concurrency more than 1 port may be activated on an agent
          # at the same time (like VM port + a DVR port) so checking for 1 or 2
          is_first_port = agent_active_ports in (1, 2)
!         if agent_restarted is None:
!             # Only for backport compatibility, will be removed.
!             agent_restarted = self.agent_restarted(context)
!         if is_first_port or agent_restarted:
              # First port(s) activated on current agent in this network,
              # we have to provide it with the whole list of fdb entries
              agent_fdb_entries = self._create_agent_fdb(session,
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py neutron-9.4.2.dev21/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py
*** neutron-9.4.2.dev21-backup/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py	2019-07-23 09:28:37.651174881 -0700
--- neutron-9.4.2.dev21/neutron/plugins/ml2/drivers/openvswitch/agent/ovs_neutron_agent.py	2019-07-23 09:36:23.133973449 -0700
***************
*** 896,904 ****
                  LOG.debug("Setting status for %s to DOWN", device)
                  devices_down.append(device)
          if devices_up or devices_down:
              devices_set = self.plugin_rpc.update_device_list(
                  self.context, devices_up, devices_down, self.agent_id,
!                 self.conf.host)
              failed_devices = (devices_set.get('failed_devices_up') +
                  devices_set.get('failed_devices_down'))
              if failed_devices:
--- 896,910 ----
                  LOG.debug("Setting status for %s to DOWN", device)
                  devices_down.append(device)
          if devices_up or devices_down:
+             # When iter_num == 0, the ovs-agent is doing its
+             # initialization work. L2 pop needs this precise knowledge
+             # so it can tell the agent to refresh the tunnel-related
+             # flows. Otherwise, these flows will be cleaned up as stale
+             # because they carry a different cookie ID.
+             agent_restarted = self.iter_num == 0
              devices_set = self.plugin_rpc.update_device_list(
                  self.context, devices_up, devices_down, self.agent_id,
!                 self.conf.host, agent_restarted=agent_restarted)
              failed_devices = (devices_set.get('failed_devices_up') +
                  devices_set.get('failed_devices_down'))
              if failed_devices:
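
The ``iter_num == 0`` check above matters because the agent stamps the flows it installs with a per-run cookie and, during its first RPC loop, removes any flow whose cookie does not match the current run. A toy sketch of that cleanup mechanism (``FlowTable`` is hypothetical; the real agent performs this against its OVS bridges):

    import uuid

    class FlowTable(object):
        """Toy stand-in for an OVS bridge flow table."""
        def __init__(self):
            self.flows = {}  # match -> cookie of the run that installed it

        def install(self, match, cookie):
            self.flows[match] = cookie

        def cleanup_stale(self, current_cookie):
            # Keep only flows stamped with the current run's cookie.
            self.flows = {m: c for m, c in self.flows.items()
                          if c == current_cookie}

    old_run, new_run = uuid.uuid4().hex, uuid.uuid4().hex
    table = FlowTable()
    table.install('vxlan-to-host-a', old_run)  # programmed before restart
    table.cleanup_stale(new_run)               # first RPC loop after restart
    assert 'vxlan-to-host-a' not in table.flows
    # Unless the server resends the full FDB (which agent_restarted=True
    # requests), the tunnel flow above is gone and connectivity breaks.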
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/plugins/ml2/rpc.py neutron-9.4.2.dev21/neutron/plugins/ml2/rpc.py
*** neutron-9.4.2.dev21-backup/neutron/plugins/ml2/rpc.py	2019-07-23 09:28:37.655174905 -0700
--- neutron-9.4.2.dev21/neutron/plugins/ml2/rpc.py	2019-07-23 09:37:05.894230531 -0700
***************
*** 240,245 ****
--- 240,246 ----
          agent_id = kwargs.get('agent_id')
          device = kwargs.get('device')
          host = kwargs.get('host')
+         agent_restarted = kwargs.pop('agent_restarted', None)
          LOG.debug("Device %(device)s up at agent %(agent_id)s",
                    {'device': device, 'agent_id': agent_id})
          plugin = manager.NeutronManager.get_plugin()
***************
*** 265,271 ****
          else:
              self.update_port_status_to_active(port, rpc_context, port_id, host)
          self.notify_l2pop_port_wiring(port_id, rpc_context,
!                                       n_const.PORT_STATUS_ACTIVE, host)
  
      def update_port_status_to_active(self, port, rpc_context, port_id, host):
          plugin = manager.NeutronManager.get_plugin()
--- 266,273 ----
          else:
              self.update_port_status_to_active(port, rpc_context, port_id, host)
          self.notify_l2pop_port_wiring(port_id, rpc_context,
!                                       n_const.PORT_STATUS_ACTIVE, host,
!                                       agent_restarted)
  
      def update_port_status_to_active(self, port, rpc_context, port_id, host):
          plugin = manager.NeutronManager.get_plugin()
***************
*** 289,295 ****
                  provisioning_blocks.L2_AGENT_ENTITY)
  
      def notify_l2pop_port_wiring(self, port_id, rpc_context,
!                                  status, host):
          """Notify the L2pop driver that a port has been wired/unwired.
  
          The L2pop driver uses this notification to broadcast forwarding
--- 291,297 ----
                  provisioning_blocks.L2_AGENT_ENTITY)
  
      def notify_l2pop_port_wiring(self, port_id, rpc_context,
!                                  status, host, agent_restarted=None):
          """Notify the L2pop driver that a port has been wired/unwired.
  
          The L2pop driver uses this notification to broadcast forwarding
***************
*** 300,324 ****
                  'l2population')
          if not l2pop_driver:
              return
          port_context = plugin.get_bound_port_context(
!                 rpc_context, port_id)
          if not port_context:
              # port deleted
              return
          port = port_context.current
!         if (status == n_const.PORT_STATUS_ACTIVE and
              port[portbindings.HOST_ID] != host and
!             not l3_hamode_db.is_ha_router_port(rpc_context,
!                                                port['device_owner'],
                                                 port['device_id'])):
                  # don't setup ACTIVE forwarding entries unless bound to this
!                 # host or if it's an HA port (which is special-cased in the
!                 # mech driver)
                  return
          port_context.current['status'] = status
          port_context.current[portbindings.HOST_ID] = host
          if status == n_const.PORT_STATUS_ACTIVE:
!             l2pop_driver.obj.update_port_up(port_context)
          else:
              l2pop_driver.obj.update_port_down(port_context)
  
--- 302,338 ----
                  'l2population')
          if not l2pop_driver:
              return
+         port = ml2_db.get_port(rpc_context.session, port_id)
+         if not port:
+             return
          port_context = plugin.get_bound_port_context(
!                 rpc_context, port_id, host)
          if not port_context:
              # port deleted
              return
+         # NOTE: DVR ports are already handled and updated through l2pop,
+         # so we don't need to update them again here. However, l2pop does
+         # not handle DVR ports while the neutron-*-agent restarts, so we
+         # need to handle them here.
+         if agent_restarted is None:
+             agent_restarted = l2pop_driver.obj.agent_restarted(port_context)
+         if (port['device_owner'] == n_const.DEVICE_OWNER_DVR_INTERFACE and
+                 not agent_restarted):
+             return
          port = port_context.current
!         if (port['device_owner'] != n_const.DEVICE_OWNER_DVR_INTERFACE and
!             status == n_const.PORT_STATUS_ACTIVE and
              port[portbindings.HOST_ID] != host and
!             not l3_hamode_db.is_ha_router_port(port['device_owner'],
                                                 port['device_id'])):
                  # don't setup ACTIVE forwarding entries unless bound to this
!                 # host or if it's an HA or DVR port (which is special-cased in
!                 # the mech driver)
                  return
          port_context.current['status'] = status
          port_context.current[portbindings.HOST_ID] = host
          if status == n_const.PORT_STATUS_ACTIVE:
!             l2pop_driver.obj.update_port_up(port_context, agent_restarted)
          else:
              l2pop_driver.obj.update_port_down(port_context)
  
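
The NOTE above boils down to a small gate: DVR interface ports are skipped unless the agent just restarted, and ACTIVE entries for other ports are only programmed when the port is bound to the calling host or belongs to an HA router. A condensed standalone sketch of that gating logic (the function is hypothetical; constants are inlined for brevity):

    DVR_OWNER = 'network:router_interface_distributed'

    def should_notify_l2pop(port, status, host, agent_restarted,
                            is_ha_router_port):
        if port['device_owner'] == DVR_OWNER:
            # DVR ports are normally updated through l2pop already; only
            # notify again when the agent has just restarted.
            return agent_restarted
        if (status == 'ACTIVE' and port['binding:host_id'] != host
                and not is_ha_router_port):
            # Don't set up ACTIVE forwarding entries for a port bound to
            # another host unless it is an HA port (special-cased in the
            # mech driver).
            return False
        return True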
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/tests/functional/agent/l2/base.py neutron-9.4.2.dev21/neutron/tests/functional/agent/l2/base.py
*** neutron-9.4.2.dev21-backup/neutron/tests/functional/agent/l2/base.py	2019-07-23 09:28:37.655174905 -0700
--- neutron-9.4.2.dev21/neutron/tests/functional/agent/l2/base.py	2019-07-23 09:36:23.137973472 -0700
***************
*** 269,275 ****
          return ports
  
      def _mock_update_device(self, context, devices_up, devices_down, agent_id,
!                             host=None):
          dev_up = []
          dev_down = []
          for port in self.ports:
--- 269,275 ----
          return ports
  
      def _mock_update_device(self, context, devices_up, devices_down, agent_id,
!                             host=None, agent_restarted=False):
          dev_up = []
          dev_down = []
          for port in self.ports:
***************
*** 313,319 ****
      def _prepare_failed_dev_up_trigger(self, agent):
  
          def mock_failed_devices_up(context, devices_up, devices_down,
!                                    agent_id, host=None):
              failed_devices = []
              devices = list(devices_up)
              # first port fails
--- 313,320 ----
      def _prepare_failed_dev_up_trigger(self, agent):
  
          def mock_failed_devices_up(context, devices_up, devices_down,
!                                    agent_id, host=None,
!                                    agent_restarted=False):
              failed_devices = []
              devices = list(devices_up)
              # first port fails
***************
*** 334,340 ****
      def _prepare_failed_dev_down_trigger(self, agent):
  
          def mock_failed_devices_down(context, devices_up, devices_down,
!                                      agent_id, host=None):
              # first port fails
              failed_port_id = self.ports[0]['id']
              failed_devices_down = []
--- 335,342 ----
      def _prepare_failed_dev_down_trigger(self, agent):
  
          def mock_failed_devices_down(context, devices_up, devices_down,
!                                      agent_id, host=None,
!                                      agent_restarted=False):
              # first port fails
              failed_port_id = self.ports[0]['id']
              failed_devices_down = []
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py
*** neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py	2019-07-23 09:28:37.655174905 -0700
--- neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/drivers/l2pop/test_mech_driver.py	2019-07-23 09:36:23.137973472 -0700
***************
*** 343,348 ****
--- 343,391 ----
                  self.mock_fanout.assert_called_with(
                      mock.ANY, 'remove_fdb_entries', expected)
  
+     def _test_ovs_agent_restarted_with_dvr_port(
+             self, agent_boot_timeout=True, agent_restarted=False):
+         plugin = directory.get_plugin()
+         self._setup_l3()
+         router = self._create_router(distributed=True)
+         with mock.patch.object(l2pop_mech_driver.L2populationMechanismDriver,
+                                'agent_restarted',
+                                return_value=agent_boot_timeout):
+             with self.subnet(network=self._network,
+                              enable_dhcp=False) as snet:
+                 with self.port(
+                         subnet=snet,
+                         device_owner=constants.DEVICE_OWNER_DVR_INTERFACE)\
+                             as port:
+                     port_id = port['port']['id']
+                     plugin.update_distributed_port_binding(self.adminContext,
+                         port_id, {'port': {portbindings.HOST_ID: HOST_4,
+                         'device_id': router['id']}})
+                     port = self._show('ports', port_id)
+                     self.assertEqual(portbindings.VIF_TYPE_DISTRIBUTED,
+                                     port['port'][portbindings.VIF_TYPE])
+                     self.callbacks.update_device_up(
+                         self.adminContext,
+                         agent_id=HOST_4,
+                         device=port_id,
+                         host=HOST_4,
+                         agent_restarted=agent_restarted)
+                     fanout_expected = {port['port']['network_id']: {
+                         'network_type': u'vxlan',
+                         'ports': {
+                             u'20.0.0.4': [('00:00:00:00:00:00', '0.0.0.0')]},
+                         'segment_id': 1}}
+                     self.mock_fanout.assert_called_with(mock.ANY,
+                                                         'add_fdb_entries',
+                                                         fanout_expected)
+ 
+     def test_ovs_agent_restarted_with_dvr_port_boot_config_timeout(self):
+         self._test_ovs_agent_restarted_with_dvr_port()
+ 
+     def test_ovs_agent_restarted_with_dvr_port_rpc_send_timeout(self):
+         self._test_ovs_agent_restarted_with_dvr_port(
+             agent_boot_timeout=False, agent_restarted=True)
+ 
      def test_ha_agents_get_other_fdb(self):
          # First network port is added on HOST4, then HA router port is
          # added on HOST and HOST2.
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py
*** neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py	2019-07-23 09:28:37.655174905 -0700
--- neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/drivers/openvswitch/agent/test_ovs_neutron_agent.py	2019-07-23 09:36:23.137973472 -0700
***************
*** 715,721 ****
              self.agent._bind_devices(port_details)
              update_devices.assert_called_once_with(mock.ANY, devices_up,
                                                     devices_down,
!                                                    mock.ANY, mock.ANY)
  
      def _test_arp_spoofing(self, enable_prevent_arp_spoofing):
          self.agent.prevent_arp_spoofing = enable_prevent_arp_spoofing
--- 715,722 ----
              self.agent._bind_devices(port_details)
              update_devices.assert_called_once_with(mock.ANY, devices_up,
                                                     devices_down,
!                                                    mock.ANY, mock.ANY,
!                                                    agent_restarted=True)
  
      def _test_arp_spoofing(self, enable_prevent_arp_spoofing):
          self.agent.prevent_arp_spoofing = enable_prevent_arp_spoofing
diff -crB --new-file neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/test_rpc.py neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/test_rpc.py
*** neutron-9.4.2.dev21-backup/neutron/tests/unit/plugins/ml2/test_rpc.py	2019-07-23 09:28:37.659174929 -0700
--- neutron-9.4.2.dev21/neutron/tests/unit/plugins/ml2/test_rpc.py	2019-07-23 09:36:23.141973497 -0700
***************
*** 443,448 ****
--- 443,449 ----
                             devices_down=['fake_device3', 'fake_device4'],
                             agent_id='fake_agent_id',
                             host='fake_host',
+                            agent_restarted=False,
                             version='1.5')
  
      def test_update_device_list_unsupported(self):
diff -crB --new-file neutron-9.4.2.dev21-backup/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml neutron-9.4.2.dev21/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml
*** neutron-9.4.2.dev21-backup/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml	1969-12-31 16:00:00.000000000 -0800
--- neutron-9.4.2.dev21/releasenotes/notes/precise-agent-state-transfer-67c771cb1ee04dd0.yaml	2019-07-23 09:36:23.141973497 -0700
***************
*** 0 ****
--- 1,27 ----
+ ---
+ critical:
+   - |
+     The neutron-openvswitch-agent can sometimes spend too much time handling
+     a large number of ports, exceeding the L2 population timeout value,
+     ``agent_boot_time``. When that happens, some flow update operations are
+     not triggered, and flows are lost during agent restart, especially
+     host-to-host vxlan tunnel flows: the original tunnel flows are treated
+     as stale because they carry a different cookie ID, and the agent's first
+     RPC loop runs a stale-flow clean-up procedure that deletes them,
+     leading to a loss of connectivity.
+     Please ensure that all neutron-server and neutron-openvswitch-agent
+     binaries are upgraded for the changes to take effect, after which
+     the L2 population ``agent_boot_time`` config option will no longer
+     be used.
+ fixes:
+   - |
+     The neutron-openvswitch-agent was changed to notify the neutron-server
+     in its first RPC loop that it has restarted. This signals neutron-server
+     to provide updated L2 population information to correctly program FDB
+     entries, ensuring connectivity to instances is not interrupted.
+     This fixes the following bugs:
+     `1794991 <https://bugs.launchpad.net/neutron/+bug/1794991>`_,
+     `1799178 <https://bugs.launchpad.net/neutron/+bug/1799178>`_,
+     `1813703 <https://bugs.launchpad.net/neutron/+bug/1813703>`_,
+     `1813714 <https://bugs.launchpad.net/neutron/+bug/1813714>`_,
+     `1813715 <https://bugs.launchpad.net/neutron/+bug/1813715>`_.