File 0004-dig-first-round-of-refinement-to-dig.patch of Package b4

From: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Date: Tue, 14 Oct 2025 16:55:05 -0400
Subject: dig: first round of refinement to dig
References: dig-support
Git-repo: https://git.kernel.org/pub/scm/utils/b4/b4.git
Git-commit: 3ae277e9c7dd3e1df61a14884aabdd5834ad1201
Patch-mainline: yes

Lots of changes:

- filter search result to only match what is before the commit date,
because after that date we're likely to get false-positives from
cherry-picks and backports
- output a single "most likely came from here" link to stdout, so people
can pass this to pipes
- add a -a switch that will dig deeper and try to find all previous
revisions of that series, whether or not the patch showed up there or
not

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
---
 src/b4/__init__.py |  49 ++++++++++-
 src/b4/command.py  |   4 +
 src/b4/dig.py      | 203 ++++++++++++++++++++++++++++++++-------------
 3 files changed, 197 insertions(+), 59 deletions(-)

diff --git a/src/b4/__init__.py b/src/b4/__init__.py
index b6eab255103f..2b7ba1d59797 100644
--- a/src/b4/__init__.py
+++ b/src/b4/__init__.py
@@ -37,6 +37,8 @@ from pathlib import Path
 from contextlib import contextmanager
 from typing import Optional, Tuple, Set, List, BinaryIO, Union, Sequence, Literal, Iterator, Dict
 
+from email.message import EmailMessage
+
 from email import charset
 
 charset.add_charset('utf-8', None)
@@ -505,7 +507,9 @@ class LoreSeries:
     complete: bool = False
     has_cover: bool = False
     partial_reroll: bool = False
-    subject: str
+    subject: Optional[str] = None
+    fromname: Optional[str] = None
+    fromemail: Optional[str] = None
     indexes: Optional[List[Tuple[str, str]]] = None
     base_commit: Optional[str] = None
     change_id: Optional[str] = None
@@ -524,6 +528,7 @@ class LoreSeries:
     def __repr__(self):
         out = list()
         out.append('- Series: [v%s] %s' % (self.revision, self.subject))
+        out.append('  author: %s <%s>' % (self.fromname, self.fromemail))
         out.append('  revision: %s' % self.revision)
         out.append('  expected: %s' % self.expected)
         out.append('  complete: %s' % self.complete)
@@ -542,6 +547,27 @@ class LoreSeries:
 
         return '\n'.join(out)
 
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, LoreSeries):
+            return NotImplemented
+        # We are the same series if all patch-id's are exactly the same
+        my_patchids: List[Optional[str]] = list()
+        for patch in self.patches[1:]:
+            if patch is not None:
+                my_patchids.append(patch.git_patch_id)
+        other_patchids: List[Optional[str]] = list()
+        for patch in other.patches[1:]:
+            if patch is not None:
+                other_patchids.append(patch.git_patch_id)
+        return my_patchids == other_patchids
+
+    def get_patch_by_msgid(self, msgid: str) -> Optional['LoreMessage']:
+        for lmsg in self.patches:
+            if lmsg is not None and lmsg.msgid == msgid:
+                return lmsg
+        raise IndexError('No such patch in series')
+
+
     def add_patch(self, lmsg: 'LoreMessage') -> None:
         while len(self.patches) < lmsg.expected + 1:
             self.patches.append(None)
@@ -587,8 +613,12 @@ class LoreSeries:
 
             if self.patches[0] is not None:
                 self.subject = self.patches[0].subject
+                self.fromname = self.patches[0].fromname
+                self.fromemail = self.patches[0].fromemail
             elif self.patches[1] is not None:
                 self.subject = self.patches[1].subject
+                self.fromname = self.patches[1].fromname
+                self.fromemail = self.patches[1].fromemail
 
     def get_slug(self, extended: bool = False) -> str:
         # Find the first non-None entry
@@ -3386,9 +3416,22 @@ def get_series_by_change_id(change_id: str, nocache: bool = False) -> Optional['
     return lmbx
 
 
-def get_series_by_patch_id(patch_id: str, nocache: bool = False) -> Optional['LoreMailbox']:
+def get_msgs_by_patch_id(patch_id: str, extra_query: Optional[str] = None,
+                         nocache: bool = False, full_threads: bool = False
+                         ) -> Optional[List[EmailMessage]]:
     q = f'patchid:{patch_id}'
-    q_msgs = get_pi_search_results(q, nocache=nocache)
+    if extra_query:
+        q = f'{q} {extra_query}'
+    logger.debug('Full query: %s (nocache=%s)', q, nocache)
+    q_msgs = get_pi_search_results(q, nocache=nocache, full_threads=full_threads)
+    if not q_msgs:
+        return None
+
+    return q_msgs
+
+
+def get_series_by_patch_id(patch_id: str, nocache: bool = False) -> Optional['LoreMailbox']:
+    q_msgs = get_msgs_by_patch_id(patch_id, full_threads=True, nocache=nocache)
     if not q_msgs:
         return None
     lmbx = LoreMailbox()
diff --git a/src/b4/command.py b/src/b4/command.py
index 80a96c365b3e..4568b2089868 100644
--- a/src/b4/command.py
+++ b/src/b4/command.py
@@ -392,6 +392,10 @@ def setup_parser() -> argparse.ArgumentParser:
     sp_dig = subparsers.add_parser('dig', help='Dig into the details of a specific commit')
     sp_dig.add_argument('-c', '--commitish', dest='commitish', metavar='COMMITISH',
                         help='Commit-ish object to dig into')
+    sp_dig.add_argument('-C', '--no-cache', dest='nocache', action='store_true', default=False,
+                        help='Do not use local cache')
+    sp_dig.add_argument('-a', '--all-series', action='store_true', default=False,
+                        help='Show all series, not just the latest matching')
     sp_dig.set_defaults(func=cmd_dig)
 
     return parser
diff --git a/src/b4/dig.py b/src/b4/dig.py
index 03ca3211c37b..5d19d59261e7 100644
--- a/src/b4/dig.py
+++ b/src/b4/dig.py
@@ -9,10 +9,11 @@ import os
 import sys
 import b4
 import argparse
-import email.parser
+import re
+import urllib.parse
 
 from email.message import EmailMessage
-from typing import List, Set, Optional
+from typing import List, Set, Optional, Union
 
 logger = b4.logger
 
@@ -25,6 +26,28 @@ try_diff_algos: List[str] = [
 ]
 
 
+def try_links(links: Set[str]) -> None:
+    logger.info('Try following these Link trailers:')
+    for link in links:
+        logger.info('  Link: %s', link)
+
+
+def print_one_match(subject: str, link: str) -> None:
+    logger.info('---')
+    logger.info(subject)
+    sys.stdout.write(f'{link}\n')
+
+
+def get_all_msgids_from_urls(urls: Set[str]) -> Set[str]:
+    msgids: Set[str] = set()
+    for url in urls:
+        matches = re.search(r'^https?://[^@]+/([^/]+@[^/]+)', url, re.IGNORECASE)
+        if matches:
+            chunks = matches.groups()
+            msgids.add(urllib.parse.unquote(chunks[0]))
+    return msgids
+
+
 def dig_commitish(cmdargs: argparse.Namespace) -> None:
     config = b4.get_main_config()
     cfg_llval = config.get('linkmask', '')
@@ -56,15 +79,30 @@ def dig_commitish(cmdargs: argparse.Namespace) -> None:
         logger.error('Merge commit detected, please specify a single-parent commit.')
         sys.exit(1)
 
+    # Look at the commit message and find any Link: trailers
+    links: Set[str] = set()
+    ecode, out = b4.git_run_command(
+        topdir, ['show', '--no-patch', '--format=%B', commit],
+    )
+    if ecode > 0:
+        logger.error('Could not get commit message for %s', commit)
+        sys.exit(1)
+    trailers, _ = b4.LoreMessage.find_trailers(out)
+    ltrs = [t for t in trailers if t.name.lower() == 'link']
+    if ltrs:
+        links = set(ltr.value for ltr in ltrs)
+
+    msgids = get_all_msgids_from_urls(links)
+
     # Find commit's author and subject from git
     ecode, out = b4.git_run_command(
-        topdir, ['show', '--no-patch', '--format=%ae %s', commit],
+        topdir, ['show', '--no-patch', '--format=%as %ae %s', commit],
     )
     if ecode > 0:
         logger.error('Could not get commit info for %s', commit)
         sys.exit(1)
-    fromeml, csubj = out.strip().split(maxsplit=1)
-    logger.debug('fromeml=%s, csubj=%s', fromeml, csubj)
+    cdate, fromeml, csubj = out.strip().split(maxsplit=2)
+    logger.debug('cdate=%s, fromeml=%s, csubj=%s', cdate, fromeml, csubj)
     logger.info('Attempting to match by exact patch-id...')
     showargs = [
         '--format=email',
@@ -74,7 +112,7 @@ def dig_commitish(cmdargs: argparse.Namespace) -> None:
     ]
     # Keep a record so we don't try git-patch-id on identical patches
     bpatches: Set[bytes] = set()
-    lmbx: Optional[b4.LoreMailbox] = None
+    msgs: Optional[List[EmailMessage]] = None
     for algo in try_diff_algos:
         logger.debug('Trying with diff-algorithm=%s', algo)
         algoarg = f'--diff-algorithm={algo}'
@@ -97,68 +135,121 @@ def dig_commitish(cmdargs: argparse.Namespace) -> None:
             sys.exit(1)
         patch_id = out.split(maxsplit=1)[0]
         logger.debug('Patch-id for commit %s is %s', commit, patch_id)
-        logger.info('Trying to find matching series by patch-id %s', patch_id)
-        lmbx = b4.get_series_by_patch_id(patch_id)
-        if lmbx:
+        logger.info('Trying to find matching series by patch-id %s (%s)', patch_id, algo)
+        # Limit lookup by date prior to the commit date, to weed out any false-positives from
+        # backports or from erroneously resent series
+        extra_query = f'AND rt:..{cdate}'
+        logger.debug('extra_query=%s', extra_query)
+        msgs = b4.get_msgs_by_patch_id(patch_id, nocache=cmdargs.nocache, extra_query=extra_query)
+        if msgs:
             logger.info('Found matching series by patch-id')
+            for msg in msgs:
+                msgid = b4.LoreMessage.get_clean_msgid(msg)
+                if msgid:
+                    logger.debug('Adding from patch-id matches: %s', msgid)
+                    msgids.add(msgid)
             break
 
-    if not lmbx:
+    if not msgs:
         logger.info('Attempting to match by author and subject...')
-        q = '(s:"%s" AND f:"%s")' % (csubj.replace('"', ''), fromeml)
-        msgs = b4.get_pi_search_results(q)
+        q = '(s:"%s" AND f:"%s" AND rt:..%s)' % (csubj.replace('"', ''), fromeml, cdate)
+        msgs = b4.get_pi_search_results(q, nocache=cmdargs.nocache, full_threads=False)
         if msgs:
-            logger.info('Found %s matching messages', len(msgs))
-            lmbx = b4.LoreMailbox()
             for msg in msgs:
-                lmbx.add_message(msg)
-        else:
+                msgid = b4.LoreMessage.get_clean_msgid(msg)
+                if msgid:
+                    logger.debug('Adding from author+subject matches: %s', msgid)
+                    msgids.add(msgid)
+        if not msgs and not msgids:
             logger.error('Could not find anything matching commit %s', commit)
-            # Look at the commit message and find any Link: trailers
-            ecode, out = b4.git_run_command(
-                topdir, ['show', '--no-patch', '--format=%B', commit],
-            )
-            if ecode > 0:
-                logger.error('Could not get commit message for %s', commit)
-                sys.exit(1)
-            trailers, _ = b4.LoreMessage.find_trailers(out)
-            ltrs = [t for t in trailers if t.name.lower() == 'link']
-            if ltrs:
-                logger.info('---')
-                logger.info('Try following these Link trailers:')
-                for ltr in ltrs:
-                    logger.info('  %s', ltr.as_string())
+            if links:
+                try_links(links)
             sys.exit(1)
 
-    # Grab the latest series and see if we have a change_id
-    revs = list(lmbx.series.keys())
-    revs.sort(key=lambda r: lmbx.series[r].submission_date or 0)
-
-    change_id: Optional[str] = None
-    lser = lmbx.get_series(codereview_trailers=False)
-    for rev in revs:
-        change_id = lmbx.series[rev].change_id
-        if not change_id:
+    logger.info('Will consider promising messages: %s', len(msgids))
+    logger.debug('msgids: %s', msgids)
+    # Go one by one and grab threads by message-id
+    seen_msgids: Set[str] = set()
+    lmbxs: List[b4.LoreMailbox] = list()
+    for msgid in msgids:
+        if not msgid or msgid in seen_msgids:
+            logger.debug('Skipping duplicate or invalid msgid %s', msgid)
+            continue
+        seen_msgids.add(msgid)
+        logger.debug('Fetching thread by msgid %s', msgid)
+        lmbx = b4.get_series_by_msgid(msgid)
+        if not lmbx:
+            logger.error('Could not fetch thread for msgid %s, skipping', msgid)
             continue
-        logger.info('Backfilling any missing series by change-id')
-        logger.debug('change_id=%s', change_id)
-        # Fill in the rest of the series by change_id
-        q = f'nq:"change-id:{change_id}"'
-        q_msgs = b4.get_pi_search_results(q, full_threads=True)
-        if q_msgs:
-            for q_msg in q_msgs:
-                lmbx.add_message(q_msg)
-        break
-
-    logger.debug('Number of series in the mbox: %d', len(lmbx.series))
+        if not lmbx.series:
+            logger.debug('No series found in this mailbox, skipping')
+            continue
+        lmbxs.append(lmbx)
+
+    if not lmbxs:
+        logger.error('Could not fetch any threads for the matching messages!')
+        sys.exit(1)
+
+    lsers: List[b4.LoreSeries] = list()
+    for lmbx in lmbxs:
+        maxrev = max(lmbx.series.keys())
+        if cmdargs.all_series and len(lmbx.series) < maxrev:
+            logger.debug('Fetching prior series')
+            # Do we have a change-id in this series?
+            lser = lmbx.get_series(codereview_trailers=False)
+            fillin_q: str = ''
+            if lser and lser.change_id:
+                logger.debug('Found change-id %s in the series', lser.change_id)
+                fillin_q = f'nq:"change-id:{lser.change_id}"'
+            elif lser and lser.subject and lser.fromemail:
+                # We're going to match by first patch/cover letter subject and author.
+                # It's not perfect, but it's the best we can do without a change-id.
+                fillin_q = '(s:"%s" AND f:"%s")' % (lser.subject.replace('"', ''), lser.fromemail)
+            if fillin_q:
+                fillin_q += f' AND rt:..{cdate}'
+                logger.debug('fillin_q=%s', fillin_q)
+                q_msgs = b4.get_pi_search_results(fillin_q, nocache=cmdargs.nocache, full_threads=True)
+                if q_msgs:
+                    for q_msg in q_msgs:
+                        lmbx.add_message(q_msg)
+                        q_msgid = b4.LoreMessage.get_clean_msgid(q_msg)
+                        if q_msgid:
+                            seen_msgids.add(q_msgid)
+
+        for lser in lmbx.series.values():
+            if lser and lser not in lsers:
+                lsers.append(lser)
+
+    if not len(lsers):
+        logger.error('Could not find any series containing this patch!')
+        if links:
+            try_links(links)
+        sys.exit(1)
+
+    lsers.sort(key=lambda r: r.submission_date or 0)
+    logger.debug('Number of matching series: %d', len(lsers))
+    lmsg: Optional[b4.LoreMessage] = None
+    if not cmdargs.all_series:
+        # Go backwards in time and find the first matching patch
+        for lser in reversed(lsers):
+            for lmsg in lser.patches[1:]:
+                if lmsg is None:
+                    continue
+                if lmsg.git_patch_id == patch_id:
+                    logger.debug('matched by exact patch-id')
+                    print_one_match(lmsg.full_subject, linkmask % lmsg.msgid)
+                    return
+                if lmsg.subject == csubj:
+                    logger.debug('matched by subject')
+                    print_one_match(lmsg.full_subject, linkmask % lmsg.msgid)
+                    return
+
     logger.info('---')
-    logger.info('This patch is present in the following series:')
+    logger.info('This patch belongs in the following series:')
     logger.info('---')
-    for rev in revs:
+    for lser in lsers:
         firstmsg: Optional[b4.LoreMessage] = None
-        pref = f'  v{rev}: '
-        lser = lmbx.series[rev]
-        lmsg: Optional[b4.LoreMessage] = None
+        pref = f'  v{lser.revision}: '
         if lser.has_cover:
             firstmsg = lser.patches[0]
         for lmsg in lser.patches[1:]:
@@ -179,7 +270,7 @@ def dig_commitish(cmdargs: argparse.Namespace) -> None:
         if lmsg is None:
             # Use the first patch in the series as a fallback
             lmsg = firstmsg
-        logger.info('%s%s', pref, lmsg.full_subject)
+        logger.info('%s%s', pref, firstmsg.full_subject)
         logger.info('%s%s', ' ' * len(pref), linkmask % lmsg.msgid)
 
 
-- 
2.51.0

openSUSE Build Service is sponsored by