File 1107-Optimize-the-implementation-of-the-Karatsuba-algorit.patch of Package erlang

From 4ca352e32e58fe746089410e0edff5b142b9118a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 5 Aug 2023 09:08:11 +0200
Subject: [PATCH 7/7] Optimize the implementation of the Karatsuba algorithm

This commit implements the Karatsuba algorithm in a way
that reduces the number of additions, resulting in a measureable
performance improvement for multiplication of large integers.
---
 erts/emulator/beam/big.c                      | 224 ++++++++++++------
 .../test/big_SUITE_data/karatsuba.dat         |   2 +
 2 files changed, 159 insertions(+), 67 deletions(-)

diff --git a/erts/emulator/beam/big.c b/erts/emulator/beam/big.c
index e496d102a3..06dc9b0220 100644
--- a/erts/emulator/beam/big.c
+++ b/erts/emulator/beam/big.c
@@ -768,22 +768,27 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
         /* Use the Karatsuba algorithm. */
         Eterm *heap;
         Uint temp_heap_size;
-        Uint z0_len, z1_len, z2_len, sum0_len, sum1_len, res_len;
+        Uint z0_len, z1_len, z2_len, tmp_len, diff0_len, diff1_len, res_len;
         Uint low_x_len, low_y_len, high_x_len, high_y_len;
-        Eterm *z0_buf, *z1_buf, *z2_buf, *z_res_buf;
-        Eterm *sum0_buf, *sum1_buf;
+        Eterm *z0_buf, *z1_buf, *z2_buf, *tmp_buf;
+        Eterm *diff0_buf, *diff1_buf;
 #ifdef DEBUG
-        Eterm *sum_buf_end, *z_buf_end;
+        Eterm *alloc_end;
 #endif
         Eterm *low_x, *low_y, *high_x, *high_y;
         ErtsDigit zero = 0;
         Uint m = (xl+1) / 2;
+        int tmp_prod_negative = 0;
+        int i;
 
         /* Set up pointers and sizes. */
         low_x = x;
         low_x_len = m;
         high_x = x + m;
         high_x_len = xl - m;
+        while (low_x_len > 1 && low_x[low_x_len-1] == 0) {
+            low_x_len--;
+        }
 
         low_y = y;
         if (yl <= m) {
@@ -796,45 +801,49 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
             high_y = y + m;
             high_y_len = yl - m;
         }
+        while (low_y_len > 1 && low_y[low_y_len-1] == 0) {
+            low_y_len--;
+        }
 
         ASSERT(low_x_len <= m);
         ASSERT(high_x_len <= m);
         ASSERT(low_y_len <= m);
         ASSERT(high_y_len <= m);
 
-        /* Set up buffers for the sums for z1 in the result area. */
-        sum0_buf = r;
-        sum1_buf = r + m + 1;
-
+        /*
+         * Set up temporary buffers in allocated memory.
+         *
+         * z1_buf is not used at the same time as diff0_buf
+         * and diff1_buf, so they can share memory.
+         */
+        temp_heap_size = (4*m + 1) * sizeof(Eterm);
 #ifdef DEBUG
-        sum_buf_end = sum1_buf + m + 1;
-        ASSERT(sum_buf_end - sum0_buf + 1 <= xl + yl);
-        sum1_buf[0] = ERTS_HOLE_MARKER;
-        sum_buf_end[0] = ERTS_HOLE_MARKER;
+        temp_heap_size += sizeof(Eterm);
 #endif
-
-        /* Set up temporary buffers in the allocated memory. */
-        temp_heap_size = (3*(2*m+2) + (xl+yl+1) + 1) * sizeof(Eterm);
         heap = (Eterm *) erts_alloc(ERTS_ALC_T_TMP, temp_heap_size);
-        z0_buf = heap;
-        z1_buf = z0_buf + 2*m + 2;
-        z2_buf = z1_buf + 2*m + 2;
-        z_res_buf = z2_buf + 2*m + 2;
-#ifdef DEBUG
-        z_buf_end = z_res_buf + xl+yl+1;
-#endif
+        z1_buf = heap;
+        diff0_buf = z1_buf + 1;
+        diff1_buf = diff0_buf + m;
+        tmp_buf = diff1_buf + m;
 
 #ifdef DEBUG
         z1_buf[0] = ERTS_HOLE_MARKER;
-        z2_buf[0] = ERTS_HOLE_MARKER;
-        z_res_buf[0] = ERTS_HOLE_MARKER;
-        z_buf_end[0] = ERTS_HOLE_MARKER;
+        diff0_buf[0] = ERTS_HOLE_MARKER;
+        diff1_buf[0] = ERTS_HOLE_MARKER;
+        tmp_buf[0] = ERTS_HOLE_MARKER;
+
+        alloc_end = tmp_buf + 2*m;
+        alloc_end[0] = ERTS_HOLE_MARKER;
+        ASSERT(alloc_end - heap + 1 == temp_heap_size / sizeof(Eterm));
 #endif
 
-        /* z0 = low_x * low_y */
-        z0_len = I_mul_karatsuba(low_x, low_x_len, low_y, low_y_len, z0_buf);
+        /* Set up pointers for the result. */
+        z0_buf = r;
+        z2_buf = r + 2*m;
 
-        ASSERT(z1_buf[0] == ERTS_HOLE_MARKER);
+#ifdef DEBUG
+        z2_buf[0] = ERTS_HOLE_MARKER;
+#endif
 
 #define I_OPERATION(_result, _op, _p1, _sz1, _p2, _sz2, _buf)   \
         do {                                                    \
@@ -846,73 +855,154 @@ static dsize_t I_mul_karatsuba(ErtsDigit* x, dsize_t xl, ErtsDigit* y,
         } while (0)
 
         /*
-         * z1 = (low1 + high1) * (low2 + high2)
+         * The Karatsuba algorithm is a divide and conquer algorithm
+         * for multi-word integer multiplication. The numbers to be
+         * multiplied are split up like this:
+         *
+         *   high     low
+         *  +--------+--------+
+         *  | high_x | low_x  |
+         *  +--------+--------+
+         *
+         *  +--------+--------+
+         *  | high_y | low_y  |
+         *  +--------+--------+
+         *
+         * Then the following values are calculated:
+         *
+         *  z0 = low_x * low_y
+         *  z2 = high_x + high_y
+         *  z1 = (low_x - high_x) * (high_y - low_y) + z2 + z0
+         *
+         * Note that this expression for z1 produces the same result
+         * as:
+         *
+         *    low_x * high_y + high_x * low_y
+         *
+         * Finally, the z2, z1, z0 values are combined to form the
+         * product of x and y:
+         *
+         *   high     low
+         *  +--+--+ +--+--+
+         *  | z2  | | z0  |
+         *  +--+--+ +--+--+
+         *      +--+--+
+         *  add | z1  |
+         *      +--+--+
+         *
+         * There is an alternate way to calculate z1 (commonly found
+         * in descriptions of the Karatsuba algorithm);
+         *
+         *  z1 = (high_x + low_x) * (high_y + low_y) - z2 - z0
+         *
+         * But this way can lead to more additions and carry handling.
          */
-        I_OPERATION(sum0_len, I_add, low_x, low_x_len, high_x, high_x_len, sum0_buf);
-        ASSERT(sum1_buf[0] == ERTS_HOLE_MARKER);
-
-        I_OPERATION(sum1_len, I_add, low_y, low_y_len, high_y, high_y_len, sum1_buf);
-        ASSERT(sum_buf_end[0] == ERTS_HOLE_MARKER);
 
-        I_OPERATION(z1_len, I_mul_karatsuba, sum0_buf, sum0_len, sum1_buf, sum1_len, z1_buf);
+        /*
+         * z0 = low_x * low_y
+         *
+         * Store this product in its final location in the result buffer.
+         */
+        I_OPERATION(z0_len, I_mul_karatsuba, low_x, low_x_len, low_y, low_y_len, z0_buf);
         ASSERT(z2_buf[0] == ERTS_HOLE_MARKER);
+        for (i = z0_len; i < 2*m; i++) {
+            z0_buf[i] = 0;
+        }
+        while (z0_len > 1 && z0_buf[z0_len - 1] == 0) {
+            z0_len--;
+        }
+        ASSERT(z0_len == 1 || z0_buf[z0_len-1] != 0);
+        ASSERT(z0_len <= low_x_len + low_y_len);
 
         /*
          * z2 = high_x * high_y
+         *
+         * Store this product in its final location in the result buffer.
          */
-
         if (high_y != &zero) {
-            I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len, high_y, high_y_len, z2_buf);
+            I_OPERATION(z2_len, I_mul_karatsuba, high_x, high_x_len,
+                        high_y, high_y_len, z2_buf);
+            while (z2_len > 1 && z2_buf[z2_len - 1] == 0) {
+                z2_len--;
+            }
+            ASSERT(z2_len == 1 || z2_buf[z2_len-1] != 0);
         } else {
             z2_buf[0] = 0;
             z2_len = 1;
         }
-        ASSERT(z_res_buf[0] == ERTS_HOLE_MARKER);
+        ASSERT(z2_len <= high_x_len + high_y_len);
 
         /*
-         * z0 + (z1 × base ^ m) + (z2 × base ^ (m × 2)) - ((z0 + z2) × base ^ m)
+         * tmp = abs(low_x - high_x) * abs(high_y - low_y)
+         *
+         * The absolute value of each difference will fit in m words.
          *
-         * Note that the result of expression before normalization is
-         * not guaranteed to fit in the result buffer provided by the
-         * caller (r). Therefore, we must use a temporary buffer when
-         * calculating it.
+         * Save the sign of the product so that we later can choose to
+         * subtract or add this value.
          */
-
-        /* Copy z0 to temporary result buffer. */
-        res_len = I_add(z0_buf, z0_len, &zero, 1, z_res_buf);
-
-        while (res_len <= m) {
-            z_res_buf[res_len++] = 0;
+        if (I_comp(low_x, low_x_len, high_x, high_x_len) >= 0) {
+            diff0_len = I_sub(low_x, low_x_len, high_x, high_x_len, diff0_buf);
+        } else {
+            tmp_prod_negative = !tmp_prod_negative;
+            diff0_len = I_sub(high_x, high_x_len, low_x, low_x_len, diff0_buf);
         }
+        ASSERT(diff1_buf[0] == ERTS_HOLE_MARKER);
+        ASSERT(diff0_len == 1 || diff0_buf[diff0_len-1] != 0);
+        ASSERT(diff0_len <= m);
 
-        /* Add z1 × base ^ m */
-        I_OPERATION(res_len, I_add, z_res_buf+m, res_len-m, z1_buf, z1_len, z_res_buf+m);
-
-        while (res_len <= m) {
-            z_res_buf[m+res_len++] = 0;
+        if (x == y) {
+            ASSERT(xl == yl);
+            tmp_prod_negative = 1;
+            diff1_buf = diff0_buf;
+            diff1_len = diff0_len;
+        } else if (I_comp(high_y, high_y_len, low_y, low_y_len) >= 0) {
+            diff1_len = I_sub(high_y, high_y_len, low_y, low_y_len, diff1_buf);
+        } else {
+            tmp_prod_negative = !tmp_prod_negative;
+            if (high_y != &zero) {
+                diff1_len = I_sub(low_y, low_y_len, high_y, high_y_len, diff1_buf);
+            } else {
+                diff1_buf = low_y;
+                diff1_len = low_y_len;
+            }
         }
+        ASSERT(tmp_buf[0] == ERTS_HOLE_MARKER);
+        ASSERT(diff1_len == 1 || diff1_buf[diff1_len-1] != 0);
+        ASSERT(diff1_len <= m);
+
+        I_OPERATION(tmp_len, I_mul_karatsuba, diff0_buf, diff0_len, diff1_buf, diff1_len, tmp_buf);
+        ASSERT(alloc_end[0] == ERTS_HOLE_MARKER);
+        while (tmp_len > 1 && tmp_buf[tmp_len - 1] == 0) {
+            tmp_len--;
+        }
+        ASSERT(tmp_len == 1 || tmp_buf[tmp_len-1] != 0);
+        ASSERT(tmp_len <= diff0_len + diff1_len);
 
-        /* Add z2 × base ^ (m × 2) */
-        I_OPERATION(res_len, I_add, z_res_buf+2*m, res_len-m, z2_buf, z2_len, z_res_buf+2*m);
-
-        /* Calculate z0 + z2 */
-        I_OPERATION(z0_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z0_buf);
+        /*
+         * z1 = z0 + z2
+         */
+        I_OPERATION(z1_len, I_add, z0_buf, z0_len, z2_buf, z2_len, z1_buf);
+        ASSERT(z1_len == 1 || z1_buf[z1_len-1] != 0);
 
-        /* Subtract (z0 + z2) × base ^ m */
-        res_len = I_sub(z_res_buf+m, res_len+m, z0_buf, z0_len, z_res_buf+m);
+        if (tmp_prod_negative) {
+            /* z1 = z1 - tmp */
+            z1_len = I_sub(z1_buf, z1_len, tmp_buf, tmp_len, z1_buf);
+        } else {
+            /* z1 = z1 + tmp */
+            I_OPERATION(z1_len, I_add, z1_buf, z1_len, tmp_buf, tmp_len, z1_buf);
+        }
 
-        ASSERT(z_buf_end[0] == ERTS_HOLE_MARKER);
+        /* Add z1 shifted into the result */
+        I_OPERATION(res_len, I_add, z0_buf+m, z2_len+m, z1_buf, z1_len, z0_buf+m);
 
         /* Normalize */
-        while (z_res_buf[m + res_len - 1] == 0 && res_len > 0) {
+        res_len += m;
+        while (res_len > 1 && r[res_len - 1] == 0) {
             res_len--;
         }
-        res_len += m;
+        ASSERT(res_len == 1 || r[res_len-1] != 0);
         ASSERT(res_len <= xl + yl);
 
-        /* Copy result to the final result buffer. */
-        (void) I_add(z_res_buf, res_len, &zero, 1, r);
-
         erts_free(ERTS_ALC_T_TMP, (void *) heap);
         return res_len;
     }
diff --git a/erts/emulator/test/big_SUITE_data/karatsuba.dat b/erts/emulator/test/big_SUITE_data/karatsuba.dat
index c0a0a72647..d3eeb1edda 100644
--- a/erts/emulator/test/big_SUITE_data/karatsuba.dat
+++ b/erts/emulator/test/big_SUITE_data/karatsuba.dat
@@ -2,3 +2,5 @@
 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041.
 2856161719074522159237009590056107822635035670018713848188829444171911440810511153593372984982324471392734428893744842307433179041780071800813834204750896979634955588152420293439551458314069220674241649915149179367953255529141343871757486196569041879420486970654045852414605072383041 = 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222220638517163875604234197893799387492940714846904459640175648396747364515954208900839325461351363793129284077140658689554385146217203856200723212082378278681864294152015980850427464026656249693797220123249860586581459140699479021638770759493450252580845047833949914496709723236955652660 div 778044957111982296698085106003820588379533248535175305369992153103173638825081172125947786580536601796787332015996348528501051686995129310226034229210961747151236268717981478782260.
 111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111063449349767560406261235142392341647649025372723765646039413496854749810861561328494820951780143140256625626909176959305680458786553388553625107116000046915830257816726584619722724548281507789670906469943008403108740971884016290536011259624824529432103615715080701414809437629318331795425002605292373621627076391706037344544674952930655508373167888426163639927117607800531055216745789633238812571951209824449373098738720474325546029838911536645854016658738287810538248535167943048401518088275884360994522447081283976255652056784266085927715913003002338073550339930729586019785550085812920084971952784494047195550542735273493532002238413738149969014369426810026122179115108673871125270161926777717396432145405190153796887093089464334888549712739386389592196847353891969079332035195409407770223110583450176399233173852721971468580226906388907065682291829282704839192753805483705821353760692262887904886094845663298931749956912234282496355841821173993089465864518791095006027639953496042767023689710197973028632817727184498866638467324567019639370889678900544190692386873018896672246736368046857567131910710307733567633067262431422838734848 = 227244752509208666244300049023218501664936565624639514576438496566054518678090377206901554539288856996266272083253480866328891556584179795376043393179397439111001075307836347541226766323267029349186809052518632576630669501495499405707466858562586370759397778341785328471459982484138277798132149400741413820728000336730772425983653136275550466388207243285757977918 * 488949073121539043178895838851406898867685260673611653386208527301320256750945098442460084709113182789107256214464892052884956575049058585335894004824225556978848052700870382486432811126512032810019093103514678580988403100655092773497194921652262878660895819095201202957727846410635903499346091564032100080922029110666198856874372068909543752725127446446275130141704384403153173213642973260347547120927518593494548154693042687813126757638259919480596660015785042800818110628659177941014457037586017066097793890372685757581701160719984224943404424801688527463891923678834379546779645545606419569427574824563111913480237499286872496822079247464512671595600027881545516790726794520875456846025695904828856824122680691695622222780388482248652558345646407544371724549307907546148267185543556530668197490478774267191002528938567372851525085232721844598755176527243119768654561142351728852269053413174476938571816914419435443895662361267309023154705765158795604804766486473731465844362909575530822196003981963565054860681271890620405191718478159479477685388913902388171544282368568608479247952664643776459165494556000797676507398085908822491136.
+16#aeb17ba36a5a62ac6f0aad2b264d0787363825b9f0edf1ddd6e3d06eb970b70c90d5a43da0e234d85a2bd692ac118318965a1fa855019b8c65f32487755dc5677e27863aa4e4a6a82a76884c4d5d78f5b7807151b0179ee3b387b2118211610d832d1e7367a0e3cd50cce3ce2810e3567fc3fddf180c5ccd0572dc0f8662ef54e864e6182c3f951deff6d4a6cead4322e9bf3d55276f9dbdab649fa18fbdeaa89c002e037bb9090b1a5907ab6d18de09f8f376efdc0341ae360aa732405bf83cfe8342d644443208cfb8ef0568cd597de1ce7389878e48863bf0ebf1538ce2c317d8ac9f81976ae51617d7f6939582a8c28375caab30052d8ddf1b2995fb3891ea4541ef3d92bff37b6726052e8d7530b1f64a3cdfbba9cc320b55b2504417ff21986ceaaab8d4f73fafca6076e04fda786562571c5482b1f06b9b2762f51f3c1734284916153b377f147feb9ab398cf9ee46ba272c0ec8685f5a3832ff4e32aca370591f68bf38523839bd7367ebe02170150e87c69c3ff0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 = 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d3734c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 * 16#aeb17ba36a5a62ace6406c497e8f672a2868e5136afa7da54eeb841bd62cc3c37a19b21ca3421a2bcb9adc043a78dc70d23fcb0bf761ee620df5f22122beadfc580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c8cb370962d3b988382e6491831cb48fa4ddf32deb2fdbca9e64763a862beef086fb51183fb9e4af5b71ca36ee3159551bcbfdf55a685e238faef19254d9350dcd811af2dbf8859e4ce82bdb8632091e0100000000000000000000000000000000000000000000000000000000000000000000000000000000.
+16#34c8f69d2c4677c6d19b6e7ce34b705bd0be4db83a7e980e81ca31c352a076a32d17ccd3b115ce49dd214d2da4d36ea7ae1bbcc23ae3f69c1ca949af6143cea35124d82ffedc501525ca169af0b58ffb580f5ce6841aac67112741f9b489ab0b52b846445dfb0fd1fb1c2a89adf6069f90a26240c4a5d2c9 = 16#34c8f69d2c4677c7d19b6e7ce34b705b220cd214d02435619b89c579d4110f7904aee7c0461b50a48e35c911cea6aae434020aa597a1dc70510e6dab26caf2327ee50d24077a61b317d42479cdf6e1ff00000000000000000000000000000000000000000000000000000000000000000000000000000000 - 16#ffffffffffffffff514e845c95a59d5319bf93b6817098d5d7971aec9505825ab1147be429d33c3c85e64de35cbde5d4346523fbc587238f2dc034f4089e119df20a0ddedd415203a7f0a3197be55398eed8be064b7654f4ad47b9bba204f02e04e3d5765209f9606f5d9dbf3b5a2d37.
-- 
2.35.3

openSUSE Build Service is sponsored by