File 54b41e87156bc823d5938749d71c4c57adc75b1b.patch of Package epub2txt2

From 54b41e87156bc823d5938749d71c4c57adc75b1b Mon Sep 17 00:00:00 2001
From: Kevin Boone <kevin@railwayterrace.com>
Date: Thu, 30 Jun 2022 15:41:59 +0100
Subject: [PATCH] Fixed handling of URL-encoded spine hrefs

---
 README.md      |  1 +
 src/epub2txt.c |  3 ++-
 src/util.c     | 44 +++++++++++++++++++++++++++++++++++++++++++-
 src/util.h     |  6 +++++-
 src/xhtml.c    |  5 +++++
 5 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 4fbcafc..546242f 100644
--- a/README.md
+++ b/README.md
@@ -244,6 +244,7 @@ covered.
 
 Date | Change
 -----|-------
+?,&nbsp;Jun&nbsp;2022 | Fixed handling of URL-encoded spine href's 
 2.06,&nbsp;Jun&nbsp;2022 | Fixed bug in invoking unzip 
 2.05,&nbsp;Apr&nbsp;2022 | Fixed bug with empty metadata tags 
 2.04,&nbsp;Apr&nbsp;2022 | Improved handling of UTF-8 BOMs 
diff --git a/src/epub2txt.c b/src/epub2txt.c
index 7e9f4a1..72e0504 100644
--- a/src/epub2txt.c
+++ b/src/epub2txt.c
@@ -312,7 +312,8 @@ List *epub2txt_get_items (const char *opf, char **error)
 			char *val2 = r3->attributes[p].value;
 			if (strcmp (name2, "href") == 0)
 			  {
-			  list_append (ret, strdup (val2));
+                          char *decoded_val2 = decode_url (val2);
+			  list_append (ret, decoded_val2);
 			  }
 			}
 		      }
diff --git a/src/util.c b/src/util.c
index 853343c..16e7431 100644
--- a/src/util.c
+++ b/src/util.c
@@ -1,12 +1,14 @@
 /*============================================================================
   epub2txt v2 
   util.c
-  Copyright (c)2022 Marco Bonelli, GPL v3.0
+  Copyright (c)2022 Marco Bonelli, Kevin Boone, GPL v3.0
 ============================================================================*/
 
 #include <errno.h>
 #include <string.h>
 #include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
 #include <signal.h>
 #include <sys/wait.h>
 #include "util.h"
@@ -16,6 +18,7 @@
 run_command
 Run an helper command through fork + execvp, wait for it to finish and return
 its status. Log execvp errors, and abort execution if abort_on_error is TRUE.
+(Marco Bonelli)
 *==========================================================================*/
 int run_command (const char *const argv[], BOOL abort_on_error)
   {
@@ -39,3 +42,42 @@ int run_command (const char *const argv[], BOOL abort_on_error)
   waitpid (pid, &status, 0);
   return status;
   }
+
+/*==========================================================================
+  Decode %xx in URL-type strings. The caller must free the resulting
+  string, which will be no longer than the input. 
+  (Kevin Boone)
+*==========================================================================*/
+char *decode_url (const char *url)
+  {
+  char *ret = malloc (strlen (url) + 2);
+
+  int len = 0;
+  for (; *url; len++) 
+    {
+    if (*url == '%' && url[1] && url[2] && 
+        isxdigit(url[1]) && isxdigit(url[2])) 
+      {
+      char url1 = url[1];
+      char url2 = url[2];
+      url1 -= url1 <= '9' ? '0' : (url1 <= 'F' ? 'A' : 'a')-10;
+      url2 -= url2 <= '9' ? '0' : (url2 <= 'F' ? 'A' : 'a')-10;
+      ret[len] = 16 * url1 + url2;
+      url += 3;
+      continue;
+      }
+    else if (*url == '+')
+      {
+      /* I have not tested this piece of the function, because I have not
+         seen any instances of '+' (meaning space) in a spine href */
+      url += 1;
+      ret[len] = ' '; 
+      }
+    ret[len] = *url++;
+    }
+  ret[len] = '\0';
+
+  return ret;
+  }
+
+
diff --git a/src/util.h b/src/util.h
index 2685a02..6b0c197 100644
--- a/src/util.h
+++ b/src/util.h
@@ -1,7 +1,7 @@
 /*============================================================================
   epub2txt v2 
   util.h
-  Copyright (c)2022 Marco Bonelli, GPL v3.0
+  Copyright (c)2022 Marco Bonelli, Kevin Boone GPL v3.0
 ============================================================================*/
 
 #pragma once
@@ -9,3 +9,7 @@
 #include "defs.h"
 
 int run_command (const char *const argv[], BOOL abort_on_error);
+
+/** Decode %xx in URL-type strings. The caller must free the resulting
+    string, which will be no longer than the input. */
+char *decode_url (const char *url);
diff --git a/src/xhtml.c b/src/xhtml.c
index 1338882..fbfceae 100644
--- a/src/xhtml.c
+++ b/src/xhtml.c
@@ -530,6 +530,8 @@ WString *xhtml_transform_char (uint32_t c, BOOL to_ascii)
 ============================================================================*/
 WString *xhtml_translate_entity (const WString *entity)
   {
+  /* Program flow in this function is very ugly, and prone to memory
+     leaks when modified. The whole thing needs to be rewritten */
   char out[20];
   IN
   char *in = wstring_to_utf8 (entity);
@@ -569,8 +571,11 @@ WString *xhtml_translate_entity (const WString *entity)
       WString *ret = wstring_create_empty();
       wstring_append_c (ret, (uint32_t)v);
       OUT
+      free (s);
+      free (in);
       return ret; 
       } 
+    free (s);
     }
   else 
     {
openSUSE Build Service is sponsored by