File htdig-simpleUTF8.patch of Package htdig
Index: htnet/HtHTTP.cc
===================================================================
--- htnet/HtHTTP.cc.orig
+++ htnet/HtHTTP.cc
@@ -643,6 +643,8 @@
String line = 0;
int inHeader = 1;
+ _needUTF8Convert = 0; // cleared here; set to 1 later only if Content-Type is text/html with a UTF-8 charset
+
if (_response._modification_time)
{
delete _response._modification_time;
@@ -731,8 +733,15 @@
token = strtok(token, "\n\t");
if (token && *token)
+ {
_response._content_type = token;
-
+ // Media type and charset tokens are case-insensitive ("charset=utf-8" is common), so match ignoring case.
+ if (mystrcasestr(token, "text/html") && mystrcasestr(token, "utf-8"))
+ {
+ if ( debug > 4 ) cout << "needUTF8Convert flagged" << endl;
+ _needUTF8Convert = 1;
+ }
+ }
}
else if( ! mystrncasecmp((char*)line, "content-length:", 15))
{
@@ -970,6 +979,57 @@
}
+ if ( _needUTF8Convert )
+ {
+     // Simple in-place down-conversion of the body from UTF-8 to one byte
+     // per character.  Code points U+0080..U+00FF are stored as the byte of
+     // the same value (ISO-8859-1); every other non-ASCII character, and
+     // every malformed sequence, becomes a single '?'.  Each step consumes
+     // at least one byte and emits exactly one, so dstPtr never passes srcPtr.
+     if ( debug > 4 )
+         cout << "Converting UTF-8 characters" << endl;
+
+     char *srcPtr, *dstPtr;
+     srcPtr = dstPtr = _response._contents.get();
+     while ( *srcPtr )
+     {
+         unsigned char lead = (unsigned char) *srcPtr++;
+         if ( lead < 0x80 )
+         {
+             *dstPtr++ = (char) lead;    // plain ASCII is copied unchanged
+             continue;
+         }
+
+         // Continuation bytes announced by the lead byte.  A stray
+         // continuation byte (10xxxxxx) or an invalid lead announces none.
+         int follow = 0;
+         if ( ( lead & 0xE0 ) == 0xC0 ) follow = 1;
+         else if ( ( lead & 0xF0 ) == 0xE0 ) follow = 2;
+         else if ( ( lead & 0xF8 ) == 0xF0 ) follow = 3;
+
+         // Consume only genuine continuation bytes, so a truncated sequence
+         // can never step over the terminating NUL.  Payload mask: 1F/0F/07.
+         unsigned int codePoint = lead & ( 0x3F >> follow );
+         int seen = 0;
+         while ( seen < follow && ( *srcPtr & 0xC0 ) == 0x80 )
+         {
+             codePoint = ( codePoint << 6 ) | ( *srcPtr++ & 0x3F );
+             seen++;
+         }
+
+         // Only a complete two-byte sequence in U+0080..U+00FF fits in one byte.
+         if ( follow == 1 && seen == 1 && codePoint >= 0x80 && codePoint <= 0xFF )
+             *dstPtr++ = (char) codePoint;
+         else
+             *dstPtr++ = '?';
+     }
+     *dstPtr = 0;
+
+     // Shrink the String by the number of bytes saved; otherwise length()
+     // (used for the document length below) still counts the stale tail.
+     _response._contents.chop( (int) ( srcPtr - dstPtr ) );
+ }
+
// Set document length
_response._document_length = _response._contents.length();
Index: htnet/HtHTTP.h
===================================================================
--- htnet/HtHTTP.h.orig
+++ htnet/HtHTTP.h
@@ -316,6 +316,7 @@
int _bytes_read; // Bytes read
URL _url; // URL to retrieve
URL _referer; // Referring URL
+ int _needUTF8Convert; // Nonzero when the response body must go through the simple UTF-8 conversion
String _accept_language; // accept-language directive