File lscsoft-glue-python-3.10-fixes.patch of Package lscsoft-glue

From 0df2bc1b9a994e5c3c741fa0925470a4183833b3 Mon Sep 17 00:00:00 2001
From: Duncan Macleod <duncan.macleod@ligo.org>
Date: Tue, 30 Nov 2021 16:42:29 +0000
Subject: [PATCH 1/2] glue.ligolw: fix python3.10 compatibility

This is a cross-project cherry-pick of https://git.ligo.org/kipp.cannon/python-ligo-lw/-/commit/93ecb764202ee8cb56722441eb339fd1e719b5aa

Co-authored-by: Kipp Cannon <kipp.cannon@ligo.org>
---
 glue/ligolw/tokenizer.RowDumper.c |   8 +-
 glue/ligolw/tokenizer.Tokenizer.c | 138 +++++++++++++++++++-----------
 2 files changed, 91 insertions(+), 55 deletions(-)

diff --git a/glue/ligolw/tokenizer.RowDumper.c b/glue/ligolw/tokenizer.RowDumper.c
index 796586fd..dfc97ff7 100644
--- a/glue/ligolw/tokenizer.RowDumper.c
+++ b/glue/ligolw/tokenizer.RowDumper.c
@@ -30,6 +30,8 @@
 #include <structmember.h>
 #include <stdlib.h>
 #include <tokenizer.h>
+#include <wchar.h>
+#include <wctype.h>
 
 
 /*
@@ -91,7 +93,7 @@ static void __del__(PyObject *self)
 static int __init__(PyObject *self, PyObject *args, PyObject *kwds)
 {
 	ligolw_RowDumper *rowdumper = (ligolw_RowDumper *) self;
-	Py_UNICODE default_delimiter = ',';
+	wchar_t default_delimiter = L',';
 
 	rowdumper->delimiter = NULL;
 	if(!PyArg_ParseTuple(args, "OO|U", &rowdumper->attributes, &rowdumper->formats, &rowdumper->delimiter))
@@ -100,7 +102,7 @@ static int __init__(PyObject *self, PyObject *args, PyObject *kwds)
 	if(rowdumper->delimiter)
 		Py_INCREF(rowdumper->delimiter);
 	else
-		rowdumper->delimiter = PyUnicode_FromUnicode(&default_delimiter, 1);
+		rowdumper->delimiter = PyUnicode_FromWideChar(&default_delimiter, 1);
 	rowdumper->attributes = llwtokenizer_build_attributes(rowdumper->attributes);
 	rowdumper->formats = llwtokenizer_build_formats(rowdumper->formats);
 	if(!rowdumper->delimiter || !rowdumper->attributes || !rowdumper->formats)
@@ -220,7 +222,7 @@ static PyObject *next(PyObject *self)
 		}
 
 		if(val == Py_None)
-			token = PyUnicode_FromUnicode(NULL, 0); /* u"" */
+			token = PyUnicode_FromWideChar(NULL, 0); /* u"" */
 		else
 			token = PyObject_CallFunctionObjArgs(PyTuple_GET_ITEM(rowdumper->formats, i), val, NULL);
 		Py_DECREF(val);
diff --git a/glue/ligolw/tokenizer.Tokenizer.c b/glue/ligolw/tokenizer.Tokenizer.c
index f38151e3..e5c25996 100644
--- a/glue/ligolw/tokenizer.Tokenizer.c
+++ b/glue/ligolw/tokenizer.Tokenizer.c
@@ -32,6 +32,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <tokenizer.h>
+#include <wchar.h>
+#include <wctype.h>
 
 
 /*
@@ -49,7 +51,7 @@
  */
 
 
-static const Py_UNICODE default_quote_characters[] = {'\'', '\"', '\0'};
+static const wchar_t default_quote_characters[] = {L'\'', L'\"', 0};
 
 
 /*
@@ -66,19 +68,19 @@ typedef struct {
 	/* the type to which the next parsed token will be converted */
 	PyObject **type;
 	/* delimiter character to be used in parsing */
-	Py_UNICODE delimiter;
+	wchar_t delimiter;
 	/* the character(s) to interpret as a quote character */
-	const Py_UNICODE *quote_characters;
+	const wchar_t *quote_characters;
 	/* the character to interpret as the escape character */
-	Py_UNICODE escape_character;
+	wchar_t escape_character;
 	/* size of internal buffer, minus null terminator */
 	Py_ssize_t allocation;
 	/* internal buffer */
-	Py_UNICODE *data;
+	wchar_t *data;
 	/* end of internal buffer's contents (null terminator) */
-	Py_UNICODE *length;
+	wchar_t *length;
 	/* current offset in buffer */
-	Py_UNICODE *pos;
+	wchar_t *pos;
 } ligolw_Tokenizer;
 
 
@@ -90,7 +92,12 @@ typedef struct {
 
 static int add_to_data(ligolw_Tokenizer *tokenizer, PyObject *unicode)
 {
+	/* FIXME:  remove _GET_SIZE vers. when we require Python >= 3.3 */
+#ifndef PyUnicode_GET_LENGTH
 	Py_ssize_t n = PyUnicode_GET_SIZE(unicode);
+#else
+	Py_ssize_t n = PyUnicode_GET_LENGTH(unicode);
+#endif
 
 	if(n) {
 		if(tokenizer->length - tokenizer->data + n > tokenizer->allocation) {
@@ -106,7 +113,7 @@ static int add_to_data(ligolw_Tokenizer *tokenizer, PyObject *unicode)
 			 * the null terminator
 			 */
 
-			Py_UNICODE *old_data = tokenizer->data;
+			wchar_t *old_data = tokenizer->data;
 
 			tokenizer->data = realloc(tokenizer->data, (tokenizer->allocation + n + 1) * sizeof(*tokenizer->data));
 			if(!tokenizer->data) {
@@ -132,7 +139,7 @@ static int add_to_data(ligolw_Tokenizer *tokenizer, PyObject *unicode)
 		 * terminator
 		 */
 
-		memcpy(tokenizer->length, PyUnicode_AsUnicode(unicode), n * sizeof(*tokenizer->length));
+		PyUnicode_AsWideChar(unicode, tokenizer->length, n);
 		tokenizer->length += n;
 		*tokenizer->length = 0;
 	}
@@ -184,7 +191,7 @@ static void unref_types(ligolw_Tokenizer *tokenizer)
  */
 
 
-static void parse_error(PyObject *exception, const Py_UNICODE *buffer, const ptrdiff_t buffer_length, const Py_UNICODE *pos, const char *msg)
+static void parse_error(PyObject *exception, const wchar_t *buffer, const ptrdiff_t buffer_length, const wchar_t *pos, const char *msg)
 {
 	PyObject *buffer_str;
 	PyObject *pos_str;
@@ -194,15 +201,15 @@ static void parse_error(PyObject *exception, const Py_UNICODE *buffer, const ptr
 	buffer_str = PyUnicode_Encode(buffer, buffer_length, NULL, NULL);
 	pos_str = PyUnicode_Encode(pos, 1, NULL, NULL);
 #else
-	buffer_str = PyUnicode_FromUnicode(buffer, buffer_length);
-	pos_str = PyUnicode_FromUnicode(pos, 1);
+	buffer_str = PyUnicode_FromWideChar(buffer, buffer_length);
+	pos_str = PyUnicode_FromWideChar(pos, 1);
 #endif
 
 	if(buffer_str && pos_str)
 #if PY_MAJOR_VERSION < 3
 		PyErr_Format(exception, "parse error in '%s' near '%s' at position %td: %s", PyString_AS_STRING(buffer_str), PyString_AS_STRING(pos_str), pos - buffer + 1, msg);
 #else
-		PyErr_Format(exception, "parse error in '%U' near '%U' at position %td: %s", buffer_str, pos_str, pos - buffer + 1, msg);
+		PyErr_Format(exception, "parse error in '%U' near '%U' at position %zd: %s", buffer_str, pos_str, (Py_ssize_t) (pos - buffer + 1), msg);
 #endif
 	else
 		PyErr_Format(exception, "parse error (details not available): %s", msg);
@@ -212,28 +219,14 @@ static void parse_error(PyObject *exception, const Py_UNICODE *buffer, const ptr
 }
 
 
-/*
- * Py_UNICODE equivalent of strchr()
- */
-
-
-static const Py_UNICODE *pyunicode_strchr(const Py_UNICODE *s, Py_UNICODE c)
-{
-	for(; *s; s++)
-		if(*s == c)
-			return s;
-	return NULL;
-}
-
-
 /*
  * Unescape a string.
  */
 
 
-static int unescape(Py_UNICODE *s, Py_UNICODE **end, const Py_UNICODE *escapable_characters, Py_UNICODE escape_character)
+static int unescape(wchar_t *s, wchar_t **end, const wchar_t *escapable_characters, wchar_t escape_character)
 {
-	Py_UNICODE *start = s;
+	wchar_t *start = s;
 	int escaped = 0;
 
 	while(*s) {
@@ -251,7 +244,7 @@ static int unescape(Py_UNICODE *s, Py_UNICODE **end, const Py_UNICODE *escapable
 		 * Check for an unrecognized escape sequence.
 		 */
 
-		if(!pyunicode_strchr(escapable_characters, *s)) {
+		if(!wcschr(escapable_characters, *s)) {
 			parse_error(PyExc_ValueError, start, *end - start - 1, s - 1, "unrecognized escape sequence");
 			return -1;
 		}
@@ -304,12 +297,12 @@ static int unescape(Py_UNICODE *s, Py_UNICODE **end, const Py_UNICODE *escapable
  */
 
 
-static PyObject *next_token(ligolw_Tokenizer *tokenizer, Py_UNICODE **start, Py_UNICODE **end)
+static PyObject *next_token(ligolw_Tokenizer *tokenizer, wchar_t **start, wchar_t **end)
 {
-	Py_UNICODE *pos = tokenizer->pos;
-	Py_UNICODE *bailout = tokenizer->length;
+	wchar_t *pos = tokenizer->pos;
+	wchar_t *bailout = tokenizer->length;
 	PyObject *type = *tokenizer->type;
-	Py_UNICODE quote_character;
+	wchar_t quote_character;
 
 	/*
 	 * The following code matches the pattern:
@@ -337,10 +330,10 @@ static PyObject *next_token(ligolw_Tokenizer *tokenizer, Py_UNICODE **start, Py_
 
 	if(pos >= bailout)
 		goto stop_iteration;
-	while(Py_UNICODE_ISSPACE(*pos))
+	while(iswspace(*pos))
 		if(++pos >= bailout)
 			goto stop_iteration;
-	if(pyunicode_strchr(tokenizer->quote_characters, *pos)) {
+	if(wcschr(tokenizer->quote_characters, *pos)) {
 		/*
 		 * Found a quoted token.
 		 */
@@ -368,7 +361,7 @@ static PyObject *next_token(ligolw_Tokenizer *tokenizer, Py_UNICODE **start, Py_
 		quote_character = 0;
 
 		*start = pos;
-		while(!Py_UNICODE_ISSPACE(*pos) && (*pos != tokenizer->delimiter))
+		while(!iswspace(*pos) && (*pos != tokenizer->delimiter))
 			if(++pos >= bailout)
 				goto stop_iteration;
 		*end = pos;
@@ -382,7 +375,7 @@ static PyObject *next_token(ligolw_Tokenizer *tokenizer, Py_UNICODE **start, Py_
 			*start = *end = NULL;
 	}
 	while(*pos != tokenizer->delimiter) {
-		if(!Py_UNICODE_ISSPACE(*pos)) {
+		if(!iswspace(*pos)) {
 			parse_error(PyExc_ValueError, *start, tokenizer->length - *start - 1, pos, "expected whitespace or delimiter");
 			return NULL;
 		}
@@ -416,7 +409,7 @@ static PyObject *next_token(ligolw_Tokenizer *tokenizer, Py_UNICODE **start, Py_
 		**end = 0;
 	if(quote_character) {
 		/* FIXME:  remove the delimiter */
-		Py_UNICODE escapable_characters[] = {quote_character, tokenizer->escape_character, tokenizer->delimiter, '\0'};
+		wchar_t escapable_characters[] = {quote_character, tokenizer->escape_character, tokenizer->delimiter, 0};
 		if(unescape(*start, end, escapable_characters, tokenizer->escape_character))
 			return NULL;
 	}
@@ -453,6 +446,10 @@ static PyObject *append(PyObject *self, PyObject *data)
 	int fail;
 
 	if(PyUnicode_Check(data)) {
+		/* FIXME:  remove when we require Python >= 3.12 */
+#ifdef PyUnicode_READY
+		PyUnicode_READY(data);
+#endif
 		fail = add_to_data((ligolw_Tokenizer *) self, data);
 	/* FIXME:  remove when we require >= 3 */
 #if PY_MAJOR_VERSION < 3
@@ -508,12 +505,25 @@ static int __init__(PyObject *self, PyObject *args, PyObject *kwds)
 	if(!PyArg_ParseTuple(args, "U", &arg))
 		return -1;
 
+	/* FIXME:  remove when we require Python >= 3.12 */
+#ifdef PyUnicode_READY
+	PyUnicode_READY(arg);
+#endif
+
+	/* FIXME:  remove _GET_SIZE vers. when we require Python >= 3.3 */
+#ifndef PyUnicode_GET_LENGTH
 	if(PyUnicode_GET_SIZE(arg) != 1) {
 		PyErr_SetString(PyExc_ValueError, "len(delimiter) != 1");
 		return -1;
 	}
+#else
+	if(PyUnicode_GET_LENGTH(arg) != 1) {
+		PyErr_SetString(PyExc_ValueError, "len(delimiter) != 1");
+		return -1;
+	}
+#endif
 
-	tokenizer->delimiter = *PyUnicode_AS_UNICODE(arg);
+	PyUnicode_AsWideChar(arg, &tokenizer->delimiter, 1);
 	tokenizer->quote_characters = default_quote_characters;
 	tokenizer->escape_character = '\\';
 	tokenizer->types = malloc(1 * sizeof(*tokenizer->types));
@@ -552,7 +562,7 @@ static PyObject *next(PyObject *self)
 	ligolw_Tokenizer *tokenizer = (ligolw_Tokenizer *) self;
 	PyObject *type;
 	PyObject *token;
-	Py_UNICODE *start, *end;
+	wchar_t *start, *end;
 
 	/*
 	 * Identify the start and end of the next token.
@@ -576,23 +586,25 @@ static PyObject *next(PyObject *self)
 		Py_INCREF(Py_None);
 		token = Py_None;
 	} else if(type == (PyObject *) &PyFloat_Type) {
-		char ascii_buffer[end - start + 1];
-		char *ascii_end;
-		if(PyUnicode_EncodeDecimal(start, end - start, ascii_buffer, NULL))
-			return NULL;
-		token = PyFloat_FromDouble(strtod(ascii_buffer, &ascii_end));
-		if(ascii_end == ascii_buffer || *ascii_end != 0) {
+		wchar_t buffer[end - start + 1];
+		wchar_t *buffer_end;
+		memcpy(buffer, start, (void *) end - (void *) start);
+		buffer[end - start] = 0;
+		token = PyFloat_FromDouble(wcstod(buffer, &buffer_end));
+		if(buffer_end == buffer || *buffer_end != 0) {
 			/*
-			 * strtod() couldn't convert the token, emulate
+			 * wcstod() couldn't convert the token, emulate
 			 * float()'s error message
 			 */
 
 			Py_XDECREF(token);
-			PyErr_Format(PyExc_ValueError, "invalid literal for float(): '%s'", ascii_buffer);
+			token = PyUnicode_FromWideChar(buffer, -1);
+			PyErr_Format(PyExc_ValueError, "invalid literal for float(): '%U'", token);
+			Py_DECREF(token);
 			token = NULL;
 		}
 	} else if(type == (PyObject *) &PyUnicode_Type) {
-		token = PyUnicode_FromUnicode(start, end - start);
+		token = PyUnicode_FromWideChar(start, end - start);
 	/* FIXME:  remove when we require >= 3 */
 #if PY_MAJOR_VERSION < 3
 	} else if(type == (PyObject *) &PyString_Type) {
@@ -601,7 +613,29 @@ static PyObject *next(PyObject *self)
 		token = PyInt_FromUnicode(start, end - start, 0);
 #endif
 	} else if(type == (PyObject *) &PyLong_Type) {
-		token = PyLong_FromUnicode(start, end - start, 0);
+		wchar_t buffer[end - start + 1];
+		wchar_t *buffer_end;
+		memcpy(buffer, start, (void *) end - (void *) start);
+		buffer[end - start] = 0;
+		/* FIXME:  although Python supports arbitrary precision
+		 * integers, this can only handle numbers that fit into a C
+		 * long long.  in practice, since we invariably
+		 * interoperate with C codes, that should be sufficient,
+		 * but it's a limitation of the library and should probably
+		 * be fixed */
+		token = PyLong_FromLongLong(wcstoll(buffer, &buffer_end, 0));
+		if(buffer_end == buffer || *buffer_end != 0) {
+			/*
+			 * wcstoll() couldn't convert the token, emulate
+			 * long()'s error message
+			 */
+
+			Py_XDECREF(token);
+			token = PyUnicode_FromWideChar(buffer, -1);
+			PyErr_Format(PyExc_ValueError, "invalid literal for long(): '%U'", token);
+			Py_DECREF(token);
+			token = NULL;
+		}
 	} else {
 		token = PyObject_CallFunction(type, "u#", start, end - start);
 	}
@@ -679,7 +713,7 @@ static PyObject *attribute_get_data(PyObject *obj, void *data)
 {
 	ligolw_Tokenizer *tokenizer = (ligolw_Tokenizer *) obj;
 
-	return PyUnicode_FromUnicode(tokenizer->pos, tokenizer->length - tokenizer->pos);
+	return PyUnicode_FromWideChar(tokenizer->pos, tokenizer->length - tokenizer->pos);
 }
 
 
-- 
GitLab
Places

File lscsoft-glue-python-3.10-fixes.patch of Package lscsoft-glue

Places