diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index c42c0d4f5e9bc2..3e012aa685d9cd 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -584,6 +584,87 @@ def test_hex_separator_six_bytes(self):
         self.assertEqual(six_bytes.hex(':', -6), '0306090c0f12')
         self.assertEqual(six_bytes.hex(' ', -95), '0306090c0f12')
 
+    def test_hex_simd_boundaries(self):
+        # Test lengths around the SIMD threshold (16 bytes).
+        # SIMD processes 16 bytes at a time; smaller inputs use scalar code.
+        for length in (14, 15, 16, 17, 31, 32, 33, 64, 65):
+            data = self.type2test(bytes(range(length)))
+            expected = ''.join(f'{b:02x}' for b in range(length))
+            with self.subTest(length=length):
+                self.assertEqual(data.hex(), expected)
+
+    def test_hex_nibble_boundaries(self):
+        # Test the nibble value boundary at 9/10 (where '9' becomes 'a').
+        # SIMD uses signed comparison for efficiency; verify correctness
+        # at this boundary for various nibble combinations.
+        boundary_bytes = self.type2test(bytes([
+            0x09,  # nibbles: 0, 9
+            0x0a,  # nibbles: 0, 10
+            0x90,  # nibbles: 9, 0
+            0x99,  # nibbles: 9, 9 (max all-digit)
+            0x9a,  # nibbles: 9, 10
+            0xa0,  # nibbles: 10, 0
+            0xa9,  # nibbles: 10, 9
+            0xaa,  # nibbles: 10, 10 (min all-letter)
+            0x00,  # min value
+            0xff,  # max value
+        ]))
+        self.assertEqual(boundary_bytes.hex(), '090a90999aa0a9aa00ff')
+
+        # Repeat with 16+ bytes to exercise the SIMD path.
+        simd_boundary = self.type2test(boundary_bytes * 2)
+        self.assertEqual(simd_boundary.hex(), '090a90999aa0a9aa00ff' * 2)
+
+    def test_hex_simd_separator(self):
+        # Test the SIMD path for separator insertion (abs(bytes_per_sep) >= 8, len >= 16).
+        # SIMD hexlifies, then shuffles in-place to insert separators.
+
+        # 32 bytes exercises SIMD; test various separator group sizes.
+        data = self.type2test(bytes(range(32)))
+
+        # bytes_per_sep=8: 4 groups of 8 bytes, 3 separators.
+        self.assertEqual(
+            data.hex('-', 8),
+            '0001020304050607-08090a0b0c0d0e0f-'
+            '1011121314151617-18191a1b1c1d1e1f'
+        )
+        # bytes_per_sep=9: groups of 9 counted from the end, 5-byte remainder at the start.
+        self.assertEqual(
+            data.hex('.', 9),
+            '0001020304.05060708090a0b0c0d.'
+            '0e0f10111213141516.1718191a1b1c1d1e1f'
+        )
+        # bytes_per_sep=16: 2 groups of 16 bytes.
+        self.assertEqual(
+            data.hex(' ', 16),
+            '000102030405060708090a0b0c0d0e0f '
+            '101112131415161718191a1b1c1d1e1f'
+        )
+        # Negative bytes_per_sep: groups counted from the start, remainder at the end.
+        self.assertEqual(
+            data.hex('|', -8),
+            '0001020304050607|08090a0b0c0d0e0f|'
+            '1011121314151617|18191a1b1c1d1e1f'
+        )
+        self.assertEqual(
+            data.hex('_', -9),
+            '000102030405060708_090a0b0c0d0e0f1011_'
+            '12131415161718191a_1b1c1d1e1f'
+        )
+
+        # 20 bytes: SIMD (16) + 4-byte scalar remainder.
+        data20 = self.type2test(bytes(range(20)))
+        # Positive: groups from the end, remainder at the start.
+        self.assertEqual(
+            data20.hex('#', 8),
+            '00010203#0405060708090a0b#0c0d0e0f10111213'
+        )
+        # Negative: groups from the start, remainder at the end.
+        self.assertEqual(
+            data20.hex('@', -8),
+            '0001020304050607@08090a0b0c0d0e0f@10111213'
+        )
+
     def test_join(self):
         self.assertEqual(self.type2test(b"").join([]), b"")
         self.assertEqual(self.type2test(b"").join([b""]), b"")
diff --git a/Python/pystrhex.c b/Python/pystrhex.c
index af2f5c5dce5fca..85617fb67c69dd 100644
--- a/Python/pystrhex.c
+++ b/Python/pystrhex.c
@@ -4,6 +4,126 @@
 #include "pycore_strhex.h"        // _Py_strhex_with_sep()
 #include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
 
+/* Scalar hexlify: convert len bytes to 2*len hex characters.
+   Uses table lookup via Py_hexdigits for the conversion. */
+static inline void
+_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
+{
+    /* Various optimizations like using math instead of a table lookup,
+       manually unrolling the loop, storing the global table pointer locally,
+       and doing wider dst writes have been tried and benchmarked; all produced
+       nearly identical performance on gcc 15. Using a 256 entry uint16_t
+       table was a bit slower. So we keep our old simple and obvious code. */
+    for (Py_ssize_t i = 0; i < len; i++) {
+        unsigned char c = src[i];
+        *dst++ = Py_hexdigits[c >> 4];
+        *dst++ = Py_hexdigits[c & 0x0f];
+    }
+}
+
+/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
+   Uses __builtin_shufflevector for portable interleave that compiles to
+   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
+   NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
+   for the target microarch allow it [try -march=native if running 32-bit
+   on an rpi3 or later]).
+
+   Requirements:
+   - GCC 12+ or Clang 3.0+ (for __builtin_shufflevector)
+   - x86-64, ARM64, or ARM32 with NEON
+
+   Performance:
+   - Up to 11x faster on larger data than the scalar code.
+   - For more common small data it varies between 1.1-3x faster.
+
+   Even faster is possible for big data using AVX2 or AVX512 but
+   that adds complication. Honestly, who really hexes _huge_ data?!
+
+   Speeding up the 16-64 byte cases fits nicely with md5 through sha512.
+*/
+#if (defined(__x86_64__) || defined(__aarch64__) || \
+     (defined(__arm__) && defined(__ARM_NEON))) && \
+    (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12))
+# define PY_HEXLIFY_CAN_COMPILE_SIMD 1
+#else
+# define PY_HEXLIFY_CAN_COMPILE_SIMD 0
+#endif
+
+#if PY_HEXLIFY_CAN_COMPILE_SIMD
+
+/* 128-bit vector of 16 unsigned bytes */
+typedef unsigned char v16u8 __attribute__((vector_size(16)));
+/* 128-bit vector of 16 signed bytes - for efficient comparison.
+   Using signed comparison generates pcmpgtb on x86-64 instead of
+   the slower psubusb+pcmpeqb sequence from unsigned comparison.
+   ARM NEON performs the same either way. */
+typedef signed char v16s8 __attribute__((vector_size(16)));
+
+/* Splat a byte value across all 16 lanes */
+static inline v16u8
+v16u8_splat(unsigned char x)
+{
+    return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
+}
+
+static inline v16s8
+v16s8_splat(signed char x)
+{
+    return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
+}
+
+/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
+   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
+static void
+_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
+{
+    const v16u8 mask_0f = v16u8_splat(0x0f);
+    const v16u8 ascii_0 = v16u8_splat('0');
+    const v16u8 offset = v16u8_splat('a' - '0' - 10);  /* 0x27 */
+    const v16s8 nine = v16s8_splat(9);
+
+    Py_ssize_t i = 0;
+
+    /* Process 16 bytes at a time */
+    for (; i + 16 <= len; i += 16, dst += 32) {
+        /* Load 16 bytes (memcpy for safe unaligned access) */
+        v16u8 data;
+        memcpy(&data, src + i, 16);
+
+        /* Extract high and low nibbles using vector operators */
+        v16u8 hi = (data >> 4) & mask_0f;
+        v16u8 lo = data & mask_0f;
+
+        /* Compare > 9 using signed comparison for efficient codegen.
+           Nibble values 0-15 are safely in signed byte range.
+           This generates pcmpgtb on x86-64, avoiding the slower
+           psubusb+pcmpeqb sequence from unsigned comparison. */
+        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
+        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);
+
+        /* Convert nibbles to hex ASCII */
+        hi = hi + ascii_0 + (hi_gt9 & offset);
+        lo = lo + ascii_0 + (lo_gt9 & offset);
+
+        /* Interleave hi/lo nibbles using portable shufflevector.
+           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
+           or vzip on ARM32. */
+        v16u8 result0 = __builtin_shufflevector(hi, lo,
+            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+        v16u8 result1 = __builtin_shufflevector(hi, lo,
+            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        /* Store 32 hex characters */
+        memcpy(dst, &result0, 16);
+        memcpy(dst + 16, &result1, 16);
+    }
+
+    /* Scalar fallback for remaining 0-15 bytes */
+    _Py_hexlify_scalar(src + i, dst, len - i);
+}
+
+#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
+
 static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
                                  PyObject* sep, int bytes_per_sep_group,
                                  const int return_bytes)
@@ -82,13 +202,16 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
     unsigned char c;
 
     if (bytes_per_sep_group == 0) {
-        for (i = j = 0; i < arglen; ++i) {
-            assert((j + 1) < resultlen);
-            c = argbuf[i];
-            retbuf[j++] = Py_hexdigits[c >> 4];
-            retbuf[j++] = Py_hexdigits[c & 0x0f];
+#if PY_HEXLIFY_CAN_COMPILE_SIMD
+        if (arglen >= 16) {
+            // little vector units go brrrr...
+            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
+        }
+        else
+#endif
+        {
+            _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
         }
-        assert(j == resultlen);
     }
     else {
         /* The number of complete chunk+sep periods */
@@ -96,6 +219,50 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
         Py_ssize_t chunk;
         unsigned int k;
 
+#if PY_HEXLIFY_CAN_COMPILE_SIMD
+        /* SIMD path for separator groups >= 8 bytes.
+           SIMD hexlify to output buffer, then shuffle in-place to insert
+           separators. Working backwards avoids overlap issues since we're
+           expanding (destination index >= source index). */
+        if (abs_bytes_per_sep >= 8 && arglen >= 16) {
+            /* SIMD hexlify all bytes to start of output buffer */
+            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
+
+            /* Shuffle in-place, working backwards */
+            Py_ssize_t hex_chunk_size = 2 * (Py_ssize_t)abs_bytes_per_sep;
+            Py_ssize_t remainder_bytes = arglen - chunks * (Py_ssize_t)abs_bytes_per_sep;
+            Py_ssize_t remainder_hex_len = 2 * remainder_bytes;
+            Py_ssize_t hex_pos = 2 * arglen;  /* End of hex data */
+            Py_ssize_t out_pos = resultlen;   /* End of output */
+
+            if (bytes_per_sep_group < 0) {
+                /* Forward: remainder at end, separators after each chunk */
+                if (remainder_hex_len > 0) {
+                    hex_pos -= remainder_hex_len;
+                    out_pos -= remainder_hex_len;
+                    memmove(retbuf + out_pos, retbuf + hex_pos, remainder_hex_len);
+                }
+                for (Py_ssize_t c = chunks - 1; c >= 0; c--) {
+                    retbuf[--out_pos] = sep_char;
+                    hex_pos -= hex_chunk_size;
+                    out_pos -= hex_chunk_size;
+                    memmove(retbuf + out_pos, retbuf + hex_pos, hex_chunk_size);
+                }
+            }
+            else {
+                /* Backward: remainder at start, separators before each chunk */
+                for (Py_ssize_t c = chunks - 1; c >= 0; c--) {
+                    hex_pos -= hex_chunk_size;
+                    out_pos -= hex_chunk_size;
+                    memmove(retbuf + out_pos, retbuf + hex_pos, hex_chunk_size);
+                    retbuf[--out_pos] = sep_char;
+                }
+                /* Remainder at start stays in place (hex_pos == out_pos == remainder_hex_len) */
+            }
+            goto done_hexlify;
+        }
+#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
+
         if (bytes_per_sep_group < 0) {
             i = j = 0;
             for (chunk = 0; chunk < chunks; chunk++) {
@@ -133,6 +300,10 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
         }
     }
 
+#if PY_HEXLIFY_CAN_COMPILE_SIMD
+done_hexlify:
+#endif
+
 #ifdef Py_DEBUG
     if (!return_bytes) {
         assert(_PyUnicode_CheckConsistency(retval, 1));
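
Not part of the patch: a minimal sanity check of the grouping semantics the in-place separator shuffle must preserve. Positive bytes_per_sep counts groups from the right, leaving the remainder at the start; negative counts from the left, leaving the remainder at the end. The expected strings are copied from test_hex_simd_separator above; `data` is just an example 20-byte input.

    # 20 bytes with 8-byte groups: the 4-byte remainder moves depending on sign.
    data = bytes(range(20))
    assert data.hex('#', 8) == '00010203#0405060708090a0b#0c0d0e0f10111213'
    assert data.hex('@', -8) == '0001020304050607@08090a0b0c0d0e0f@10111213'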