diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py
index c42c0d4f5e9bc2..3e012aa685d9cd 100644
--- a/Lib/test/test_bytes.py
+++ b/Lib/test/test_bytes.py
@@ -584,6 +584,87 @@ def test_hex_separator_six_bytes(self):
         self.assertEqual(six_bytes.hex(':', -6), '0306090c0f12')
         self.assertEqual(six_bytes.hex(' ', -95), '0306090c0f12')
 
+    def test_hex_simd_boundaries(self):
+        # Test lengths around the SIMD threshold (16 bytes).
+        # SIMD processes 16 bytes at a time; smaller inputs use scalar code.
+        for length in (14, 15, 16, 17, 31, 32, 33, 64, 65):
+            data = self.type2test(bytes(range(length)))
+            expected = ''.join(f'{b:02x}' for b in range(length))
+            with self.subTest(length=length):
+                self.assertEqual(data.hex(), expected)
+
+    def test_hex_nibble_boundaries(self):
+        # Test the nibble value boundary at 9/10 (where '9' becomes 'a').
+        # SIMD uses signed comparison for efficiency; verify correctness
+        # at this boundary for various nibble combinations.
+        boundary_bytes = self.type2test(bytes([
+            0x09,  # nibbles: 0, 9
+            0x0a,  # nibbles: 0, 10
+            0x90,  # nibbles: 9, 0
+            0x99,  # nibbles: 9, 9 (max all-digit)
+            0x9a,  # nibbles: 9, 10
+            0xa0,  # nibbles: 10, 0
+            0xa9,  # nibbles: 10, 9
+            0xaa,  # nibbles: 10, 10 (min all-letter)
+            0x00,  # min value
+            0xff,  # max value
+        ]))
+        self.assertEqual(boundary_bytes.hex(), '090a90999aa0a9aa00ff')
+
+        # Repeat with 16+ bytes to exercise the SIMD path.
+        simd_boundary = self.type2test(boundary_bytes * 2)
+        self.assertEqual(simd_boundary.hex(), '090a90999aa0a9aa00ff' * 2)
+
+    def test_hex_simd_separator(self):
+        # Test the SIMD path for separator insertion (abs(bytes_per_sep) >= 8, len >= 16).
+        # SIMD hexlifies, then shuffles in-place to insert separators.
+
+        # 32 bytes exercises SIMD; test various separator group sizes.
+        data = self.type2test(bytes(range(32)))
+
+        # bytes_per_sep=8: 4 groups of 8 bytes, 3 separators.
+        self.assertEqual(
+            data.hex('-', 8),
+            '0001020304050607-08090a0b0c0d0e0f-'
+            '1011121314151617-18191a1b1c1d1e1f'
+        )
+        # bytes_per_sep=9: groups of 9 counted from the end, 5-byte remainder at the start.
+        self.assertEqual(
+            data.hex('.', 9),
+            '0001020304.05060708090a0b0c0d.'
+            '0e0f10111213141516.1718191a1b1c1d1e1f'
+        )
+        # bytes_per_sep=16: 2 groups of 16 bytes.
+        self.assertEqual(
+            data.hex(' ', 16),
+            '000102030405060708090a0b0c0d0e0f '
+            '101112131415161718191a1b1c1d1e1f'
+        )
+        # Negative bytes_per_sep: groups counted from the start, remainder at the end.
+        self.assertEqual(
+            data.hex('|', -8),
+            '0001020304050607|08090a0b0c0d0e0f|'
+            '1011121314151617|18191a1b1c1d1e1f'
+        )
+        self.assertEqual(
+            data.hex('_', -9),
+            '000102030405060708_090a0b0c0d0e0f1011_'
+            '12131415161718191a_1b1c1d1e1f'
+        )
+
+        # 20 bytes: SIMD (16) + 4-byte scalar remainder.
+        data20 = self.type2test(bytes(range(20)))
+        # Positive: groups from the end, remainder at the start.
+        self.assertEqual(
+            data20.hex('#', 8),
+            '00010203#0405060708090a0b#0c0d0e0f10111213'
+        )
+        # Negative: groups from the start, remainder at the end.
+        self.assertEqual(
+            data20.hex('@', -8),
+            '0001020304050607@08090a0b0c0d0e0f@10111213'
+        )
+
     def test_join(self):
         self.assertEqual(self.type2test(b"").join([]), b"")
         self.assertEqual(self.type2test(b"").join([b""]), b"")
diff --git a/Python/pystrhex.c b/Python/pystrhex.c
index af2f5c5dce5fca..85617fb67c69dd 100644
--- a/Python/pystrhex.c
+++ b/Python/pystrhex.c
@@ -4,6 +4,126 @@
 #include "pycore_strhex.h"        // _Py_strhex_with_sep()
 #include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
 
+/* Scalar hexlify: convert len bytes to 2*len hex characters.
+   Uses table lookup via Py_hexdigits for the conversion. */
+static inline void
+_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
+{
+    /* Various optimizations like using math instead of a table lookup,
+       manually unrolling the loop, storing the global table pointer locally,
+       and doing wider dst writes have been tried and benchmarked; all produced
+       nearly identical performance on gcc 15. Using a 256 entry uint16_t
+       table was a bit slower. So we keep our old simple and obvious code. */
+    for (Py_ssize_t i = 0; i < len; i++) {
+        unsigned char c = src[i];
+        *dst++ = Py_hexdigits[c >> 4];
+        *dst++ = Py_hexdigits[c & 0x0f];
+    }
+}
+
+/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
+   Uses __builtin_shufflevector for portable interleave that compiles to
+   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
+   NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
+   for the target microarch allow it [try -march=native if running 32-bit
+   on an rpi3 or later]).
+
+   Requirements:
+   - GCC 12+ or Clang 3.0+ (for __builtin_shufflevector)
+   - x86-64, ARM64, or ARM32 with NEON
+
+   Performance:
+   - Up to 11x faster on larger data than the scalar code.
+   - For more common small data it varies between 1.1-3x faster.
+
+   Even faster is possible for big data using AVX2 or AVX512 but
+   that adds complication. Honestly, who really hexes _huge_ data?!
+
+   Speeding up the 16-64 byte cases fits nicely with md5 through sha512.
+*/
+#if (defined(__x86_64__) || defined(__aarch64__) || \
+     (defined(__arm__) && defined(__ARM_NEON))) && \
+    (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12))
+# define PY_HEXLIFY_CAN_COMPILE_SIMD 1
+#else
+# define PY_HEXLIFY_CAN_COMPILE_SIMD 0
+#endif
+
+#if PY_HEXLIFY_CAN_COMPILE_SIMD
+
+/* 128-bit vector of 16 unsigned bytes */
+typedef unsigned char v16u8 __attribute__((vector_size(16)));
+/* 128-bit vector of 16 signed bytes - for efficient comparison.
+   Using signed comparison generates pcmpgtb on x86-64 instead of
+   the slower psubusb+pcmpeqb sequence from unsigned comparison.
+   ARM NEON performs the same either way. */
+typedef signed char v16s8 __attribute__((vector_size(16)));
+
+/* Splat a byte value across all 16 lanes */
+static inline v16u8
+v16u8_splat(unsigned char x)
+{
+    return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
+}
+
+static inline v16s8
+v16s8_splat(signed char x)
+{
+    return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
+}
+
+/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
+   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
+static void
+_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
+{
+    const v16u8 mask_0f = v16u8_splat(0x0f);
+    const v16u8 ascii_0 = v16u8_splat('0');
+    const v16u8 offset = v16u8_splat('a' - '0' - 10);  /* 0x27 */
+    const v16s8 nine = v16s8_splat(9);
+
+    Py_ssize_t i = 0;
+
+    /* Process 16 bytes at a time */
+    for (; i + 16 <= len; i += 16, dst += 32) {
+        /* Load 16 bytes (memcpy for safe unaligned access) */
+        v16u8 data;
+        memcpy(&data, src + i, 16);
+
+        /* Extract high and low nibbles using vector operators */
+        v16u8 hi = (data >> 4) & mask_0f;
+        v16u8 lo = data & mask_0f;
+
+        /* Compare > 9 using signed comparison for efficient codegen.
+           Nibble values 0-15 are safely in signed byte range.
+           This generates pcmpgtb on x86-64, avoiding the slower
+           psubusb+pcmpeqb sequence from unsigned comparison. */
+        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
+        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);
+
+        /* Convert nibbles to hex ASCII */
+        hi = hi + ascii_0 + (hi_gt9 & offset);
+        lo = lo + ascii_0 + (lo_gt9 & offset);
+
+        /* Interleave hi/lo nibbles using portable shufflevector.
+           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
+           or vzip on ARM32. */
+        v16u8 result0 = __builtin_shufflevector(hi, lo,
+            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+        v16u8 result1 = __builtin_shufflevector(hi, lo,
+            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+        /* Store 32 hex characters */
+        memcpy(dst, &result0, 16);
+        memcpy(dst + 16, &result1, 16);
+    }
+
+    /* Scalar fallback for remaining 0-15 bytes */
+    _Py_hexlify_scalar(src + i, dst, len - i);
+}
+
+#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
+
 static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
                                  PyObject* sep, int bytes_per_sep_group,
                                  const int return_bytes)
@@ -82,13 +202,16 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
     unsigned char c;
 
     if (bytes_per_sep_group == 0) {
-        for (i = j = 0; i < arglen; ++i) {
-            assert((j + 1) < resultlen);
-            c = argbuf[i];
-            retbuf[j++] = Py_hexdigits[c >> 4];
-            retbuf[j++] = Py_hexdigits[c & 0x0f];
+#if PY_HEXLIFY_CAN_COMPILE_SIMD
+        if (arglen >= 16) {
+            // little vector units go brrrr...
+            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
+        }
+        else
+#endif
+        {
+            _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
         }
-        assert(j == resultlen);
     }
     else {
         /* The number of complete chunk+sep periods */
@@ -96,6 +219,50 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
         Py_ssize_t chunk;
         unsigned int k;
 
+#if PY_HEXLIFY_CAN_COMPILE_SIMD
+        /* SIMD path for separator groups >= 8 bytes.
+           SIMD hexlify to output buffer, then shuffle in-place to insert
+           separators. Working backwards avoids overlap issues since we're
+           expanding (destination index >= source index). */
+        if (abs_bytes_per_sep >= 8 && arglen >= 16) {
+            /* SIMD hexlify all bytes to start of output buffer */
+            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
+
+            /* Shuffle in-place, working backwards */
+            Py_ssize_t hex_chunk_size = 2 * (Py_ssize_t)abs_bytes_per_sep;
+            Py_ssize_t remainder_bytes = arglen - chunks * (Py_ssize_t)abs_bytes_per_sep;
+            Py_ssize_t remainder_hex_len = 2 * remainder_bytes;
+            Py_ssize_t hex_pos = 2 * arglen;  /* End of hex data */
+            Py_ssize_t out_pos = resultlen;   /* End of output */
+
+            if (bytes_per_sep_group < 0) {
+                /* Forward: remainder at end, separators after each chunk */
+                if (remainder_hex_len > 0) {
+                    hex_pos -= remainder_hex_len;
+                    out_pos -= remainder_hex_len;
+                    memmove(retbuf + out_pos, retbuf + hex_pos, remainder_hex_len);
+                }
+                for (Py_ssize_t c = chunks - 1; c >= 0; c--) {
+                    retbuf[--out_pos] = sep_char;
+                    hex_pos -= hex_chunk_size;
+                    out_pos -= hex_chunk_size;
+                    memmove(retbuf + out_pos, retbuf + hex_pos, hex_chunk_size);
+                }
+            }
+            else {
+                /* Backward: remainder at start, separators before each chunk */
+                for (Py_ssize_t c = chunks - 1; c >= 0; c--) {
+                    hex_pos -= hex_chunk_size;
+                    out_pos -= hex_chunk_size;
+                    memmove(retbuf + out_pos, retbuf + hex_pos, hex_chunk_size);
+                    retbuf[--out_pos] = sep_char;
+                }
+                /* Remainder at start stays in place (hex_pos == out_pos == remainder_hex_len) */
+            }
+            goto done_hexlify;
+        }
+#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
+
         if (bytes_per_sep_group < 0) {
             i = j = 0;
             for (chunk = 0; chunk < chunks; chunk++) {
@@ -133,6 +300,10 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
         }
     }
 
+#if PY_HEXLIFY_CAN_COMPILE_SIMD
+done_hexlify:
+#endif
+
 #ifdef Py_DEBUG
     if (!return_bytes) {
         assert(_PyUnicode_CheckConsistency(retval, 1));
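
Not part of the patch: a minimal sanity check of the grouping semantics the in-place separator shuffle must preserve. Positive bytes_per_sep counts groups from the right, leaving the remainder at the start; negative counts from the left, leaving the remainder at the end. The expected strings are copied from test_hex_simd_separator above; `data` is just an example 20-byte input.

    # 20 bytes with 8-byte groups: the 4-byte remainder moves depending on sign.
    data = bytes(range(20))
    assert data.hex('#', 8) == '00010203#0405060708090a0b#0c0d0e0f10111213'
    assert data.hex('@', -8) == '0001020304050607@08090a0b0c0d0e0f@10111213'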