Skip to content
Snippets Groups Projects
Select Git revision
  • e79765972a1de7a959159f43a54a7e7d11694bac
  • main default protected
  • master
3 results

SDL_config.h.in

Blame
  • speedups.c 5.70 KiB
    /* C implementation of performance sensitive functions. */
    
    #define PY_SSIZE_T_CLEAN
    #include <Python.h>
    #include <stdint.h> /* uint8_t, uint32_t, uint64_t */
    
    #if __ARM_NEON
    #include <arm_neon.h>
    #elif __SSE2__
    #include <emmintrin.h>
    #endif
    
    static const Py_ssize_t MASK_LEN = 4;
    
    /* Similar to PyBytes_AsStringAndSize, but accepts more types */
    
    static int
    _PyBytesLike_AsStringAndSize(PyObject *obj, PyObject **tmp, char **buffer, Py_ssize_t *length)
    {
        // This supports bytes, bytearrays, and memoryview objects,
        // which are common data structures for handling byte streams.
        // websockets.framing.prepare_data() returns only these types.
        // If *tmp isn't NULL, the caller gets a new reference.
        if (PyBytes_Check(obj))
        {
            *tmp = NULL;
            *buffer = PyBytes_AS_STRING(obj);
            *length = PyBytes_GET_SIZE(obj);
        }
        else if (PyByteArray_Check(obj))
        {
            *tmp = NULL;
            *buffer = PyByteArray_AS_STRING(obj);
            *length = PyByteArray_GET_SIZE(obj);
        }
        else if (PyMemoryView_Check(obj))
        {
            *tmp = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C');
            if (*tmp == NULL)
            {
                return -1;
            }
            Py_buffer *mv_buf;
            mv_buf = PyMemoryView_GET_BUFFER(*tmp);
            *buffer = mv_buf->buf;
            *length = mv_buf->len;
        }
        else
        {
            PyErr_Format(
                PyExc_TypeError,
                "expected a bytes-like object, %.200s found",
                Py_TYPE(obj)->tp_name);
            return -1;
        }
    
        return 0;
    }
    
    /* C implementation of websockets.utils.apply_mask */
    
    static PyObject *
    apply_mask(PyObject *self, PyObject *args, PyObject *kwds)
    {
    
        // In order to support various bytes-like types, accept any Python object.
    
        static char *kwlist[] = {"data", "mask", NULL};
        PyObject *input_obj;
        PyObject *mask_obj;
    
        // A pointer to a char * + length will be extracted from the data and mask
        // arguments, possibly via a Py_buffer.
    
        PyObject *input_tmp = NULL;
        char *input;
        Py_ssize_t input_len;
        PyObject *mask_tmp = NULL;
        char *mask;
        Py_ssize_t mask_len;
    
        // Initialize a PyBytesObject then get a pointer to the underlying char *
        // in order to avoid an extra memory copy in PyBytes_FromStringAndSize.
    
        PyObject *result = NULL;
        char *output;
    
        // Other variables.
    
        Py_ssize_t i = 0;
    
        // Parse inputs.
    
        if (!PyArg_ParseTupleAndKeywords(
                args, kwds, "OO", kwlist, &input_obj, &mask_obj))
        {
            goto exit;
        }
    
        if (_PyBytesLike_AsStringAndSize(input_obj, &input_tmp, &input, &input_len) == -1)
        {
            goto exit;
        }
    
        if (_PyBytesLike_AsStringAndSize(mask_obj, &mask_tmp, &mask, &mask_len) == -1)
        {
            goto exit;
        }
    
        if (mask_len != MASK_LEN)
        {
            PyErr_SetString(PyExc_ValueError, "mask must contain 4 bytes");
            goto exit;
        }
    
        // Create output.
    
        result = PyBytes_FromStringAndSize(NULL, input_len);
        if (result == NULL)
        {
            goto exit;
        }
    
        // Since we just created result, we don't need error checks.
        output = PyBytes_AS_STRING(result);
    
        // Perform the masking operation.
    
        // Apparently GCC cannot figure out the following optimizations by itself.
    
        // We need a new scope for MSVC 2010 (non C99 friendly)
        {
    #if __ARM_NEON
    
            // With NEON support, XOR by blocks of 16 bytes = 128 bits.
    
            Py_ssize_t input_len_128 = input_len & ~15;
            uint8x16_t mask_128 = vreinterpretq_u8_u32(vdupq_n_u32(*(uint32_t *)mask));
    
            for (; i < input_len_128; i += 16)
            {
                uint8x16_t in_128 = vld1q_u8((uint8_t *)(input + i));
                uint8x16_t out_128 = veorq_u8(in_128, mask_128);
                vst1q_u8((uint8_t *)(output + i), out_128);
            }
    
    #elif __SSE2__
    
            // With SSE2 support, XOR by blocks of 16 bytes = 128 bits.
    
            // Since we cannot control the 16-bytes alignment of input and output
            // buffers, we rely on loadu/storeu rather than load/store.
    
            Py_ssize_t input_len_128 = input_len & ~15;
            __m128i mask_128 = _mm_set1_epi32(*(uint32_t *)mask);
    
            for (; i < input_len_128; i += 16)
            {
                __m128i in_128 = _mm_loadu_si128((__m128i *)(input + i));
                __m128i out_128 = _mm_xor_si128(in_128, mask_128);
                _mm_storeu_si128((__m128i *)(output + i), out_128);
            }
    
    #else
    
            // Without SSE2 support, XOR by blocks of 8 bytes = 64 bits.
    
            // We assume the memory allocator aligns everything on 8 bytes boundaries.
    
            Py_ssize_t input_len_64 = input_len & ~7;
            uint32_t mask_32 = *(uint32_t *)mask;
            uint64_t mask_64 = ((uint64_t)mask_32 << 32) | (uint64_t)mask_32;
    
            for (; i < input_len_64; i += 8)
            {
                *(uint64_t *)(output + i) = *(uint64_t *)(input + i) ^ mask_64;
            }
    
    #endif
        }
    
        // XOR the remainder of the input byte by byte.
    
        for (; i < input_len; i++)
        {
            output[i] = input[i] ^ mask[i & (MASK_LEN - 1)];
        }
    
    exit:
        Py_XDECREF(input_tmp);
        Py_XDECREF(mask_tmp);
        return result;
    
    }
    
    static PyMethodDef speedups_methods[] = {
        {
            "apply_mask",
            (PyCFunction)apply_mask,
            METH_VARARGS | METH_KEYWORDS,
            "Apply masking to the data of a WebSocket message.",
        },
        {NULL, NULL, 0, NULL},      /* Sentinel */
    };
    
    static struct PyModuleDef speedups_module = {
        PyModuleDef_HEAD_INIT,
        "websocket.speedups",       /* m_name */
        "C implementation of performance sensitive functions.",
                                    /* m_doc */
        -1,                         /* m_size */
        speedups_methods,           /* m_methods */
        NULL,
        NULL,
        NULL,
        NULL
    };
    
    PyMODINIT_FUNC
    PyInit_speedups(void)
    {
        return PyModule_Create(&speedups_module);
    }