Skip to content

Comments

gh-145192: Improve performance of PySequence_GetSlice#145193

Open
eendebakpt wants to merge 4 commits intopython:mainfrom
eendebakpt:pyslice_fromindices
Open

gh-145192: Improve performance of PySequence_GetSlice#145193
eendebakpt wants to merge 4 commits intopython:mainfrom
eendebakpt:pyslice_fromindices

Conversation

@eendebakpt
Copy link
Contributor

@eendebakpt eendebakpt commented Feb 24, 2026

  • Improve performance of PySequence_GetSlice by avoiding an incref/decref pair in _PySlice_FromIndices
  • The internal method _PyBuildSlice_Consume2 is refactored so that it consumes references. Before, it had the funny property of consuming the references of the first two slice arguments, but not of the last.
  • _PyBuildSlice_ConsumeRefs (used in the opcode _BINARY_SLICE) avoids an incref on Py_None

The PySequence_GetSlice is part of the C api, with no direct equivalent from Python, so the benchmark is in C.

Main:

PySequence_GetSlice benchmark (2000000 iterations)
==============================================

list[1200:1204]      : 0.1314 s  (65.7 ns/call)
list[1500:1800]: 0.9673 s  (480.7 ns/call)
list[1500:1500]: 0.1138 s  (56.9 ns/call)

tuple[1200:1204]     : 0.1216 s  (60.8 ns/call)
tuple[1500:1800]: 1.0420 s  (501.0 ns/call)
tuple[1500:1500]  : 0.0873 s  (43.6 ns/call)

bytes[1200:1204]     : 0.1179 s  (58.9 ns/call)
bytes[1500:1800]: 0.1254 s  (62.7 ns/call)
bytes[1500:1500]  : 0.0905 s  (45.3 ns/call)

PR

PySequence_GetSlice benchmark (2000000 iterations)
==============================================

list[1200:1204]      : 0.1213 s  (60.6 ns/call)
list[1500:1800]: 0.9424 s  (471.2 ns/call)
list[1500:1500]: 0.1006 s  (50.3 ns/call)

tuple[1200:1204]     : 0.1026 s  (51.3 ns/call)
tuple[1500:1800]: 0.9769 s  (488.5 ns/call)
tuple[1500:1500]  : 0.0724 s  (36.2 ns/call)

bytes[1200:1204]     : 0.1001 s  (50.1 ns/call)
bytes[1500:1800]: 0.1093 s  (54.6 ns/call)
bytes[1500:1500]  : 0.0744 s  (37.2 ns/call)
Benchmark details

Run with optimized FT build.

Code:

#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <time.h>

#define ITERATIONS 2000000
#define SIZE 2000

/*
 * Time `iterations` calls of PySequence_GetSlice(seq, i1, i2).
 *
 * Returns the elapsed processor time in seconds (clock()-based),
 * or -1.0 if a slice call failed; the pending Python error is
 * printed before returning.
 */
static double
bench_getslice(PyObject *seq, Py_ssize_t i1, Py_ssize_t i2, int iterations)
{
    clock_t t0 = clock();
    for (int iter = 0; iter < iterations; iter++) {
        PyObject *result = PySequence_GetSlice(seq, i1, i2);
        if (result == NULL) {
            PyErr_Print();
            return -1.0;
        }
        Py_DECREF(result);
    }
    clock_t t1 = clock();
    return (double)(t1 - t0) / CLOCKS_PER_SEC;
}

/* Run one timed case and print an aligned result line.
 * Returns 0 on success, -1 if bench_getslice reported an error
 * (the Python error has already been printed). */
static int
report(const char *label, PyObject *seq, Py_ssize_t i1, Py_ssize_t i2)
{
    double t = bench_getslice(seq, i1, i2, ITERATIONS);
    if (t < 0.0) {
        return -1;
    }
    printf("%-21s: %.4f s  (%.1f ns/call)\n",
           label, t, t / ITERATIONS * 1e9);
    return 0;
}

/*
 * Benchmark driver: times small, medium and empty slices taken with
 * PySequence_GetSlice from a list, a tuple and a bytes object.
 *
 * Returns 0 on success, 1 on a Python-level error, 120 if interpreter
 * finalization fails.
 */
int
main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;  /* command-line arguments are unused */

    Py_Initialize();

    int rc = 1;  /* assume failure until the whole suite completes */
    PyObject *list = NULL;
    PyObject *tuple = NULL;
    PyObject *bytes = NULL;

    /* List of SIZE small ints. */
    list = PyList_New(SIZE);
    if (list == NULL) {
        PyErr_Print();
        goto cleanup;
    }
    for (Py_ssize_t i = 0; i < SIZE; i++) {
        PyObject *val = PyLong_FromSsize_t(i);
        if (val == NULL) {
            PyErr_Print();
            goto cleanup;
        }
        PyList_SET_ITEM(list, i, val);  /* steals reference */
    }

    /* Tuple of SIZE small ints. */
    tuple = PyTuple_New(SIZE);
    if (tuple == NULL) {
        PyErr_Print();
        goto cleanup;
    }
    for (Py_ssize_t i = 0; i < SIZE; i++) {
        PyObject *val = PyLong_FromSsize_t(i);
        if (val == NULL) {
            PyErr_Print();
            goto cleanup;
        }
        PyTuple_SET_ITEM(tuple, i, val);  /* steals reference */
    }

    /* Bytes object of SIZE bytes. */
    bytes = PyBytes_FromStringAndSize(NULL, SIZE);
    if (bytes == NULL) {
        PyErr_Print();
        goto cleanup;
    }
    char *buf = PyBytes_AS_STRING(bytes);
    for (int i = 0; i < SIZE; i++) {
        buf[i] = (char)i;
    }

    printf("PySequence_GetSlice benchmark (%d iterations)\n", ITERATIONS);
    printf("==============================================\n\n");

    /* Small, medium and empty slices for each sequence type.
     * Stop at the first failure instead of printing garbage times. */
    if (report("list[1200:1204]", list, 1200, 1204) < 0) goto cleanup;
    if (report("list[1500:1800]", list, 1500, 1800) < 0) goto cleanup;
    if (report("list[1500:1500]", list, 1500, 1500) < 0) goto cleanup;
    printf("\n");
    if (report("tuple[1200:1204]", tuple, 1200, 1204) < 0) goto cleanup;
    if (report("tuple[1500:1800]", tuple, 1500, 1800) < 0) goto cleanup;
    if (report("tuple[1500:1500]", tuple, 1500, 1500) < 0) goto cleanup;
    printf("\n");
    if (report("bytes[1200:1204]", bytes, 1200, 1204) < 0) goto cleanup;
    if (report("bytes[1500:1800]", bytes, 1500, 1800) < 0) goto cleanup;
    if (report("bytes[1500:1500]", bytes, 1500, 1500) < 0) goto cleanup;

    rc = 0;

cleanup:
    /* Release whatever was created, then always finalize the
     * interpreter, even on the error paths. */
    Py_XDECREF(bytes);
    Py_XDECREF(tuple);
    Py_XDECREF(list);
    if (Py_FinalizeEx() < 0) {
        return 120;
    }
    return rc;
}

@@ -0,0 +1 @@
Improve performance of :c:func:`PySequence_GetSlice`.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The change looks really good to me, thank you for fixing this!

I only suggest going into a bit more detail in the description, but it is a very nice fix!

LGTM!

Suggested change
Improve performance of :c:func:`PySequence_GetSlice`.
Improve performance of :c:func:`PySequence_GetSlice` by avoiding unnecessary reference count decrements.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants