/* path: root/test/benchmark.c
 * blob: 4eb51da22bf18e22a40e52d8b924891f33c5f180
 *
 * Throughput benchmark comparing the branchless UTF-8 decoder with
 * Bjoern Hoehrmann's decoder.
 */
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <signal.h>

#include <unistd.h> // alarm()

#include "../utf8.h"
#include "utf8-encode.h"
#include "bh-utf8.h"

#define SECONDS 6
#define BUFLEN  8 // MB

/* Step the 64-bit generator state at *s once and derive a 32-bit output
 * from it. The three highest bits of the new state select how far it is
 * shifted right (between 22 and 29 bits) before being truncated to
 * 32 bits.
 */
static uint32_t
pcg32(uint64_t *s)
{
    const uint64_t multiplier = 0x9b60933458e17d7d;
    const uint64_t increment  = 0xd737232eeccdf7ed;

    uint64_t state = *s * multiplier + increment;
    int shift = 29 - (int)(state >> 61);

    *s = state;
    return (uint32_t)(state >> shift);
}

/* Produce a random codepoint for the benchmark input. The two low bits
 * of one pcg32() output select an encoded UTF-8 length of 1 to 4 bytes;
 * the remaining bits select a codepoint inside that length's range.
 * Four-byte codepoints are drawn from 65536..131071 only, and the
 * three-byte range is not filtered for surrogates here.
 */
static long
randchar(uint64_t *s)
{
    /* Indexed by (encoded length - 1). */
    static const uint32_t first[4] = {0, 128, 2048, 65536};
    static const uint32_t span[4] = {
        128,
        2048 - 128,
        65536 - 2048,
        131072 - 65536
    };

    uint32_t bits = pcg32(s);
    uint32_t which = bits & 0x3;
    uint32_t rest = bits >> 2;

    return (long)(first[which] + rest % span[which]);
}

/* Flag tested by the benchmark loops in main(); it is set to 1 before a
 * benchmark starts and cleared by the signal handler to stop it.
 */
static volatile sig_atomic_t running;

/* Signal handler installed for SIGALRM: clears `running`. The signal
 * number itself is not needed.
 */
static void
alarm_handler(int signum)
{
    running = 0;
    (void)signum;
}

/* Fill buffer with random characters, with evenly-distributed encoded
 * lengths. The generator seed is fixed, so the contents are the same on
 * every run.
 *
 * buf: destination buffer with room for at least z bytes.
 * z:   capacity of buf in bytes.
 *
 * Returns a pointer one past the last byte written. Nothing is ever
 * written at or beyond buf + z: encoding stops once fewer than 4 bytes
 * remain, so the returned pointer may be up to 3 bytes short of
 * buf + z (and equals buf when z < 4).
 */
static void *
buffer_fill(void *buf, size_t z)
{
    uint64_t s = 0;
    char *p = buf;
    char *end = p + z;
    /* randchar() yields codepoints up to 131071, which need up to 4 bytes
     * in UTF-8. Only encode while a full 4 bytes are still available so
     * the final character cannot run past the end of the buffer.
     */
    while (end - p >= 4) {
        long c;
        do
            c = randchar(&s);
        while (IS_SURROGATE(c));  /* surrogates are not encodable; redraw */
        p = utf8_encode(p, c);
    }
    return p;
}

/* Benchmark driver.
 *
 * Allocates a BUFLEN-megabyte buffer, fills it with random UTF-8 text,
 * then runs each decoder over the whole buffer repeatedly until a
 * SIGALRM arrives SECONDS seconds later. For each decoder it prints the
 * throughput in MB/s (completed passes times bytes per pass, divided by
 * SECONDS) and the number of decoding errors counted.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffer cannot be allocated.
 */
int
main(void)
{
    long errors, n;
    size_t z = BUFLEN * 1024L * 1024;

    /* Zero-initialised so the slack bytes after the encoded text hold
     * defined values.
     */
    unsigned char *buffer = calloc(z, 1);
    if (!buffer) {
        fputs("benchmark: out of memory\n", stderr);
        return EXIT_FAILURE;
    }

    /* Keep 4 bytes of slack at the end of the allocation: the last
     * character written by buffer_fill() may extend past the size it is
     * given, and a decoder that loads several bytes per call must not
     * read past the allocation.
     * NOTE(review): assumes utf8_decode() reads at most 4 bytes per call
     * -- confirm against ../utf8.h.
     */
    unsigned char *end = buffer_fill(buffer, z - 4);

    /* Benchmark the branchless decoder */
    running = 1;
    signal(SIGALRM, alarm_handler);
    alarm(SECONDS);
    errors = n = 0;
    do {
        unsigned char *p = buffer;
        int e = 0;
        uint32_t c;
        long count = 0;  // tallied per pass, not reported
        while (p < end) {
            p = utf8_decode(p, &c, &e);
            errors += !!e;  // force errors to be checked
            count++;
        }
        if (p == end) // reached the end successfully?
            n++;
    } while (running);

    /* Multiply in floating point: passes * bytes can exceed the range of
     * a 32-bit long/ptrdiff_t.
     */
    double rate = (double)n * (double)(end - buffer)
                / (double)SECONDS / 1024 / 1024;
    printf("branchless: %f MB/s, %ld errors\n", rate, errors);

    /* Benchmark Bjoern Hoehrmann's decoder */
    running = 1;
    signal(SIGALRM, alarm_handler);
    alarm(SECONDS);
    errors = n = 0;
    do {
        unsigned char *p = buffer;
        uint32_t c;
        uint32_t state = 0;
        long count = 0;  // tallied per pass, not reported
        for (; p < end; p++) {
            if (!bh_utf8_decode(&state, &c, *p))
                count++;
            else if (state == UTF8_REJECT)
                errors++;  // force errors to be checked
        }
        if (p == end) // reached the end successfully?
            n++;
    } while (running);

    rate = (double)n * (double)(end - buffer)
         / (double)SECONDS / 1024 / 1024;
    printf("Hoehrmann:  %f MB/s, %ld errors\n", rate, errors);

    free(buffer);
    return 0;
}