1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h> // alarm()
#include "../utf8.h"
#include "utf8-encode.h"
#include "bh-utf8.h"
#define SECONDS 6
#define BUFLEN 8 // MB
/* Step the 64-bit generator state at *s once and derive a 32-bit output
 * from the new state: the top three state bits choose a right shift
 * between 22 and 29, and the shifted state is truncated to 32 bits.
 */
static uint32_t
pcg32(uint64_t *s)
{
    const uint64_t multiplier = 0x9b60933458e17d7d;
    const uint64_t increment  = 0xd737232eeccdf7ed;
    uint64_t next = *s * multiplier + increment;
    int shift = 29 - (int)(next >> 61); // next >> 61 is in 0..7

    *s = next;
    return (uint32_t)(next >> shift);
}
/* Draw a random code point so that each UTF-8 encoded length (1 to 4
 * bytes) is equally likely. The low two bits of one pcg32() draw pick
 * the length class and the remaining bits pick a code point inside that
 * class. The 4-byte class only covers 65536 through 131071, and values
 * in the 3-byte class may fall in the surrogate range; callers that
 * care must filter those themselves.
 */
static long
randchar(uint64_t *s)
{
    /* edge[k] is the first code point of length class k and edge[k + 1]
     * is one past the last code point drawn for that class. */
    static const uint32_t edge[5] = {0, 128, 2048, 65536, 131072};
    uint32_t bits = pcg32(s);
    unsigned cls = bits & 0x3;
    uint32_t span = edge[cls + 1] - edge[cls];

    bits >>= 2; // drop the two bits already used for the class
    return edge[cls] + bits % span;
}
/* Benchmark loop flag: main() sets it to 1 before each timed run and
 * the SIGALRM handler below clears it. */
static volatile sig_atomic_t running;
/* Signal handler that main() installs for SIGALRM. It clears `running`
 * so the benchmark loop in progress stops after finishing its current
 * pass over the buffer. */
static void
alarm_handler(int signum)
{
(void)signum; // unused; present only to match the handler signature
running = 0;
}
/* Fill buffer with random characters, with evenly-distributed encoded
 * lengths.
 *
 * buf must point to at least z writable bytes. A character drawn from
 * randchar() can be as large as 131071, which needs 4 bytes in UTF-8
 * (assuming utf8_encode() emits standard UTF-8), so encoding stops as
 * soon as fewer than 4 bytes remain. This guarantees that nothing at or
 * beyond buf + z is written.
 *
 * Returns a pointer one past the last byte written. It is never greater
 * than buf + z, and equals buf when z < 4.
 */
static void *
buffer_fill(void *buf, size_t z)
{
    uint64_t s = 0; // fixed seed: every call generates the same sequence
    char *p = buf;
    char *end = p + z;
    while (end - p >= 4) {
        long c;
        do {
            c = randchar(&s);
        } while (IS_SURROGATE(c)); // redraw until not a surrogate
        p = utf8_encode(p, c);
    }
    return p;
}
/* Benchmark two UTF-8 decoders over the same buffer of random text.
 *
 * A BUFLEN MiB buffer is filled once by buffer_fill(). Each decoder is
 * then run over the whole buffer again and again until a SIGALRM,
 * scheduled SECONDS seconds ahead, clears `running`. The reported rate
 * is completed passes times bytes per pass, divided by the nominal
 * SECONDS (not a measured duration), in MiB per second.
 *
 * Returns 0 on success, or EXIT_FAILURE if the buffer cannot be
 * allocated.
 */
int
main(void)
{
    long errors, n;
    size_t z = BUFLEN * 1024L * 1024;

    /* calloc rather than malloc so the slack bytes after the generated
     * data hold defined values. */
    unsigned char *buffer = calloc(z, 1);
    if (!buffer) {
        fputs("benchmark: out of memory\n", stderr);
        return EXIT_FAILURE;
    }

    /* Keep 4 bytes of slack at the end of the allocation. The fill loop
     * may start a multi-byte character just before its limit and run up
     * to 3 bytes past it, and the branchless decoder is presumed to read
     * up to 4 bytes per call (utf8.h is not visible here -- confirm). */
    unsigned char *end = buffer_fill(buffer, z - 4);

    /* Benchmark the branchless decoder */
    running = 1;
    signal(SIGALRM, alarm_handler);
    alarm(SECONDS);
    errors = n = 0;
    do {
        unsigned char *p = buffer;
        int e = 0;
        uint32_t c;
        while (p < end) {
            p = utf8_decode(p, &c, &e);
            errors += !!e; // force errors to be checked
        }
        if (p == end) // reached the end successfully?
            n++;
    } while (running);
    /* Multiply in double: passes * bytes can exceed a 32-bit long. */
    double rate = (double)n * (end - buffer) / SECONDS / 1024 / 1024;
    printf("branchless: %f MB/s, %ld errors\n", rate, errors);

    /* Benchmark Bjoern Hoehrmann's decoder */
    running = 1;
    signal(SIGALRM, alarm_handler); // re-install in case delivery reset it
    alarm(SECONDS);
    errors = n = 0;
    do {
        unsigned char *p = buffer;
        uint32_t c;
        uint32_t state = 0;
        for (; p < end; p++) {
            /* A non-zero return means the decoder is not in its accept
             * state; only the reject state counts as an error. */
            if (bh_utf8_decode(&state, &c, *p) && state == UTF8_REJECT)
                errors++; // force errors to be checked
        }
        if (p == end) // reached the end successfully?
            n++;
    } while (running);
    rate = (double)n * (end - buffer) / SECONDS / 1024 / 1024;
    printf("Hoehrmann: %f MB/s, %ld errors\n", rate, errors);

    free(buffer);
    return 0;
}
|