Skip to content

Commit 84736d5

Browse files
authored
Add support for 512-bit vectors in utf-8 validator (#32)
1 parent 1f9074d commit 84736d5

File tree

7 files changed

+44
-29
lines changed

7 files changed

+44
-29
lines changed

src/main/java/org/simdjson/CharactersClassifier.java

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -9,14 +9,14 @@ class CharactersClassifier {
99

1010
private static final ByteVector WHITESPACE_TABLE =
1111
ByteVector.fromArray(
12-
StructuralIndexer.SPECIES,
13-
repeat(new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100}, StructuralIndexer.SPECIES.vectorByteSize() / 4),
12+
StructuralIndexer.BYTE_SPECIES,
13+
repeat(new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4),
1414
0);
1515

1616
private static final ByteVector OP_TABLE =
1717
ByteVector.fromArray(
18-
StructuralIndexer.SPECIES,
19-
repeat(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0}, StructuralIndexer.SPECIES.vectorByteSize() / 4),
18+
StructuralIndexer.BYTE_SPECIES,
19+
repeat(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4),
2020
0);
2121

2222
private static byte[] repeat(byte[] array, int n) {

src/main/java/org/simdjson/JsonStringScanner.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,8 +14,8 @@ class JsonStringScanner {
1414
private long prevEscaped = 0;
1515

1616
JsonStringScanner() {
17-
this.backslashMask = ByteVector.broadcast(StructuralIndexer.SPECIES, (byte) '\\');
18-
this.quoteMask = ByteVector.broadcast(StructuralIndexer.SPECIES, (byte) '"');
17+
this.backslashMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '\\');
18+
this.quoteMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '"');
1919
}
2020

2121
JsonStringBlock next(ByteVector chunk0) {

src/main/java/org/simdjson/StringParser.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ class StringParser {
1010

1111
private static final byte BACKSLASH = '\\';
1212
private static final byte QUOTE = '"';
13-
private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
13+
private static final int BYTES_PROCESSED = StructuralIndexer.BYTE_SPECIES.vectorByteSize();
1414
private static final int MIN_HIGH_SURROGATE = 0xD800;
1515
private static final int MAX_HIGH_SURROGATE = 0xDBFF;
1616
private static final int MIN_LOW_SURROGATE = 0xDC00;
@@ -31,7 +31,7 @@ void parseString(byte[] buffer, int idx) {
3131
int src = idx + 1;
3232
int dst = stringBufferIdx + Integer.BYTES;
3333
while (true) {
34-
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
34+
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.BYTE_SPECIES, buffer, src);
3535
srcVec.intoArray(stringBuffer, dst);
3636
long backslashBits = srcVec.eq(BACKSLASH).toLong();
3737
long quoteBits = srcVec.eq(QUOTE).toLong();

src/main/java/org/simdjson/StructuralIndexer.java

Lines changed: 28 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,27 +1,43 @@
11
package org.simdjson;
22

33
import jdk.incubator.vector.ByteVector;
4+
import jdk.incubator.vector.IntVector;
5+
import jdk.incubator.vector.VectorShape;
46
import jdk.incubator.vector.VectorSpecies;
5-
import java.lang.invoke.MethodType;
67

78
import static jdk.incubator.vector.VectorOperators.UNSIGNED_LE;
89

910
class StructuralIndexer {
1011

11-
static final VectorSpecies<Byte> SPECIES;
12+
static final VectorSpecies<Integer> INT_SPECIES;
13+
static final VectorSpecies<Byte> BYTE_SPECIES;
1214
static final int N_CHUNKS;
1315

1416
static {
1517
String species = System.getProperty("org.simdjson.species", "preferred");
16-
SPECIES = switch(species) {
17-
case "preferred" -> ByteVector.SPECIES_PREFERRED;
18-
case "512" -> ByteVector.SPECIES_512;
19-
case "256" -> ByteVector.SPECIES_256;
18+
switch (species) {
19+
case "preferred" -> {
20+
BYTE_SPECIES = ByteVector.SPECIES_PREFERRED;
21+
INT_SPECIES = IntVector.SPECIES_PREFERRED;
22+
}
23+
case "512" -> {
24+
BYTE_SPECIES = ByteVector.SPECIES_512;
25+
INT_SPECIES = IntVector.SPECIES_512;
26+
}
27+
case "256" -> {
28+
BYTE_SPECIES = ByteVector.SPECIES_256;
29+
INT_SPECIES = IntVector.SPECIES_256;
30+
}
2031
default -> throw new IllegalArgumentException("Unsupported vector species: " + species);
21-
};
22-
N_CHUNKS = 64 / SPECIES.vectorByteSize();
23-
if (SPECIES != ByteVector.SPECIES_256 && SPECIES != ByteVector.SPECIES_512) {
24-
throw new IllegalArgumentException("Unsupported vector species: " + SPECIES);
32+
}
33+
N_CHUNKS = 64 / BYTE_SPECIES.vectorByteSize();
34+
assertSupportForSpecies(BYTE_SPECIES);
35+
assertSupportForSpecies(INT_SPECIES);
36+
}
37+
38+
private static void assertSupportForSpecies(VectorSpecies<?> species) {
39+
if (species.vectorShape() != VectorShape.S_256_BIT && species.vectorShape() != VectorShape.S_512_BIT) {
40+
throw new IllegalArgumentException("Unsupported vector species: " + species);
2541
}
2642
}
2743

@@ -48,7 +64,7 @@ void step(byte[] buffer, int offset, int blockIndex) {
4864
}
4965

5066
private void step1(byte[] buffer, int offset, int blockIndex) {
51-
ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_512, buffer, offset);
67+
ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_512, buffer, offset);
5268
JsonStringBlock strings = stringScanner.next(chunk0);
5369
JsonCharacterBlock characters = classifier.classify(chunk0);
5470
long unescaped = lteq(chunk0, (byte) 0x1F);
@@ -75,7 +91,7 @@ private void finishStep(JsonCharacterBlock characters, JsonStringBlock strings,
7591
bitIndexes.write(blockIndex, prevStructurals);
7692
prevStructurals = potentialStructuralStart & ~strings.stringTail();
7793
unescapedCharsError |= strings.nonQuoteInsideString(unescaped);
78-
}
94+
}
7995

8096
private long lteq(ByteVector chunk0, byte scalar) {
8197
long r = chunk0.compare(UNSIGNED_LE, scalar).toLong();

src/main/java/org/simdjson/Utf8Validator.java

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,12 @@
44

55
import java.util.Arrays;
66

7-
public class Utf8Validator {
8-
private static final VectorSpecies<Byte> VECTOR_SPECIES = ByteVector.SPECIES_256;
7+
class Utf8Validator {
8+
9+
private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES;
910
private static final ByteVector INCOMPLETE_CHECK = getIncompleteCheck();
10-
private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(IntVector.SPECIES_256,
11-
IntVector.SPECIES_256.elementSize() - 1, 1, true);
11+
private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(StructuralIndexer.INT_SPECIES,
12+
StructuralIndexer.INT_SPECIES.elementSize() - 1, 1, true);
1213
private static final ByteVector LOW_NIBBLE_MASK = ByteVector.broadcast(VECTOR_SPECIES, 0b0000_1111);
1314
private static final ByteVector ALL_ASCII_MASK = ByteVector.broadcast(VECTOR_SPECIES, (byte) 0b1000_0000);
1415

@@ -39,7 +40,7 @@ static void validate(byte[] inputBytes) {
3940

4041
errors |= secondCheck.compare(VectorOperators.NE, 0).toLong();
4142
}
42-
previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(IntVector.SPECIES_256.length() - 1);
43+
previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(StructuralIndexer.INT_SPECIES.length() - 1);
4344
}
4445

4546
// if the input file doesn't align with the vector width, pad the missing bytes with zero

src/test/java/org/simdjson/TestUtils.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ static String padWithSpaces(String str) {
1919
}
2020

2121
static ByteVector chunk(String str, int n) {
22-
return ByteVector.fromArray(StructuralIndexer.SPECIES, str.getBytes(UTF_8), n * StructuralIndexer.SPECIES.vectorByteSize());
22+
return ByteVector.fromArray(StructuralIndexer.BYTE_SPECIES, str.getBytes(UTF_8), n * StructuralIndexer.BYTE_SPECIES.vectorByteSize());
2323
}
2424

2525
static byte[] toUtf8(String str) {

src/test/java/org/simdjson/Utf8ValidatorTest.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,17 @@
11
package org.simdjson;
22

3-
import jdk.incubator.vector.ByteVector;
43
import jdk.incubator.vector.VectorSpecies;
54
import org.junit.jupiter.api.Test;
65
import org.junit.jupiter.params.ParameterizedTest;
76
import org.junit.jupiter.params.provider.ValueSource;
87

98
import java.io.IOException;
109
import java.util.Arrays;
11-
import java.util.Objects;
1210

1311
import static org.assertj.core.api.Assertions.*;
1412

1513
class Utf8ValidatorTest {
16-
private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.SPECIES;
14+
private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES;
1715

1816

1917
/* ASCII / 1 BYTE TESTS */

0 commit comments

Comments
 (0)