[vfs] optimize ExtendibleHashMap.size()

+ scan through segments without allocating SegmentLayout object
+ more tests

GitOrigin-RevId: 85dd5aec9039d6177cd11a34d9a6273d6a38fbb7
This commit is contained in:
Ruslan Cheremin
2024-05-09 15:37:14 +02:00
committed by intellij-monorepo-bot
parent b37df5fb80
commit e4e404ff28
4 changed files with 169 additions and 42 deletions

View File

@@ -60,7 +60,7 @@ public abstract class KeyValueStoreTestBase<K, V, S extends KeyValueStore<K, V>>
* Converts int (substrate) into a Entry(key, value).
* Property: for different input substrate (int) function must return both Key and Value different
*/
private final IntFunction<Entry<K, V>> keyValueSubstrateDecoder;
private final IntFunction<? extends Entry<K, V>> keyValueSubstrateDecoder;
@BeforeAll
static void beforeAll() {
@@ -68,7 +68,7 @@ public abstract class KeyValueStoreTestBase<K, V, S extends KeyValueStore<K, V>>
}
protected KeyValueStoreTestBase(@NotNull IntFunction<Entry<K, V>> decoder) { keyValueSubstrateDecoder = decoder; }
protected KeyValueStoreTestBase(@NotNull IntFunction<? extends Entry<K, V>> decoder) { keyValueSubstrateDecoder = decoder; }
@BeforeEach
void setUp(@TempDir Path tempDir) throws IOException {
@@ -194,4 +194,8 @@ public abstract class KeyValueStoreTestBase<K, V, S extends KeyValueStore<K, V>>
storage = factory().open(storagePath);
return storage;
}
protected Path storagePath() {
return storagePath;
}
}

View File

@@ -125,6 +125,41 @@ public abstract class DurableIntToMultiIntMapTestBase<M extends DurableIntToMult
assertTrue(values.contains(3), "Values for key=1 must contain 3");
}
@Test
public void manyKeyValues_PutRemovedAndPutAgain_AreAllExistInTheMap() throws IOException {
long[] packedKeysValues = generateUniqueKeyValues(entriesCountToTest);
//test for entries removing-overwriting: add/remove/add again the entries from the map,
// and check map behave as-if only the last added entries are there
putAllEntries(multimap, packedKeysValues);
removeAllEntries(multimap, packedKeysValues);
assertEquals(0, multimap.size(),
"Map must be empty since all entries were removed"
);
for (long packedKeyValue : packedKeysValues) {
int key = key(packedKeyValue);
int value = value(packedKeyValue);
boolean reallyPut = multimap.put(key, value);
assertTrue(reallyPut, "(" + key + ", " + value + ") must be a new entry for the map");
}
assertEquals(packedKeysValues.length,
multimap.size(),
"Map.size must be equal to the number of entries added"
);
for (long packedKeyValue : packedKeysValues) {
int key = key(packedKeyValue);
int value = value(packedKeyValue);
assertTrue(multimap.has(key, value),
"(" + key + ", " + value + ") must exist in the map");
}
}
@Test
public void withManyKeyValuesPut_MultimapSizeIsEqualToNumberOfTruthReturned() throws IOException {
@@ -174,11 +209,7 @@ public abstract class DurableIntToMultiIntMapTestBase<M extends DurableIntToMult
@Test
public void remove_DeletesKeyValueFromTheMultimap_AndReturnsTrueIfKeyValuePreviouslyExists() throws IOException {
long[] packedKeysValues = generateUniqueKeyValues(entriesCountToTest);
for (long packedKeyValue : packedKeysValues) {
int key = key(packedKeyValue);
int value = value(packedKeyValue);
multimap.put(key, value);
}
putAllEntries(multimap, packedKeysValues);
for (long packedKeyValue : packedKeysValues) {
int key = key(packedKeyValue);
@@ -256,13 +287,33 @@ public abstract class DurableIntToMultiIntMapTestBase<M extends DurableIntToMult
/* ======================== infrastructure: ================================================================ */
private int lookupSingleValue(int key,
int value) throws IOException {
protected static void putAllEntries(@NotNull DurableIntToMultiIntMap multimap,
long[] packedKeysValues) throws IOException {
for (long packedKeyValue : packedKeysValues) {
int key = key(packedKeyValue);
int value = value(packedKeyValue);
multimap.put(key, value);
}
}
private static void removeAllEntries(@NotNull DurableIntToMultiIntMap multimap,
long[] packedKeysValues) throws IOException {
for (long packedKeyValue : packedKeysValues) {
int key = key(packedKeyValue);
int value = value(packedKeyValue);
multimap.remove(key, value);
}
}
protected int lookupSingleValue(int key,
int value) throws IOException {
return multimap.lookup(key, v -> (v == value));
}
private static @NotNull IntOpenHashSet lookupAllValues(@NotNull DurableIntToMultiIntMap multimap,
int key) throws IOException {
protected static @NotNull IntOpenHashSet lookupAllValues(@NotNull DurableIntToMultiIntMap multimap,
int key) throws IOException {
IntOpenHashSet values = new IntOpenHashSet();
multimap.lookup(key, value -> {
values.add(value);
@@ -271,19 +322,19 @@ public abstract class DurableIntToMultiIntMapTestBase<M extends DurableIntToMult
return values;
}
private static int key(long packedKeyValue) {
protected static int key(long packedKeyValue) {
return (int)(packedKeyValue >> 32);
}
private static int value(long packedKeyValue) {
protected static int value(long packedKeyValue) {
return (int)packedKeyValue;
}
private static long[] generateUniqueKeyValues(int size) {
protected static long[] generateUniqueKeyValues(int size) {
return ThreadLocalRandom.current().longs()
//generate more multi-value-keys, to better check appropriate branches: +1,+2,... is almost always
// have same upper 32 bits (=key), but different lower 32 bits (=value) => this switch creates
// approximately 1/14 of keys with 2 values, and another 3/14 of keys with 5 values, and 10/14 of
// approximately 3/14 of keys with 2 values, another 1/14 of keys with 5 values, and 10/14 of
// keys with a single value
.flatMap(v -> switch ((int)(v % 14)) {
case 13 -> LongStream.of(v, v + 1, v - 1, v + 42, v - 42);

View File

@@ -1,6 +1,8 @@
// Copyright 2000-2024 JetBrains s.r.o. and contributors. Use of this source code is governed by the Apache 2.0 license.
package com.intellij.platform.util.io.storages.intmultimaps.extendiblehashmap;
import com.intellij.platform.util.io.storages.intmultimaps.extendiblehashmap.ExtendibleHashMap.HashMapSegmentLayout;
import org.jetbrains.annotations.NotNull;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -12,15 +14,31 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
public class SegmentLayoutTest {
private ExtendibleHashMap.HashMapSegmentLayout segmentLayout;
public static final int SEGMENT_INDEX = 1;
public static final int SEGMENT_SIZE = 1 << 16;
private HashMapSegmentLayout segmentLayout;
private ExtendibleHashMap.BufferSource bufferSource;
@BeforeEach
void setUp() throws IOException {
ByteBuffer buffer = ByteBuffer.allocate(1 << 16);
segmentLayout = new ExtendibleHashMap.HashMapSegmentLayout(
(offsetInFile, length) -> buffer,
1,
buffer.capacity()
ByteBuffer buffer = ByteBuffer.allocate(SEGMENT_SIZE);
bufferSource = new ExtendibleHashMap.BufferSource() {
@Override
public @NotNull ByteBuffer slice(long offsetInFile, int length) throws IOException {
return buffer;
}
@Override
public int getInt(long offsetInFile) throws IOException {
//we emulate 1st segment (because first segment is header, it is special), but buffer is 0-based
return buffer.getInt((int)offsetInFile - SEGMENT_SIZE);
}
};
segmentLayout = new HashMapSegmentLayout(
bufferSource,
SEGMENT_INDEX,
SEGMENT_SIZE
);
}
@@ -111,4 +129,24 @@ public class SegmentLayoutTest {
new int[]{0b0011, 0b0111, 0b1011, 0b1111}
);
}
@Test
void updated_aliveEntriesCount_MustBeReadViaBothAccessors() throws IOException {
for (int aliveEntriesCount : new int[]{1, 10, 42, Integer.MAX_VALUE / 2}) {
segmentLayout.updateAliveEntriesCount(aliveEntriesCount);
assertEquals(
aliveEntriesCount,
segmentLayout.aliveEntriesCount(),
"Must be the value just updated in"
);
assertEquals(
aliveEntriesCount,
HashMapSegmentLayout.aliveEntriesCount(bufferSource, 1, SEGMENT_SIZE),
"Must be the value just updated in"
);
}
}
}

View File

@@ -79,7 +79,7 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
// 0) remove 'synchronized' and put synchronization on clients to decide?
// 1) prune tombstones (see comment in a .split() method for details)
// 2) .size() is now O(N), make it O(1)
// 3) Half-utilized segmentTable room (see HeaderLayout)
// 3) Half-utilized segmentTable room (see HeaderLayout comments)
private final MMappedFileStorage storage;
private transient BufferSource bufferSource;
@@ -110,12 +110,7 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
this.storage = storage;
boolean fileIsEmpty = (storage.actualFileSize() == 0);
bufferSource = (offsetInFile, length) -> {
ByteBuffer buffer = storage.pageByOffset(offsetInFile).rawPageBuffer();
int offsetInPage = storage.toOffsetInPage(offsetInFile);
return buffer.slice(offsetInPage, length)
.order(buffer.order());
};
bufferSource = new BufferSourceOverMMappedFileStorage(storage);
header = new HeaderLayout(bufferSource, segmentSize);
@@ -245,8 +240,7 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
int segmentsCount = header.actualSegmentsCount();
int totalEntries = 0;
for (int segmentIndex = 1; segmentIndex <= segmentsCount; segmentIndex++) {
HashMapSegmentLayout segment = new HashMapSegmentLayout(bufferSource, segmentIndex, segmentSize);
totalEntries += segment.aliveEntriesCount();
totalEntries += HashMapSegmentLayout.aliveEntriesCount(bufferSource, segmentIndex, segmentSize);
}
return totalEntries;
}
@@ -257,8 +251,8 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
int segmentSize = header.segmentSize();
int segmentsCount = header.actualSegmentsCount();
for (int segmentIndex = 1; segmentIndex <= segmentsCount; segmentIndex++) {
HashMapSegmentLayout segment = new HashMapSegmentLayout(bufferSource, segmentIndex, segmentSize);
if (segment.aliveEntriesCount() > 0) {
int aliveEntriesCount = HashMapSegmentLayout.aliveEntriesCount(bufferSource, segmentIndex, segmentSize);
if (aliveEntriesCount > 0) {
return false;
}
}
@@ -771,6 +765,20 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
return segmentBuffer.getInt(LIVE_ENTRIES_COUNT_OFFSET);
}
/**
* 'Inlined' version of {@code new HashMapSegmentLayout(bufferSource, segmentIndex, segmentSize).aliveEntriesCount()}
* with reduced allocations and slicing
*/
public static int aliveEntriesCount(@NotNull BufferSource bufferSource,
int segmentIndex,
int segmentSize) throws IOException {
if (segmentIndex < 1) {
throw new IllegalArgumentException("segmentIndex(=" + segmentIndex + ") must be >=1 (0-th segment is a header)");
}
long offsetInFile = segmentIndex * (long)segmentSize;
return bufferSource.getInt(offsetInFile + LIVE_ENTRIES_COUNT_OFFSET);
}
@Override
public void updateAliveEntriesCount(int aliveCount) {
segmentBuffer.putInt(LIVE_ENTRIES_COUNT_OFFSET, aliveCount);
@@ -868,11 +876,13 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
}
}
@FunctionalInterface
public interface BufferSource {
@NotNull
ByteBuffer slice(long offsetInFile,
int length) throws IOException;
/** == {@code slice(offsetInFile, 4).getInt()} */
int getInt(long offsetInFile) throws IOException;
}
/** Abstracts data storage for open-addressing hash-table implementation */
@@ -893,15 +903,16 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
int value);
}
public static final class HashMapAlgo {
private static final class HashMapAlgo {
public static final int NO_VALUE = 0;
private final float loadFactor;
//MAYBE RC: in-memory open-addressing hashtables usually employ loadFactor=0.5 -- to prevent excessive
// clustering and probe sequence length increasing. But for on-disk hash table it may worth to actually
// increase loadFactor up to 0.6-0.7 -- because less IO due to more compact representation may easily
// outweigh more probing.
//MAYBE RC: load factor and probing algorithm choice: in-memory open-addressing hash tables usually employ
// loadFactor=0.5 -- to prevent excessive clustering and probe sequence length increasing. For multi-valued
// maps this may be even down to 0.4 due to more excessive clustering (see comments in Int2IntMultimap).
// But for on-disk hash table it may worth to actually increase loadFactor up to 0.6-0.7 -- because less
// IO due to more compact representation may easily outweigh more probing.
// Tuning probing sequence could be also a thing: currently we use linear probing, since it is the
// fastest, and also has spatial locality -- consequent probes are one-after-another, which is beneficial
// given probing is done over IO-backed storage. But linear probing is also more susceptible to clustering,
@@ -919,7 +930,7 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
// (key: NO_VALUE, value != NO_VALUE) = 'tombstone', i.e. deleted slot (key-value pair was inserted and removed)
public HashMapAlgo(float loadFactor) {
private HashMapAlgo(float loadFactor) {
this.loadFactor = loadFactor;
}
@@ -1017,9 +1028,9 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
// not very effective
if (aliveValues(table) == 0) {
//If there is 0 alive records => it is OK to clear all the tombstones.
// We can't clear all tombstones while there alive entries because such cleaning breaks lookup: we treat
// We can't clear all tombstones while alive entries exist because such a cleaning breaks lookup: we treat
// free slots and tombstones differently during the probing -- continue to probe over tombstones, but stop
// on free slots. Converting tombstone to free slot could stop probing earlier than it should stop, thus
// on free slots. Converting tombstone to free slot could stop the probing earlier than it should stop, thus
// making some existing entries unreachable.
// But if there are no alive entries anymore -- we finally _can_ clear everything without breaking anything!
@@ -1036,7 +1047,7 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
//Table must be resized well before such a condition occurs!
throw new AssertionError(
"Table is full: all " + capacity + " items were traversed, but no free slot found" +
"Table is full: all " + capacity + " items were traversed, but no free slot found " +
"table.aliveEntries: " + table.aliveEntriesCount() + ", table: " + table
);
}
@@ -1180,4 +1191,27 @@ public class ExtendibleHashMap implements DurableIntToMultiIntMap, Unmappable {
}
}
}
private record BufferSourceOverMMappedFileStorage(@NotNull MMappedFileStorage storage) implements BufferSource {
@Override
public @NotNull ByteBuffer slice(long offsetInFile,
int length) throws IOException {
ByteBuffer buffer = storage.pageByOffset(offsetInFile).rawPageBuffer();
int offsetInPage = storage.toOffsetInPage(offsetInFile);
return buffer.slice(offsetInPage, length)
.order(buffer.order());
}
@Override
public int getInt(long offsetInFile) throws IOException {
ByteBuffer buffer = storage.pageByOffset(offsetInFile).rawPageBuffer();
int offsetInPage = storage.toOffsetInPage(offsetInFile);
return buffer.getInt(offsetInPage);
}
@Override
public String toString() {
return "BufferSourceOverMMappedFileStorage{" + storage + '}';
}
}
}